From f6fd4b9dbd15dba36f7e5ad23de407b5c26b1460 Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 22 Nov 2023 11:55:10 -0800 Subject: [PATCH] Print stack traces more reliably with concurrency (#12086) Summary: It's been relatively easy to break our stack trace printer: * If another thread reaches a signal condition such as a related SEGV or assertion failure while one is trying to print a stack trace from the signal handler, it seems to end the process abruptly without a stack trace. * If the process exits normally in one thread (such as main finishing) while another is trying to print a stack trace from the signal handler, it seems the process will often end normally without a stack trace. This change attempts to fix these issues, with * Keep the custom signal handler installed as long as possible, so that other threads will most likely re-enter our custom handler. (We only switch back to default for triggering core dump or whatever after stack trace.) * Use atomics and sleeps to implement a crude recursive mutex for ensuring all threads hitting the custom signal handler wait on the first that is trying to print a stack trace, while recursive signals in the same thread can still be handled cleanly. * Use an atexit handler to hook into normal exit to (a) wait on a pending printing of stack trace when detectable and applicable, and (b) detect and warn when printing a stack trace might be interrupted by a process exit in progress. (I don't know how to pause that *after* our atexit handler has been called; the best I know how to do is warn, "In a race with process already exiting...".) Pull Request resolved: https://github.com/facebook/rocksdb/pull/12086 Test Plan: manual, including with TSAN. I added this code to the end of a unit test file: ``` for (size_t i = 0; i < 3; ++i) { std::thread t([]() { assert(false); }); t.detach(); } ``` Followed by either `sleep(100)` or `usleep(100)` or usual process exit. And for recursive signal testing, inject `abort()` at various places in the handler. Reviewed By: cbi42 Differential Revision: D51531882 Pulled By: pdillinger fbshipit-source-id: 3473b863a43e61b722dfb7a2ed12a8120949b09c --- port/stack_trace.cc | 96 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/port/stack_trace.cc b/port/stack_trace.cc index a5a6d2e77c..2924307527 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -25,6 +25,7 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #include #include +#include #include #include #include @@ -48,6 +49,9 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #endif // GLIBC version #endif // OS_LINUX +#include +#include + #include "port/lang.h" namespace ROCKSDB_NAMESPACE { @@ -311,28 +315,85 @@ void* SaveStack(int* num_frames, int first_frames_to_skip) { return callstack; } +static std::atomic g_thread_handling_stack_trace{0}; +static int g_recursion_count = 0; +static std::atomic g_at_exit_called{false}; + static void StackTraceHandler(int sig) { + fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); + // Crude recursive mutex with no signal-unsafe system calls, to avoid + // re-entrance from multiple threads and avoid core dumping while trying + // to print the stack trace. + uint64_t tid = 0; + { + const auto ptid = pthread_self(); + // pthread_t is an opaque type + memcpy(&tid, &ptid, std::min(sizeof(tid), sizeof(ptid))); + // Essentially ensure non-zero + ++tid; + } + for (;;) { + uint64_t expected = 0; + if (g_thread_handling_stack_trace.compare_exchange_strong(expected, tid)) { + // Acquired mutex + g_recursion_count = 0; + break; + } + if (expected == tid) { + ++g_recursion_count; + fprintf(stderr, "Recursive call to stack trace handler (%d)\n", + g_recursion_count); + break; + } + // Sleep before trying again + usleep(1000); + } + + if (g_recursion_count > 2) { + // Give up after too many recursions + fprintf(stderr, "Too many recursive calls to stack trace handler (%d)\n", + g_recursion_count); + } else { + if (g_at_exit_called.load(std::memory_order_acquire)) { + fprintf(stderr, "In a race with process already exiting...\n"); + } + + // skip the top three signal handler related frames + PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. +#ifdef __SANITIZE_THREAD__ + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); +#endif + } + // reset to default handler signal(sig, SIG_DFL); - fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); - // skip the top three signal handler related frames - PrintStack(3); - - // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of - // a signal" have failed, so just warn the user about them. -#ifdef __SANITIZE_THREAD__ - fprintf(stderr, - "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" - "==> ignorable, as they are expected when generating a stack\n" - "==> trace because of a signal under TSAN. Consider why the\n" - "==> signal was generated to begin with, and the stack trace\n" - "==> in the TSAN warning can be useful for that. (The stack\n" - "==> trace printed by the signal handler is likely obscured\n" - "==> by TSAN output.)\n"); -#endif - // re-signal to default handler (so we still get core dump if needed...) raise(sig); + + // release the mutex, in case this is somehow recoverable + if (g_recursion_count > 0) { + --g_recursion_count; + } else { + g_thread_handling_stack_trace.store(0, std::memory_order_release); + } +} + +static void AtExit() { + // wait for stack trace handler to finish, if needed + while (g_thread_handling_stack_trace.load(std::memory_order_acquire)) { + usleep(1000); + } + g_at_exit_called.store(true, std::memory_order_release); } void InstallStackTraceHandler() { @@ -342,6 +403,7 @@ void InstallStackTraceHandler() { signal(SIGSEGV, StackTraceHandler); signal(SIGBUS, StackTraceHandler); signal(SIGABRT, StackTraceHandler); + atexit(AtExit); // Allow ouside debugger to attach, even with Yama security restrictions. // This is needed even outside of PrintStack() so that external mechanisms // can dump stacks if they suspect that a test has hung.