mirror of https://github.com/facebook/rocksdb.git
Print stack traces more reliably with concurrency (#12086)
Summary: It's been relatively easy to break our stack trace printer: * If another thread reaches a signal condition such as a related SEGV or assertion failure while one is trying to print a stack trace from the signal handler, it seems to end the process abruptly without a stack trace. * If the process exits normally in one thread (such as main finishing) while another is trying to print a stack trace from the signal handler, it seems the process will often end normally without a stack trace. This change attempts to fix these issues, with * Keep the custom signal handler installed as long as possible, so that other threads will most likely re-enter our custom handler. (We only switch back to default for triggering core dump or whatever after stack trace.) * Use atomics and sleeps to implement a crude recursive mutex for ensuring all threads hitting the custom signal handler wait on the first that is trying to print a stack trace, while recursive signals in the same thread can still be handled cleanly. * Use an atexit handler to hook into normal exit to (a) wait on a pending printing of stack trace when detectable and applicable, and (b) detect and warn when printing a stack trace might be interrupted by a process exit in progress. (I don't know how to pause that *after* our atexit handler has been called; the best I know how to do is warn, "In a race with process already exiting...".) Pull Request resolved: https://github.com/facebook/rocksdb/pull/12086 Test Plan: manual, including with TSAN. I added this code to the end of a unit test file: ``` for (size_t i = 0; i < 3; ++i) { std::thread t([]() { assert(false); }); t.detach(); } ``` Followed by either `sleep(100)` or `usleep(100)` or usual process exit. And for recursive signal testing, inject `abort()` at various places in the handler. Reviewed By: cbi42 Differential Revision: D51531882 Pulled By: pdillinger fbshipit-source-id: 3473b863a43e61b722dfb7a2ed12a8120949b09c
This commit is contained in:
parent
a140b519b1
commit
f6fd4b9dbd
|
@ -25,6 +25,7 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
|
|||
|
||||
#include <cxxabi.h>
|
||||
#include <execinfo.h>
|
||||
#include <pthread.h>
|
||||
#include <signal.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -48,6 +49,9 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
|
|||
#endif // GLIBC version
|
||||
#endif // OS_LINUX
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
|
||||
#include "port/lang.h"
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
|
@ -311,28 +315,85 @@ void* SaveStack(int* num_frames, int first_frames_to_skip) {
|
|||
return callstack;
|
||||
}
|
||||
|
||||
static std::atomic<uint64_t> g_thread_handling_stack_trace{0};
|
||||
static int g_recursion_count = 0;
|
||||
static std::atomic<bool> g_at_exit_called{false};
|
||||
|
||||
static void StackTraceHandler(int sig) {
|
||||
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
|
||||
// Crude recursive mutex with no signal-unsafe system calls, to avoid
|
||||
// re-entrance from multiple threads and avoid core dumping while trying
|
||||
// to print the stack trace.
|
||||
uint64_t tid = 0;
|
||||
{
|
||||
const auto ptid = pthread_self();
|
||||
// pthread_t is an opaque type
|
||||
memcpy(&tid, &ptid, std::min(sizeof(tid), sizeof(ptid)));
|
||||
// Essentially ensure non-zero
|
||||
++tid;
|
||||
}
|
||||
for (;;) {
|
||||
uint64_t expected = 0;
|
||||
if (g_thread_handling_stack_trace.compare_exchange_strong(expected, tid)) {
|
||||
// Acquired mutex
|
||||
g_recursion_count = 0;
|
||||
break;
|
||||
}
|
||||
if (expected == tid) {
|
||||
++g_recursion_count;
|
||||
fprintf(stderr, "Recursive call to stack trace handler (%d)\n",
|
||||
g_recursion_count);
|
||||
break;
|
||||
}
|
||||
// Sleep before trying again
|
||||
usleep(1000);
|
||||
}
|
||||
|
||||
if (g_recursion_count > 2) {
|
||||
// Give up after too many recursions
|
||||
fprintf(stderr, "Too many recursive calls to stack trace handler (%d)\n",
|
||||
g_recursion_count);
|
||||
} else {
|
||||
if (g_at_exit_called.load(std::memory_order_acquire)) {
|
||||
fprintf(stderr, "In a race with process already exiting...\n");
|
||||
}
|
||||
|
||||
// skip the top three signal handler related frames
|
||||
PrintStack(3);
|
||||
|
||||
// Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of
|
||||
// a signal" have failed, so just warn the user about them.
|
||||
#ifdef __SANITIZE_THREAD__
|
||||
fprintf(stderr,
|
||||
"==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
|
||||
"==> ignorable, as they are expected when generating a stack\n"
|
||||
"==> trace because of a signal under TSAN. Consider why the\n"
|
||||
"==> signal was generated to begin with, and the stack trace\n"
|
||||
"==> in the TSAN warning can be useful for that. (The stack\n"
|
||||
"==> trace printed by the signal handler is likely obscured\n"
|
||||
"==> by TSAN output.)\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
// reset to default handler
|
||||
signal(sig, SIG_DFL);
|
||||
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
|
||||
// skip the top three signal handler related frames
|
||||
PrintStack(3);
|
||||
|
||||
// Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of
|
||||
// a signal" have failed, so just warn the user about them.
|
||||
#ifdef __SANITIZE_THREAD__
|
||||
fprintf(stderr,
|
||||
"==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
|
||||
"==> ignorable, as they are expected when generating a stack\n"
|
||||
"==> trace because of a signal under TSAN. Consider why the\n"
|
||||
"==> signal was generated to begin with, and the stack trace\n"
|
||||
"==> in the TSAN warning can be useful for that. (The stack\n"
|
||||
"==> trace printed by the signal handler is likely obscured\n"
|
||||
"==> by TSAN output.)\n");
|
||||
#endif
|
||||
|
||||
// re-signal to default handler (so we still get core dump if needed...)
|
||||
raise(sig);
|
||||
|
||||
// release the mutex, in case this is somehow recoverable
|
||||
if (g_recursion_count > 0) {
|
||||
--g_recursion_count;
|
||||
} else {
|
||||
g_thread_handling_stack_trace.store(0, std::memory_order_release);
|
||||
}
|
||||
}
|
||||
|
||||
static void AtExit() {
|
||||
// wait for stack trace handler to finish, if needed
|
||||
while (g_thread_handling_stack_trace.load(std::memory_order_acquire)) {
|
||||
usleep(1000);
|
||||
}
|
||||
g_at_exit_called.store(true, std::memory_order_release);
|
||||
}
|
||||
|
||||
void InstallStackTraceHandler() {
|
||||
|
@ -342,6 +403,7 @@ void InstallStackTraceHandler() {
|
|||
signal(SIGSEGV, StackTraceHandler);
|
||||
signal(SIGBUS, StackTraceHandler);
|
||||
signal(SIGABRT, StackTraceHandler);
|
||||
atexit(AtExit);
|
||||
// Allow ouside debugger to attach, even with Yama security restrictions.
|
||||
// This is needed even outside of PrintStack() so that external mechanisms
|
||||
// can dump stacks if they suspect that a test has hung.
|
||||
|
|
Loading…
Reference in New Issue