Skip to content

Commit

Permalink
Fixes skupperproject#1335 corrupted panic handler output (skupperproj…
Browse files Browse the repository at this point in the history
  • Loading branch information
kgiusti authored Jan 10, 2024
1 parent a05d50e commit 0222e78
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 11 deletions.
43 changes: 38 additions & 5 deletions router/src/panic.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "config.h"

#include "qpid/dispatch/atomic.h"
#include "qpid/dispatch/threading.h"

#include <assert.h>
Expand Down Expand Up @@ -82,6 +83,11 @@ typedef struct {
static mem_map_t mem_map[MAX_MAP_ENTRIES];
static int mem_map_count = 0;

// if multiple threads panic only allow one thread to unwind the stack otherwise we get multiple threads writing to
// stdout simultaineously which corrupts output. See github ISSUE-1335
//
static sys_atomic_t unwind_in_progress;

// for sorting the map
//
static int map_compare(const void *arg1, const void *arg2)
Expand Down Expand Up @@ -134,12 +140,19 @@ static void lib_map_init(void)
void panic_handler_init(void)
{
if (getenv("SKUPPER_ROUTER_DISABLE_PANIC_HANDLER") == 0) {
sys_atomic_init(&unwind_in_progress, 0);
lib_map_init();

// Note well: do not set SA_RESETHAND or SA_NODEFER. Since this process is multithreaded it is possible for more
// than one thread to panic simultaineously (e.g. heap corruption affects all threads). Setting SA_RESETHAND etc
// will restore the default handler _on_entry_ to panic_signal_handler(). If that is done and another thread
// panics then the default handler will run which will kill the process BEFORE the panic handler completes
// unwinding the stack.
//
struct sigaction sa = {
.sa_flags = SA_SIGINFO | SA_RESETHAND,
.sa_flags = SA_SIGINFO,
.sa_sigaction = panic_signal_handler,
};

sigemptyset(&sa.sa_mask);

for (int i = 0; panic_signals[i].signal != 0; ++i) {
Expand Down Expand Up @@ -398,6 +411,18 @@ static void panic_signal_handler(int signum, siginfo_t *siginfo, void *ucontext)
(void) siginfo;
(void) ucontext;

// github ISSUE-1335: Since this process is multi-threaded it is possible that more than one thread may crash
// simultaineously. Heap corruption for example will affect all threads. Only allow one thread to unwind the stack
// otherwise the output will be corrupted.
if (sys_atomic_inc(&unwind_in_progress) != 0) {
// There is already another thread running this panic handler! Do not allow this thread to return since doing so
// will cause the thread to immediately return to this handler since the faulting code will be re-run by the
// kernel. Simply hang the thread until the other thread finishes unwinding and returns to the kernel which will
// then kill the entire process (all threads) and create a core dump.
while (true)
sleep(10); // signal safe
}

print("\n*** SKUPPER-ROUTER FATAL ERROR ***\n"); // or "guru meditation error" (google it)
print("Version: ");
print(QPID_DISPATCH_VERSION);
Expand Down Expand Up @@ -452,10 +477,18 @@ static void panic_signal_handler(int signum, siginfo_t *siginfo, void *ucontext)
int err = unw_getcontext(&context);
if (err) {
print_libunwind_error(err);
return;
} else {
print_backtrace(&context);
}

print_backtrace(&context);
#endif
print("*** END ***\n");

// Restore the default handler. This will cause the kernel to kill the process and produce a core dump when this
// handler returns.

struct sigaction restore_default = {
.sa_handler = SIG_DFL,
};
sigemptyset(&restore_default.sa_mask);
sigaction(signum, &restore_default, 0);
}
22 changes: 16 additions & 6 deletions tests/system_tests_panic_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def setUpClass(cls):
def _start_router(self, name):
# create and start a router
config = [
('router', {'mode': 'standalone', 'id': name}),
('router', {'mode': 'standalone', 'id': name,
'workerThreads': 8}),
('listener', {'role': 'normal',
'port': self.tester.get_port()}),
('address', {'prefix': 'closest', 'distribution': 'closest'}),
Expand Down Expand Up @@ -103,7 +104,10 @@ def test_01_abort_handling(self):
"io.skupper.router.router/test/crash",
message=Message(subject="ABORT"),
presettle=True)
ts.wait()
try:
ts.wait()
except AsyncTestSender.TestSenderException:
pass

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)
Expand All @@ -113,12 +117,15 @@ def test_02_segv_handling(self):
Crash the router via having it attempt to write to invalid memory and
verify the panic handler output
"""
router = self._start_router("HeapRouter")
router = self._start_router("SegvRouter")
ts = AsyncTestSender(router.addresses[0],
"io.skupper.router.router/test/crash",
message=Message(subject="SEGV"),
presettle=True)
ts.wait()
try:
ts.wait()
except AsyncTestSender.TestSenderException:
pass

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)
Expand All @@ -128,12 +135,15 @@ def test_03_heap_handling(self):
Crash the router via having it attempt to overwrite heap memory and
verify the panic handler output
"""
router = self._start_router("SegvRouter")
router = self._start_router("HeapRouter")
ts = AsyncTestSender(router.addresses[0],
"io.skupper.router.router/test/crash",
message=Message(subject="HEAP"),
presettle=True)
ts.wait()
try:
ts.wait()
except AsyncTestSender.TestSenderException:
pass

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)
Expand Down

0 comments on commit 0222e78

Please sign in to comment.