From 74549165ddf71d56b1ebac4cd56b0d5a4811d732 Mon Sep 17 00:00:00 2001 From: Kenneth Giusti Date: Tue, 10 Oct 2023 14:45:34 -0400 Subject: [PATCH] Fixes #1254: add function name to panic call stack output --- router/src/panic.c | 45 +++++++-- src/adaptors/http1/http1_client.c | 10 -- .../modules/test_hooks/core_test_hooks.c | 65 ++++++++++++- tests/system_tests_panic_handler.py | 95 ++++++++++++------- 4 files changed, 164 insertions(+), 51 deletions(-) diff --git a/router/src/panic.c b/router/src/panic.c index 6612c92aa..8c8617373 100644 --- a/router/src/panic.c +++ b/router/src/panic.c @@ -166,24 +166,39 @@ static void print(const char *str) fsync(STDERR_FILENO); } -// print a register as a hex string -// async signal safe +// Convert an unsigned value into a hex string and print it. Async signal safe. If trim then discard leading zeros. // -static void print_reg(uintptr_t reg) +static void print_hex_trim(uintptr_t value, bool trim) { - int i = sizeof(reg) * 2; + int i = sizeof(value) * 2; unsigned char *ptr = &buffer[BUFFER_SIZE - 1]; *ptr = 0; while (i--) { - uint8_t nybble = reg & 0x0F; + uint8_t nybble = value & 0x0F; *--ptr = nybble > 9 ? (nybble - 10) + 'a' : nybble + '0'; - reg >>= 4; + value >>= 4; + if (value == 0 && trim) + break; } print((char *) ptr); } +// Print a register in hex. No leading '0x' added. +// +static void print_reg(uintptr_t reg) +{ + print_hex_trim(reg, false); +} + +// Print an address offset, stripping leading zeros +// +static void print_offset(uintptr_t offset) +{ + print_hex_trim(offset, true); +} + // print a base-ten unsigned integer // async signal safe // @@ -320,11 +335,25 @@ static void print_stack_frame(int index, unw_cursor_t *cursor) print("] IP: 0x"); print_reg(ip); + // Cannot reliably invoke unw_get_proc_name() from a signal handler due to a bug: + // https://github.com/libunwind/libunwind/issues/123 + // v1.8.0 hasn't been released yet but I'm assuming it will contain the fix + // + if (UNW_VERSION_MAJOR > 1 || (UNW_VERSION_MAJOR == 1 && UNW_VERSION_MINOR >= 8)) { + unw_word_t off = 0; + if (unw_get_proc_name(cursor, (char *) buffer, BUFFER_SIZE, &off) == 0) { + print(":"); + print((char *) buffer); + print("+0x"); + print_offset((uintptr_t) off); + } + } + if (get_offset((uintptr_t) ip, &offset, &path)) { print(" ("); print(path); - print(" + 0x"); - print_reg(offset); + print("+0x"); + print_offset(offset); print(")"); } print("\n"); diff --git a/src/adaptors/http1/http1_client.c b/src/adaptors/http1/http1_client.c index 6ac6b0bbf..830766e2b 100644 --- a/src/adaptors/http1/http1_client.c +++ b/src/adaptors/http1/http1_client.c @@ -269,16 +269,6 @@ static void _handle_listener_accept(qd_adaptor_listener_t *adaptor_listener, pn_ // qd_http_listener_t *qd_http1_configure_listener(qd_http_listener_t *li, qd_dispatch_t *qd, qd_entity_t *entity) { - if (qd_router_test_hooks_enabled()) { - // Check for test command to trigger core dump functionality. This is only used in CI. - // See ISSUE-1083 - if (strcmp(li->config->adaptor_config->host, "$FORCE$CRASH$SEGV$") == 0) { - volatile int *badptr = (int *) 0xBEEFFACE; - - *badptr = 42; // this is expected to cause the router to crash! - } - } - if (li->config->adaptor_config->ssl_profile_name) { li->tls_domain = qd_tls_domain(li->config->adaptor_config, qd, LOG_HTTP_ADAPTOR, http1_alpn_protocols, HTTP1_NUM_ALPN_PROTOCOLS, true); diff --git a/src/router_core/modules/test_hooks/core_test_hooks.c b/src/router_core/modules/test_hooks/core_test_hooks.c index 45b3b8495..e82188c90 100644 --- a/src/router_core/modules/test_hooks/core_test_hooks.c +++ b/src/router_core/modules/test_hooks/core_test_hooks.c @@ -29,6 +29,9 @@ #include #include +#include +#include +#include typedef enum { TEST_NODE_ECHO, @@ -37,7 +40,8 @@ typedef enum { TEST_NODE_SOURCE, TEST_NODE_SOURCE_PS, TEST_NODE_DISCARD, - TEST_NODE_LOG, // trigger a log message + TEST_NODE_LOG, // trigger a log message + TEST_NODE_CRASH, // force the router to crash } test_node_behavior_t; typedef struct test_module_t test_module_t; @@ -76,12 +80,14 @@ struct test_module_t { test_node_t *source_ps_node; test_node_t *discard_node; test_node_t *log_node; + test_node_t *crash_node; test_client_t *test_client; }; static void endpoint_action(qdr_core_t *core, qdr_action_t *action, bool discard); static void _handle_log_request(qdr_core_t *core, qd_message_t *message); +static void _handle_crash_request(qdr_core_t *core, qd_message_t *message); static void source_send(test_endpoint_t *ep, bool presettled) { @@ -165,6 +171,7 @@ static void endpoint_action(qdr_core_t *core, qdr_action_t *action, bool discard case TEST_NODE_ECHO : case TEST_NODE_LOG : + case TEST_NODE_CRASH : break; } } @@ -219,6 +226,14 @@ static void on_first_attach(void *bind_context, error = qdr_error("qd:forbidden", "Test log function only accepts incoming links"); } break; + + case TEST_NODE_CRASH: + if (incoming) { + qdrc_endpoint_flow_CT(node->core, endpoint, 1, false); + } else { + error = qdr_error("qd:forbidden", "Test crash function only accepts incoming links"); + } + break; } if (!!error) { @@ -302,6 +317,7 @@ static void on_flow(void *link_context, case TEST_NODE_ECHO : case TEST_NODE_LOG : + case TEST_NODE_CRASH : break; } } @@ -348,6 +364,13 @@ static void on_transfer(void *link_context, _handle_log_request(ep->node->core, message); qdrc_endpoint_settle_CT(ep->node->core, delivery, PN_ACCEPTED); qdrc_endpoint_flow_CT(ep->node->core, ep->ep, 1, false); + break; + + case TEST_NODE_CRASH: + _handle_crash_request(ep->node->core, message); // not expected to return + qdrc_endpoint_settle_CT(ep->node->core, delivery, PN_ACCEPTED); + qdrc_endpoint_flow_CT(ep->node->core, ep->ep, 1, false); + break; } } @@ -415,6 +438,7 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test char *source_ps_address = "io.skupper.router.router/test/source_ps"; char *discard_address = "io.skupper.router.router/test/discard"; char *log_address = "io.skupper.router.router/test/log"; + char *crash_address = "io.skupper.router.router/test/crash"; module->echo_node = NEW(test_node_t); module->deny_node = NEW(test_node_t); @@ -423,6 +447,7 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test module->source_ps_node = NEW(test_node_t); module->discard_node = NEW(test_node_t); module->log_node = NEW(test_node_t); + module->crash_node = NEW(test_node_t); module->echo_node->core = core; module->echo_node->module = module; @@ -480,6 +505,14 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test DEQ_INIT(module->log_node->out_links); qdrc_endpoint_bind_mobile_address_CT(core, log_address, &descriptor, module->log_node); + module->crash_node->core = core; + module->crash_node->module = module; + module->crash_node->behavior = TEST_NODE_CRASH; + module->crash_node->desc = &descriptor; + DEQ_INIT(module->crash_node->in_links); + DEQ_INIT(module->crash_node->out_links); + qdrc_endpoint_bind_mobile_address_CT(core, crash_address, &descriptor, module->crash_node); + return module; } @@ -493,6 +526,7 @@ static void qdrc_test_hooks_core_endpoint_finalize(test_module_t *module) free(module->source_ps_node); free(module->discard_node); free(module->log_node); + free(module->crash_node); } @@ -711,6 +745,35 @@ static void _handle_log_request(qdr_core_t *core, qd_message_t *message) qd_log(LOG_ROUTER, level, "This is a test log message."); } +// TEST_NODE_CRASH - Force the router to crash. This is for testing the panic handler (see panic.c) +// +static void _handle_crash_request(qdr_core_t *core, qd_message_t *message) +{ + // Test hooks are only enabled in CI. If anyone is crazy enough to enable them in production they get the keep the + // pieces. Dump a log just in case someone reports this as a bug: + qd_log(LOG_ROUTER, QD_LOG_CRITICAL, "Forced Router CRASH via Test Hooks"); + sleep(2); + qd_iterator_t *s_itr = qd_message_field_iterator(message, QD_FIELD_SUBJECT); + if (s_itr) { + if (qd_iterator_equal(s_itr, (unsigned char *) "ABORT")) { + abort(); + } else if (qd_iterator_equal(s_itr, (unsigned char *) "SEGV")) { + volatile int *badptr = (int *) 0xBEEFFACE; + *badptr = 42; // this is expected to cause the router to crash! + } else if (qd_iterator_equal(s_itr, (unsigned char *) "HEAP")) { + // keep overwriting the heap until we crash + while (true) { + volatile unsigned long *ptr = (unsigned long *) malloc(10); + for (int k = 0; k < 2 * 1024 * 1024; k++) + *ptr++ = ~0UL; + sched_yield(); // let other threads try to malloc... + } + } + qd_iterator_free(s_itr); + } + +} + static bool qdrc_test_hooks_enable_CT(qdr_core_t *core) { return qd_router_test_hooks_enabled(); diff --git a/tests/system_tests_panic_handler.py b/tests/system_tests_panic_handler.py index 659375867..8a98af056 100644 --- a/tests/system_tests_panic_handler.py +++ b/tests/system_tests_panic_handler.py @@ -23,61 +23,47 @@ import re from system_test import TestCase, unittest, main_module, Process -from system_test import retry, Qdrouterd, QdManager, HTTP_LISTENER_TYPE +from system_test import retry, Qdrouterd, AsyncTestSender + +from proton import Message @unittest.skipIf(os.environ.get("QPID_RUNTIME_CHECK", None) != "OFF", "Skipping panic test: RUNTIME_CHECK enabled") class PanicHandlerTest(TestCase): """ - Force the router to crash and verify the panic handler ran + Force the router to crash and verify the panic handler produces a dump to + stdout. """ - @classmethod def setUpClass(cls): super(PanicHandlerTest, cls).setUpClass() + def _start_router(self, name): + # create and start a router config = [ - ('router', {'mode': 'interior', 'id': "PanicRouter"}), + ('router', {'mode': 'standalone', 'id': name}), ('listener', {'role': 'normal', - 'port': cls.tester.get_port()}), + 'port': self.tester.get_port()}), ('address', {'prefix': 'closest', 'distribution': 'closest'}), ('address', {'prefix': 'multicast', 'distribution': 'multicast'}), ] - # turn off python debug for this test to prevent it from dumping stuff - # to output + # turn off python and heap debug for this router so we can crash it os.environ.pop("PYTHONMALLOC", None) os.environ.pop("PYTHONDEVMODE", None) - config = Qdrouterd.Config(config) - cls.router = cls.tester.qdrouterd("PanicRouter", config, wait=True, - cl_args=["-T"], expect=Process.EXIT_FAIL) + os.environ.pop("MALLOC_CHECK_", None) - def test_01_panic_handler(self): - """ - Crash the router and verify the panic handler writes the crash to - stderr - """ - self.router.wait_startup_message() - - mgmt = QdManager(address=self.router.addresses[0], timeout=1) - # this call will crash the router - try: - mgmt.create(HTTP_LISTENER_TYPE, - {'address': 'closest/panic', - 'port': self.tester.get_port(), - 'protocolVersion': 'HTTP1', - 'host': '$FORCE$CRASH$SEGV$'}) - self.assertTrue(False, 'mgmt.create() should have crashed') - except Exception as exc: - pass # expected - the router just crashed - - self.assertTrue(retry(lambda: self.router.poll() is not None)) + config = Qdrouterd.Config(config) + router = self.tester.qdrouterd(name, config, wait=True, + cl_args=["-T"], expect=Process.EXIT_FAIL) + router.wait_startup_message() + return router + def _validate_panic(self, router): # do some simple validation on the generated panic output: - crash_dump = "" - with open(self.router.outfile_path, 'r') as fd: + with open(router.outfile_path, 'r') as fd: crash_dump = fd.read() regex = re.compile(r"^\*\*\* SKUPPER-ROUTER FATAL ERROR \*\*\*", re.MULTILINE) @@ -107,6 +93,51 @@ def test_01_panic_handler(self): match = regex.search(crash_dump) self.assertIsNotNone(match, "failed to dump end line") + def test_01_abort_handling(self): + """ + Crash the router via having it call abort() and verify the panic + handler output + """ + router = self._start_router("AbortRouter") + ts = AsyncTestSender(router.addresses[0], + "io.skupper.router.router/test/crash", + message=Message(subject="ABORT"), + presettle=True) + ts.wait() + + self.assertTrue(retry(lambda: router.poll() is not None)) + self._validate_panic(router) + + def test_02_segv_handling(self): + """ + Crash the router via having it attempt to write to invalid memory and + verify the panic handler output + """ + router = self._start_router("HeapRouter") + ts = AsyncTestSender(router.addresses[0], + "io.skupper.router.router/test/crash", + message=Message(subject="SEGV"), + presettle=True) + ts.wait() + + self.assertTrue(retry(lambda: router.poll() is not None)) + self._validate_panic(router) + + def test_03_heap_handling(self): + """ + Crash the router via having it attempt to overwrite heap memory and + verify the panic handler output + """ + router = self._start_router("SegvRouter") + ts = AsyncTestSender(router.addresses[0], + "io.skupper.router.router/test/crash", + message=Message(subject="HEAP"), + presettle=True) + ts.wait() + + self.assertTrue(retry(lambda: router.poll() is not None)) + self._validate_panic(router) + if __name__ == '__main__': unittest.main(main_module())