Skip to content

Commit

Permalink
Fixes skupperproject#1254: add function name to panic call stack output
Browse files Browse the repository at this point in the history
  • Loading branch information
kgiusti committed Oct 11, 2023
1 parent 3d81fb8 commit 30d71b7
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 51 deletions.
45 changes: 37 additions & 8 deletions router/src/panic.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,24 +166,32 @@ static void print(const char *str)
fsync(STDERR_FILENO);
}

// print a register as a hex string
// async signal safe
// Convert an unsigned value into a hex string and print it. Async signal safe. If trim then discard leading zeros.
//
static void print_reg(uintptr_t reg)
static void print_hex_trim(uintptr_t value, bool trim)
{
int i = sizeof(reg) * 2;
int i = sizeof(value) * 2;
unsigned char *ptr = &buffer[BUFFER_SIZE - 1];

*ptr = 0;
while (i--) {
uint8_t nybble = reg & 0x0F;
uint8_t nybble = value & 0x0F;

*--ptr = nybble > 9 ? (nybble - 10) + 'a' : nybble + '0';
reg >>= 4;
value >>= 4;
if (value == 0 && trim)
break;
}
print((char *) ptr);
}

// Print a register in hex. No leading '0x' added.
//
static void print_reg(uintptr_t reg)
{
print_hex_trim(reg, false);
}

// print a base-ten unsigned integer
// async signal safe
//
Expand All @@ -205,6 +213,13 @@ static void print_uint(uintptr_t num)

#ifdef HAVE_LIBUNWIND

// Print an address offset, stripping leading zeros
//
static void print_offset(uintptr_t offset)
{
print_hex_trim(offset, true);
}

// given an address find it's mapping and offset
// async signal safe
//
Expand Down Expand Up @@ -320,11 +335,25 @@ static void print_stack_frame(int index, unw_cursor_t *cursor)
print("] IP: 0x");
print_reg(ip);

// Cannot reliably invoke unw_get_proc_name() from a signal handler due to a bug:
// https://github.com/libunwind/libunwind/issues/123
// v1.8.0 hasn't been released yet but I'm assuming it will contain the fix
//
if (UNW_VERSION_MAJOR > 1 || (UNW_VERSION_MAJOR == 1 && UNW_VERSION_MINOR >= 8)) {
unw_word_t off = 0;
if (unw_get_proc_name(cursor, (char *) buffer, BUFFER_SIZE, &off) == 0) {
print(":");
print((char *) buffer);
print("+0x");
print_offset((uintptr_t) off);
}
}

if (get_offset((uintptr_t) ip, &offset, &path)) {
print(" (");
print(path);
print(" + 0x");
print_reg(offset);
print("+0x");
print_offset(offset);
print(")");
}
print("\n");
Expand Down
10 changes: 0 additions & 10 deletions src/adaptors/http1/http1_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -269,16 +269,6 @@ static void _handle_listener_accept(qd_adaptor_listener_t *adaptor_listener, pn_
//
qd_http_listener_t *qd_http1_configure_listener(qd_http_listener_t *li, qd_dispatch_t *qd, qd_entity_t *entity)
{
if (qd_router_test_hooks_enabled()) {
// Check for test command to trigger core dump functionality. This is only used in CI.
// See ISSUE-1083
if (strcmp(li->config->adaptor_config->host, "$FORCE$CRASH$SEGV$") == 0) {
volatile int *badptr = (int *) 0xBEEFFACE;

*badptr = 42; // this is expected to cause the router to crash!
}
}

if (li->config->adaptor_config->ssl_profile_name) {
li->tls_domain = qd_tls_domain(li->config->adaptor_config, qd, LOG_HTTP_ADAPTOR, http1_alpn_protocols,
HTTP1_NUM_ALPN_PROTOCOLS, true);
Expand Down
65 changes: 64 additions & 1 deletion src/router_core/modules/test_hooks/core_test_hooks.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>

typedef enum {
TEST_NODE_ECHO,
Expand All @@ -37,7 +40,8 @@ typedef enum {
TEST_NODE_SOURCE,
TEST_NODE_SOURCE_PS,
TEST_NODE_DISCARD,
TEST_NODE_LOG, // trigger a log message
TEST_NODE_LOG, // trigger a log message
TEST_NODE_CRASH, // force the router to crash
} test_node_behavior_t;

typedef struct test_module_t test_module_t;
Expand Down Expand Up @@ -76,12 +80,14 @@ struct test_module_t {
test_node_t *source_ps_node;
test_node_t *discard_node;
test_node_t *log_node;
test_node_t *crash_node;
test_client_t *test_client;
};


static void endpoint_action(qdr_core_t *core, qdr_action_t *action, bool discard);
static void _handle_log_request(qdr_core_t *core, qd_message_t *message);
static void _handle_crash_request(qdr_core_t *core, qd_message_t *message);

static void source_send(test_endpoint_t *ep, bool presettled)
{
Expand Down Expand Up @@ -165,6 +171,7 @@ static void endpoint_action(qdr_core_t *core, qdr_action_t *action, bool discard

case TEST_NODE_ECHO :
case TEST_NODE_LOG :
case TEST_NODE_CRASH :
break;
}
}
Expand Down Expand Up @@ -219,6 +226,14 @@ static void on_first_attach(void *bind_context,
error = qdr_error("qd:forbidden", "Test log function only accepts incoming links");
}
break;

case TEST_NODE_CRASH:
if (incoming) {
qdrc_endpoint_flow_CT(node->core, endpoint, 1, false);
} else {
error = qdr_error("qd:forbidden", "Test crash function only accepts incoming links");
}
break;
}

if (!!error) {
Expand Down Expand Up @@ -302,6 +317,7 @@ static void on_flow(void *link_context,

case TEST_NODE_ECHO :
case TEST_NODE_LOG :
case TEST_NODE_CRASH :
break;
}
}
Expand Down Expand Up @@ -348,6 +364,13 @@ static void on_transfer(void *link_context,
_handle_log_request(ep->node->core, message);
qdrc_endpoint_settle_CT(ep->node->core, delivery, PN_ACCEPTED);
qdrc_endpoint_flow_CT(ep->node->core, ep->ep, 1, false);
break;

case TEST_NODE_CRASH:
_handle_crash_request(ep->node->core, message); // not expected to return
qdrc_endpoint_settle_CT(ep->node->core, delivery, PN_ACCEPTED);
qdrc_endpoint_flow_CT(ep->node->core, ep->ep, 1, false);
break;
}
}

Expand Down Expand Up @@ -415,6 +438,7 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test
char *source_ps_address = "io.skupper.router.router/test/source_ps";
char *discard_address = "io.skupper.router.router/test/discard";
char *log_address = "io.skupper.router.router/test/log";
char *crash_address = "io.skupper.router.router/test/crash";

module->echo_node = NEW(test_node_t);
module->deny_node = NEW(test_node_t);
Expand All @@ -423,6 +447,7 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test
module->source_ps_node = NEW(test_node_t);
module->discard_node = NEW(test_node_t);
module->log_node = NEW(test_node_t);
module->crash_node = NEW(test_node_t);

module->echo_node->core = core;
module->echo_node->module = module;
Expand Down Expand Up @@ -480,6 +505,14 @@ static test_module_t *qdrc_test_hooks_core_endpoint_setup(qdr_core_t *core, test
DEQ_INIT(module->log_node->out_links);
qdrc_endpoint_bind_mobile_address_CT(core, log_address, &descriptor, module->log_node);

module->crash_node->core = core;
module->crash_node->module = module;
module->crash_node->behavior = TEST_NODE_CRASH;
module->crash_node->desc = &descriptor;
DEQ_INIT(module->crash_node->in_links);
DEQ_INIT(module->crash_node->out_links);
qdrc_endpoint_bind_mobile_address_CT(core, crash_address, &descriptor, module->crash_node);

return module;
}

Expand All @@ -493,6 +526,7 @@ static void qdrc_test_hooks_core_endpoint_finalize(test_module_t *module)
free(module->source_ps_node);
free(module->discard_node);
free(module->log_node);
free(module->crash_node);
}


Expand Down Expand Up @@ -711,6 +745,35 @@ static void _handle_log_request(qdr_core_t *core, qd_message_t *message)
qd_log(LOG_ROUTER, level, "This is a test log message.");
}

// TEST_NODE_CRASH - Force the router to crash. This is for testing the panic handler (see panic.c)
//
static void _handle_crash_request(qdr_core_t *core, qd_message_t *message)
{
// Test hooks are only enabled in CI. If anyone is crazy enough to enable them in production they get the keep the
// pieces. Dump a log just in case someone reports this as a bug:
qd_log(LOG_ROUTER, QD_LOG_CRITICAL, "Forced Router CRASH via Test Hooks");
sleep(2);
qd_iterator_t *s_itr = qd_message_field_iterator(message, QD_FIELD_SUBJECT);
if (s_itr) {
if (qd_iterator_equal(s_itr, (unsigned char *) "ABORT")) {
abort();
} else if (qd_iterator_equal(s_itr, (unsigned char *) "SEGV")) {
volatile int *badptr = (int *) 0xBEEFFACE;
*badptr = 42; // this is expected to cause the router to crash!
} else if (qd_iterator_equal(s_itr, (unsigned char *) "HEAP")) {
// keep overwriting the heap until we crash
while (true) {
volatile unsigned long *ptr = (unsigned long *) malloc(10);
for (int k = 0; k < 2 * 1024 * 1024; k++)
*ptr++ = ~0UL;
sched_yield(); // let other threads try to malloc...
}
}
qd_iterator_free(s_itr);
}

}

static bool qdrc_test_hooks_enable_CT(qdr_core_t *core)
{
return qd_router_test_hooks_enabled();
Expand Down
95 changes: 63 additions & 32 deletions tests/system_tests_panic_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,61 +23,47 @@
import re

from system_test import TestCase, unittest, main_module, Process
from system_test import retry, Qdrouterd, QdManager, HTTP_LISTENER_TYPE
from system_test import retry, Qdrouterd, AsyncTestSender

from proton import Message


@unittest.skipIf(os.environ.get("QPID_RUNTIME_CHECK", None) != "OFF",
"Skipping panic test: RUNTIME_CHECK enabled")
class PanicHandlerTest(TestCase):
"""
Force the router to crash and verify the panic handler ran
Force the router to crash and verify the panic handler produces a dump to
stdout.
"""

@classmethod
def setUpClass(cls):
super(PanicHandlerTest, cls).setUpClass()

def _start_router(self, name):
# create and start a router
config = [
('router', {'mode': 'interior', 'id': "PanicRouter"}),
('router', {'mode': 'standalone', 'id': name}),
('listener', {'role': 'normal',
'port': cls.tester.get_port()}),
'port': self.tester.get_port()}),
('address', {'prefix': 'closest', 'distribution': 'closest'}),
('address', {'prefix': 'multicast', 'distribution': 'multicast'}),
]

# turn off python debug for this test to prevent it from dumping stuff
# to output
# turn off python and heap debug for this router so we can crash it
os.environ.pop("PYTHONMALLOC", None)
os.environ.pop("PYTHONDEVMODE", None)
config = Qdrouterd.Config(config)
cls.router = cls.tester.qdrouterd("PanicRouter", config, wait=True,
cl_args=["-T"], expect=Process.EXIT_FAIL)
os.environ.pop("MALLOC_CHECK_", None)

def test_01_panic_handler(self):
"""
Crash the router and verify the panic handler writes the crash to
stderr
"""
self.router.wait_startup_message()

mgmt = QdManager(address=self.router.addresses[0], timeout=1)
# this call will crash the router
try:
mgmt.create(HTTP_LISTENER_TYPE,
{'address': 'closest/panic',
'port': self.tester.get_port(),
'protocolVersion': 'HTTP1',
'host': '$FORCE$CRASH$SEGV$'})
self.assertTrue(False, 'mgmt.create() should have crashed')
except Exception as exc:
pass # expected - the router just crashed

self.assertTrue(retry(lambda: self.router.poll() is not None))
config = Qdrouterd.Config(config)
router = self.tester.qdrouterd(name, config, wait=True,
cl_args=["-T"], expect=Process.EXIT_FAIL)
router.wait_startup_message()
return router

def _validate_panic(self, router):
# do some simple validation on the generated panic output:

crash_dump = ""
with open(self.router.outfile_path, 'r') as fd:
with open(router.outfile_path, 'r') as fd:
crash_dump = fd.read()

regex = re.compile(r"^\*\*\* SKUPPER-ROUTER FATAL ERROR \*\*\*", re.MULTILINE)
Expand Down Expand Up @@ -107,6 +93,51 @@ def test_01_panic_handler(self):
match = regex.search(crash_dump)
self.assertIsNotNone(match, "failed to dump end line")

def test_01_abort_handling(self):
"""
Crash the router via having it call abort() and verify the panic
handler output
"""
router = self._start_router("AbortRouter")
ts = AsyncTestSender(router.addresses[0],
"io.skupper.router.router/test/crash",
message=Message(subject="ABORT"),
presettle=True)
ts.wait()

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)

def test_02_segv_handling(self):
"""
Crash the router via having it attempt to write to invalid memory and
verify the panic handler output
"""
router = self._start_router("HeapRouter")
ts = AsyncTestSender(router.addresses[0],
"io.skupper.router.router/test/crash",
message=Message(subject="SEGV"),
presettle=True)
ts.wait()

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)

def test_03_heap_handling(self):
"""
Crash the router via having it attempt to overwrite heap memory and
verify the panic handler output
"""
router = self._start_router("SegvRouter")
ts = AsyncTestSender(router.addresses[0],
"io.skupper.router.router/test/crash",
message=Message(subject="HEAP"),
presettle=True)
ts.wait()

self.assertTrue(retry(lambda: router.poll() is not None))
self._validate_panic(router)


if __name__ == '__main__':
unittest.main(main_module())

0 comments on commit 30d71b7

Please sign in to comment.