From f50f1bc622ee385e819725723567d3ad4f1cccab Mon Sep 17 00:00:00 2001 From: Jerry Johns Date: Wed, 3 Aug 2022 23:45:03 -0700 Subject: [PATCH] Add full threading backtrace dumps in Cirque and REPL (#21615) * Add full threading backtrace dumps in Cirque and REPL tests on test timeout failure. * Add a in-python thread backtrace dump * Put back timeout --- scripts/tests/run_python_test.py | 12 ++++++--- .../python/chip/native/CommonStackInit.cpp | 11 ++++++++ .../python/test/test_scripts/base.py | 25 ++++++++++++++++--- .../linux-cirque/MobileDeviceTest.py | 4 +-- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/scripts/tests/run_python_test.py b/scripts/tests/run_python_test.py index 7ebd4b8729d990..421e5f46683eb1 100755 --- a/scripts/tests/run_python_test.py +++ b/scripts/tests/run_python_test.py @@ -101,13 +101,19 @@ def main(app: str, factoryreset: bool, app_args: str, script: str, script_args: '--log-format', '%(message)s'] + shlex.split(script_args) if script_gdb: - script_command = "gdb -batch -return-child-result -q -ex run -ex bt --args python3".split() + script_command + # + # When running through Popen, we need to preserve some space-delimited args to GDB as a single logical argument. To do that, let's use '|' as a placeholder + # for the space character so that the initial split will not tokenize them, and then replace that with the space char there-after. + # + script_command = "gdb -batch -return-child-result -q -ex run -ex thread|apply|all|bt --args python3".split() + script_command else: script_command = "/usr/bin/env python3".split() + script_command - logging.info(f"Execute: {script_command}") + final_script_command = [i.replace('|', ' ') for i in script_command] + + logging.info(f"Execute: {final_script_command}") test_script_process = subprocess.Popen( - script_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + final_script_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) DumpProgramOutputToQueue(log_cooking_threads, Fore.GREEN + "TEST" + Style.RESET_ALL, test_script_process, log_queue) diff --git a/src/controller/python/chip/native/CommonStackInit.cpp b/src/controller/python/chip/native/CommonStackInit.cpp index ef1dca2c875c5d..f01bcf8799fbfb 100644 --- a/src/controller/python/chip/native/CommonStackInit.cpp +++ b/src/controller/python/chip/native/CommonStackInit.cpp @@ -49,6 +49,17 @@ struct __attribute__((packed)) PyCommonStackInitParams uint32_t mBluetoothAdapterId; }; +/** + * Function to artifically cause a crash to happen + * that can be used in place of os.exit() in Python so that + * when run through GDB, you'll get a backtrace of what happened. + */ +void pychip_CauseCrash() +{ + uint8_t * ptr = nullptr; + *ptr = 0; +} + ChipError::StorageType pychip_CommonStackInit(const PyCommonStackInitParams * aParams) { ReturnErrorOnFailure(Platform::MemoryInit().AsInteger()); diff --git a/src/controller/python/test/test_scripts/base.py b/src/controller/python/test/test_scripts/base.py index 6c868c70d39665..90bc43cb064781 100644 --- a/src/controller/python/test/test_scripts/base.py +++ b/src/controller/python/test/test_scripts/base.py @@ -38,6 +38,7 @@ import chip.FabricAdmin import copy import secrets +import faulthandler logger = logging.getLogger('PythonMatterControllerTEST') logger.setLevel(logging.INFO) @@ -50,9 +51,27 @@ logger.addHandler(sh) -def TestFail(message): +def TestFail(message, doCrash=False): logger.fatal("Testfail: {}".format(message)) - os._exit(1) + + if (doCrash): + logger.fatal("--------------------------------") + logger.fatal("Backtrace of all Python threads:") + logger.fatal("--------------------------------") + + # + # Let's dump the Python backtrace for all threads, since the backtrace we'll + # get from gdb (if one is attached) won't give us good Python symbol information. + # + faulthandler.dump_traceback() + + # + # Cause a crash to happen so that we can actually get a meaningful + # backtrace when run through GDB. + # + chip.native.GetLibraryHandle().pychip_CauseCrash() + else: + os._exit(1) def FailIfNot(cond, message): @@ -143,7 +162,7 @@ def run(self): self._cv.wait(wait_time) wait_time = stop_time - time.time() if time.time() > stop_time: - TestFail("Timeout") + TestFail("Timeout", doCrash=True) class TestResult: diff --git a/src/test_driver/linux-cirque/MobileDeviceTest.py b/src/test_driver/linux-cirque/MobileDeviceTest.py index bb86da6dbf4631..fc461441034551 100755 --- a/src/test_driver/linux-cirque/MobileDeviceTest.py +++ b/src/test_driver/linux-cirque/MobileDeviceTest.py @@ -83,7 +83,7 @@ def run_controller_test(self): if device['type'] == 'MobileDevice'] for server in server_ids: - self.execute_device_cmd(server, "CHIPCirqueDaemon.py -- run gdb -return-child-result -q -ex \"set pagination off\" -ex run -ex \"bt 25\" --args {} --thread --discriminator {}".format( + self.execute_device_cmd(server, "CHIPCirqueDaemon.py -- run gdb -batch -return-child-result -q -ex \"set pagination off\" -ex run -ex \"thread apply all bt\" --args {} --thread --discriminator {}".format( os.path.join(CHIP_REPO, "out/debug/standalone/chip-all-clusters-app"), TEST_DISCRIMINATOR)) self.reset_thread_devices(server_ids) @@ -97,7 +97,7 @@ def run_controller_test(self): self.execute_device_cmd(req_device_id, "pip3 install {}".format(os.path.join( CHIP_REPO, "out/debug/linux_x64_gcc/controller/python/chip_repl-0.0-py3-none-any.whl"))) - command = "gdb -return-child-result -q -ex run -ex bt --args python3 {} -t 240 -a {} --paa-trust-store-path {}".format( + command = "gdb -batch -return-child-result -q -ex run -ex \"thread apply all bt\" --args python3 {} -t 240 -a {} --paa-trust-store-path {}".format( os.path.join( CHIP_REPO, "src/controller/python/test/test_scripts/mobile-device-test.py"), ethernet_ip, os.path.join(CHIP_REPO, MATTER_DEVELOPMENT_PAA_ROOT_CERTS))