diff --git a/tests/system_tests_tcp_adaptor.py b/tests/system_tests_tcp_adaptor.py index 87d3f207c..f1224a95f 100644 --- a/tests/system_tests_tcp_adaptor.py +++ b/tests/system_tests_tcp_adaptor.py @@ -507,7 +507,7 @@ def router(name, mode, connection, extra=None, ssl=False, encapsulation="legacy" cls.routers.append(cls.tester.qdrouterd(name, config, wait=True)) # monitor router memory usage: - os.environ["SKUPPER_ROUTER_ALLOC_MONITOR_SECS"] = "1" + os.environ["SKUPPER_ROUTER_ALLOC_MONITOR_SECS"] = "10" cls.routers = [] cls.test_ssl = test_ssl @@ -1443,14 +1443,12 @@ def test_90_stats(self): self.logger.log(tname + " SUCCESS") @unittest.skipIf(DISABLE_SELECTOR_TESTS, DISABLE_SELECTOR_REASON) - def test_100_memory_metrics(self): + def test_9999_memory_metrics(self): """ Take advantage of the long running TCP test to verify that alloc_pool metrics have been correctly written to the logs """ - mem_re = re.compile(r' ram:[0-9]+\.[0-9]+[BKMGTi]+ vm:[0-9]+\.[0-9]+[BKMGTi]+ rss:[0-9]+\.[0-9]+[BKMGTi]+ pool:[0-9]+\.[0-9]+[BKMGTi]+') - action_re = re.compile(r' qdr_action_t:[0-9]+:[0-9]+') - for router in self.routers: + def _poll_logs(router, regex_mem, regex_action): last_mem_match = None # match the start of the alloc log line last_action_match = None # match the qdr_action_t entry in the log line with open(router.logfile_path, 'rt') as log_file: @@ -1461,8 +1459,12 @@ def test_100_memory_metrics(self): m = action_re.search(line) if m: last_action_match = m - self.assertIsNotNone(last_mem_match, "failed to find alloc_pool output!") - self.assertIsNotNone(last_action_match, "failed to find qdr_action_t entry!") + if last_mem_match is None: + print("failed to find alloc_pool output, retrying...") + return False + if last_action_match is None: + print("failed to find qdr_action_t entry, retrying...") + return False # Sanity check that metrics are present: @@ -1470,14 +1472,28 @@ def test_100_memory_metrics(self): mems = last_mem_match.group().strip().split() for mem in mems: name, value = mem.split(':') - self.assertIn(name, ["ram", "vm", "rss", "pool"]) - self.assertTrue(int(value.split('.')[0]) > 0, - f"Expected nonzero {name} counter!") + if name not in ["ram", "vm", "rss", "pool"]: + print(f"failed to find {name} metric, retrying...") + return False + value = int(value.split('.')[0]) + if value <= 0: + print(f"Expected nonzero {name} counter, got {value}, retrying...") + return False # match = ' qdr_action_t:192:0' name, in_use, in_free = last_action_match.group().strip().split(':') - self.assertEqual(name, "qdr_action_t", f"Name mismatch {name}") - self.assertTrue(int(in_use) + int(in_free) > 0, - f"zero alloced? {in_use} {in_free}") + if name != "qdr_action_t": + print(f"Name mismatch: {name} is not 'qdr_action_t, retrying...") + return False + if int(in_use) + int(in_free) <= 0: + print(f"zero qdr_action_ts alloced? in_use={in_use} in_free={in_free}") + return False + return True + + mem_re = re.compile(r' ram:[0-9]+\.[0-9]+[BKMGTi]+ vm:[0-9]+\.[0-9]+[BKMGTi]+ rss:[0-9]+\.[0-9]+[BKMGTi]+ pool:[0-9]+\.[0-9]+[BKMGTi]+') + action_re = re.compile(r' qdr_action_t:[0-9]+:[0-9]+') + for router in self.routers: + retry(lambda rtr=router, mre=mem_re, are=action_re: _poll_logs(rtr, mre, are), + delay=0.5, max_delay=5.0) class TcpAdaptor(TcpAdaptorBase, CommonTcpTests):