tests: teach runtime runner about multiple before clauses

As noted in bpftrace#2402, usdt flakiness was made better by 508538a but not fully fixed. This commit is what I should have done all along: it allows the test runner to parse and wait for multiple BEFORE clauses and thus ensures the processes have started before the test runs. There are two minor changes: 1. The check for child processes is now `ps --ppid` based to eventually allow parallel process runs in the same environment. That requires to use `ps` from the `procps` package on Alpine as the default BusyBox one doesn't have the `--ppid` option. 2. Because of the `ps` usage, the name check is now truncated to 15 chars, which will fail if TASK_COMM_LEN is not 16. That looks like a constant in the kernel, so I think we're good.
BurntBrunch · May 24, 2023 · 16f124e · 16f124e
1 parent fb1e082
commit 16f124e
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 45 deletions.
diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine
@@ -29,6 +29,7 @@ RUN apk add --update \
   llvm${LLVM_VERSION}-dev \
   llvm${LLVM_VERSION}-static \
   musl-obstack-dev \
+  procps \
   python3 \
   wget \
   xxd \

diff --git a/tests/README.md b/tests/README.md
@@ -58,7 +58,8 @@ Each runtime testcase consists of multiple directives. In no particular order:
   field is required.
 * `TIMEOUT`: The timeout for the testcase (in seconds). This field is required.
 * `BEFORE`: Run the command in a shell before running bpftrace. The command
-  will be terminated after testcase is over.
+  will be terminated after testcase is over. Can be used multiple times,
+  commands will run in parallel.
 * `AFTER`: Run the command in a shell after running bpftrace. The command will
   be terminated after the testcase is over.
 * `CLEANUP`: Run the command in a shell after test is over. This holds any

diff --git a/tests/runtime/engine/parser.py b/tests/runtime/engine/parser.py
@@ -25,7 +25,7 @@ class InvalidFieldError(Exception):
         'prog',
         'expect',
         'timeout',
-        'before',
+        'befores',
         'after',
         'cleanup',
         'suite',
@@ -102,7 +102,7 @@ def __read_test_struct(test, test_suite):
         prog = ''
         expect = ''
         timeout = ''
-        before = ''
+        befores = []
         after = ''
         cleanup = ''
         kernel_min = ''
@@ -130,7 +130,7 @@ def __read_test_struct(test, test_suite):
             elif item_name == 'TIMEOUT':
                 timeout = int(line.strip(' '))
             elif item_name == 'BEFORE':
-                before = line
+                befores.append(line)
             elif item_name == 'AFTER':
                 after = line
             elif item_name == 'CLEANUP':
@@ -198,7 +198,7 @@ def __read_test_struct(test, test_suite):
             prog,
             expect,
             timeout,
-            before,
+            befores,
             after,
             cleanup,
             test_suite,

diff --git a/tests/runtime/engine/runner.py b/tests/runtime/engine/runner.py
@@ -158,12 +158,14 @@ def run_test(test):
         signal.signal(signal.SIGALRM, Runner.__handler)
 
         p = None
-        before = None
+        befores = []
         bpftrace = None
         after = None
         cleanup = None
         try:
+            result = None
             timeout = False
+            output = ""
 
             print(ok("[ RUN      ] ") + "%s.%s" % (test.suite, test.name))
             if test.requirement:
@@ -196,22 +198,45 @@ def run_test(test):
                         print(warn("[   SKIP   ] ") + "%s.%s" % (test.suite, test.name))
                         return Runner.SKIP_FEATURE_REQUIREMENT_UNSATISFIED
 
-            if test.before:
-                before = subprocess.Popen(test.before, shell=True, preexec_fn=os.setsid)
-                waited=0
+            if test.befores:
+                for before in test.befores:
+                    before = subprocess.Popen(before.split(), preexec_fn=os.setsid)
+                    befores.append(before)
+
                 with open(os.devnull, 'w') as dn:
-                    # This might not work for complicated cases, such as if
-                    # a test program needs to accept arguments. It covers the
-                    # current simple calls with no arguments
-                    child_name = os.path.basename(test.before.split()[-1])
-                    while subprocess.call(["pidof", child_name], stdout=dn, stderr=dn) != 0:
+                    child_names = [os.path.basename(x.strip().split()[-1]) for x in test.befores]
+                    child_names = sorted((x[:15] for x in child_names))  # cut to comm length
+                    print(f"child_names: %{child_names}")
+
+                    # Print the names of all of our children and look
+                    # for the ones from BEFORE clauses
+                    waited=0
+                    while waited <= test.timeout:
+                        children = subprocess.run(["ps", "--ppid", str(os.getpid()), "--no-headers", "-o", "comm"],
+                                                  check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                        if children.returncode == 0 and children.stdout:
+                            lines = [line.decode('utf-8') for line in children.stdout.splitlines()]
+                            lines = sorted((line.strip() for line in lines if line != 'ps'))
+                            print(f"lines: %{lines}")
+                            if lines == child_names:
+                                break
+                        else:
+                            print(children.stderr)
+
                         time.sleep(0.1)
                         waited+=0.1
-                        if waited > test.timeout:
-                            raise TimeoutError('Timed out waiting for BEFORE %s ', test.before)
+
+                    if waited > test.timeout:
+                        raise TimeoutError(f'Timed out waiting for BEFORE(s) {test.befores}')
 
             bpf_call = Runner.prepare_bpf_call(test)
-            if test.before and '{{BEFORE_PID}}' in bpf_call:
+            if test.befores and '{{BEFORE_PID}}' in bpf_call:
+                if len(test.befores) > 1:
+                    raise ValueError(f"test has {len(test.befores)} BEFORE clauses but BEFORE_PID usage requires exactly one")
+
+                child_name = test.befores[0].strip().split()[-1]
+                child_name = os.path.basename(child_name)
+
                 childpid = subprocess.Popen(["pidof", child_name], stdout=subprocess.PIPE, universal_newlines=True).communicate()[0].split()[0]
                 bpf_call = re.sub("{{BEFORE_PID}}", str(childpid), bpf_call)
             env = {
@@ -235,8 +260,6 @@ def run_test(test):
 
             signal.alarm(ATTACH_TIMEOUT)
 
-            output = ""
-
             while p.poll() is None:
                 nextline = p.stdout.readline()
                 output += nextline
@@ -269,15 +292,18 @@ def run_test(test):
                     os.killpg(os.getpgid(p.pid), signal.SIGTERM)
                 output += p.communicate()[0]
                 result = re.search(test.expect, output)
-            if not result:
-                print(fail("[  TIMEOUT ] ") + "%s.%s" % (test.suite, test.name))
-                print('\tCommand: %s' % bpf_call)
-                print('\tTimeout: %s' % test.timeout)
-                print('\tCurrent output: %s' % output)
-                return Runner.TIMEOUT
+
+                if not result:
+                    print(fail("[  TIMEOUT ] ") + "%s.%s" % (test.suite, test.name))
+                    print('\tCommand: %s' % bpf_call)
+                    print('\tTimeout: %s' % test.timeout)
+                    print('\tCurrent output: %s' % output)
+                    return Runner.TIMEOUT
         finally:
-            if before and before.poll() is None:
-                os.killpg(os.getpgid(before.pid), signal.SIGKILL)
+            if befores:
+                for before in befores:
+                    if before.poll() is None:
+                        os.killpg(os.getpgid(before.pid), signal.SIGKILL)
 
             if bpftrace and bpftrace.poll() is None:
                 os.killpg(os.getpgid(p.pid), signal.SIGKILL)

diff --git a/tests/runtime/usdt b/tests/runtime/usdt
@@ -247,7 +247,8 @@ NAME "usdt probes - file based semaphore activation multi process"
 RUN {{BPFTRACE}} runtime/scripts/usdt_file_activation_multiprocess.bt --usdt-file-activation
 EXPECT found 2 processes
 TIMEOUT 5
-BEFORE ./testprogs/two_usdt_semaphore_tests
+BEFORE ./testprogs/usdt_semaphore_test
+BEFORE ./testprogs/usdt_semaphore_test
 REQUIRES ./testprogs/usdt_semaphore_test should_not_skip
 
 NAME "usdt probes - list probes by pid in separate mountns"

diff --git a/tests/testprogs/two_usdt_semaphore_tests.c b/tests/testprogs/two_usdt_semaphore_tests.c