From 5e5c68ffbe54724099f334f719209b4df8d43b6b Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Wed, 18 Sep 2024 05:36:23 -0600
Subject: [PATCH 1/4] CI: quadlet system test: be more forgiving

...of high system load (such as when running parallel tests).
Allow time for services to reach desired state, by retrying
a few times in a loop.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/252-quadlet.bats | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats
index 3ef297b869..8101dbce35 100644
--- a/test/system/252-quadlet.bats
+++ b/test/system/252-quadlet.bats
@@ -976,9 +976,16 @@ EOF
 
       service_setup $QUADLET_SERVICE_NAME
 
-      # FIXME: log.91: Starting, not Started
       # Ensure we have output. Output is synced via sd-notify (socat in Exec)
-      run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME"
+      # When running under heavy load (e.g. parallel tests), it
+      # may take a little while for service to reach Started
+      for tries in $(seq 1 5); do
+          run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME"
+          if [[ "$output" =~ "Started.*\.service" ]]; then
+              break
+          fi
+          sleep 1
+      done
       is "$output" '.*Started.*\.service.*'
 
       # Opportunistic test: confirm that the Propagation field got set.
@@ -1519,8 +1526,14 @@ EOF
     # Shutdown the service
     service_cleanup $pod_service inactive
 
-    # The service of the container should be active
-    run systemctl show --property=ActiveState "$container_service"
+    # It might take a few seconds to go inactive, esp. under heavy load
+    for tries in $(seq 1 5); do
+        run systemctl show --property=ActiveState "$container_service"
+        if [[ "$output" = "ActiveState=inactive" ]]; then
+            break
+        fi
+        sleep 1
+    done
     assert "ActiveState=inactive" \
            "quadlet - pod base: container service ActiveState"
 

From 1d5c8ac18eedd3c9d0dbf9561dc125858fdda0cd Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Wed, 18 Sep 2024 08:07:34 -0600
Subject: [PATCH 2/4] CI: system tests: always create pause image

...not just when running parallel Bats, because Bats
does not provide any way to know if we're parallel.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/helpers.bash | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 3b6b5b00ed..2370ed0a30 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -458,16 +458,16 @@ function clean_setup() {
         _prefetch $PODMAN_TEST_IMAGE_FQN
     fi
 
-    # When running in parallel, load (create, actually) the pause image.
-    # This way, all pod tests will have it available. Without this,
-    # parallel pod tests will leave behind <none>:<none> images.
+    # Load (create, actually) the pause image. This way, all pod tests will
+    # have it available. Without this, pod tests run in parallel will leave
+    # behind <none>:<none> images.
+    # FIXME: we have to do this always, because there's no way (in bats 1.11)
+    #        to tell if we're running in parallel. See bats-core#998
     # FIXME: #23292 -- this should not be necessary.
-    if [[ -n "$PARALLEL_JOBSLOT" ]]; then
-        run_podman pod create mypod
-        run_podman pod rm mypod
-        # And now, we have a pause image, and each test does not
-        # need to build their own.
-    fi
+    run_podman pod create mypod
+    run_podman pod rm mypod
+    # And now, we have a pause image, and each test does not
+    # need to build their own.
 }
 
 # END   setup/teardown tools

From 3396dabdf3ed47fa0d9de5364eed2cc0d47c72be Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Wed, 18 Sep 2024 11:32:36 -0600
Subject: [PATCH 3/4] CI: system tests: minor documentation on parallel

Only in 000-TEMPLATE. I know I need to write more thorough
documentation. I choose to defer that.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/000-TEMPLATE | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/test/system/000-TEMPLATE b/test/system/000-TEMPLATE
index cff5de7263..6d26d7e9a8 100644
--- a/test/system/000-TEMPLATE
+++ b/test/system/000-TEMPLATE
@@ -129,17 +129,34 @@ size      | -\\\?[0-9]\\\+
 # Whenever possible, please add the ci:parallel tag to new tests,
 # not only for speed but for stress testing.
 #
-# This is an example of what NOT to do when enabling parallel tests.
+# Some test files have '# bats file_tags=ci:parallel' at the top.
+#                              ^^^^---- instead of test_tags on each test
+# This indicates that ALL tests in the file run parallel, and if
+# you add new tests, you need to guarantee that your new test
+# will also run parallel-safe.
+#
+# Below is an example of what NOT to do when enabling parallel tests.
 #
 # bats test_tags=ci:parallel
 @test "this test is completely broken in parallel" {
-    # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead.
+    # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead:
+    #    cname="c-$(safename)"
+    #    run_podman --name $cname ...
     # Not only does that guarantee uniqueness, it also gives the test number
     # in the name, so if there's a leak (at end of tests) it will be possible
     # to identify the culprit.
     run_podman --name mycontainer $IMAGE top
 
-    # Same thing for build and -t
+    # Same thing for build and -t:
+    #    imgname="i-$(safename)"
+    #    run_podman build -t $imgname ...
+    # Ed's convention is "c-$(safename)" for containers, "i-" images,
+    # "n-" namespaces, "p-" pods, "v-" volumes, "z-" zebras.
+    # When there are multiple objects needed, it is slightly easier
+    # to differentiate in front rather than at the tail:
+    #    yes: c1="ctr1-$(safename)"
+    #         c2="ctr2-$(safename)"
+    #    no:  c1="ctr-$(safename)-1"
     run_podman build -t myimage ...
 
     # Never assume exact output from podman ps! Many other containers may be running.
@@ -149,6 +166,10 @@ size      | -\\\?[0-9]\\\+
     # Never use "-l". It is meaningless when other processes are running.
     run_podman container inspect -l
 
+    # "userns=auto" can NEVER be parallelized: multiple jobs running
+    # at once will cause "not enough unused IDs in user namespace"
+    run_podman run --userns=auto ....
+
     # Never 'rm -a'!!! OMG like seriously just don't.
     run_podman rm -f -a
 }

From e3af5a38d3416b1df1f996bcc3163bf796aeb79e Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Wed, 18 Sep 2024 10:26:45 -0600
Subject: [PATCH 4/4] CI: rm system test: bump grace period

The "rm on stopping containers" test is flaking under high load,
probably because I bumped up two timeouts in the healthcheck
container that it relies on. Bump up this test's timeout as well.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/055-rm.bats | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/system/055-rm.bats b/test/system/055-rm.bats
index 3673b7dc25..c7ca0707d7 100644
--- a/test/system/055-rm.bats
+++ b/test/system/055-rm.bats
@@ -150,7 +150,7 @@ function __run_healthcheck_container() {
     # We have no way to guarantee that we see 'stopping', but at a very
     # minimum we need to check at least one rm failure
     local rm_failures=0
-    for i in {1..10}; do
+    for i in {1..20}; do
         run_podman '?' rm $cname
         if [[ $status -eq 0 ]]; then
             break