From 5e5c68ffbe54724099f334f719209b4df8d43b6b Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Wed, 18 Sep 2024 05:36:23 -0600 Subject: [PATCH 1/4] CI: quadlet system test: be more forgiving ...of high system load (such as when running parallel tests). Allow time for services to reach desired state, by retrying a few times in a loop. Signed-off-by: Ed Santiago --- test/system/252-quadlet.bats | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats index 3ef297b869..8101dbce35 100644 --- a/test/system/252-quadlet.bats +++ b/test/system/252-quadlet.bats @@ -976,9 +976,16 @@ EOF service_setup $QUADLET_SERVICE_NAME - # FIXME: log.91: Starting, not Started # Ensure we have output. Output is synced via sd-notify (socat in Exec) - run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME" + # When running under heavy load (e.g. parallel tests), it + # may take a little while for service to reach Started + for tries in $(seq 1 5); do + run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME" + if [[ "$output" =~ "Started.*\.service" ]]; then + break + fi + sleep 1 + done is "$output" '.*Started.*\.service.*' # Opportunistic test: confirm that the Propagation field got set. @@ -1519,8 +1526,14 @@ EOF # Shutdown the service service_cleanup $pod_service inactive - # The service of the container should be active - run systemctl show --property=ActiveState "$container_service" + # It might take a few seconds to go inactive, esp. under heavy load + for tries in $(seq 1 5); do + run systemctl show --property=ActiveState "$container_service" + if [[ "$output" = "ActiveState=inactive" ]]; then + break + fi + sleep 1 + done assert "ActiveState=inactive" \ "quadlet - pod base: container service ActiveState" From 1d5c8ac18eedd3c9d0dbf9561dc125858fdda0cd Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Wed, 18 Sep 2024 08:07:34 -0600 Subject: [PATCH 2/4] CI: system tests: always create pause image ...not just when running parallel Bats, because Bats does not provide any way to know if we're parallel. Signed-off-by: Ed Santiago --- test/system/helpers.bash | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/system/helpers.bash b/test/system/helpers.bash index 3b6b5b00ed..2370ed0a30 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -458,16 +458,16 @@ function clean_setup() { _prefetch $PODMAN_TEST_IMAGE_FQN fi - # When running in parallel, load (create, actually) the pause image. - # This way, all pod tests will have it available. Without this, - # parallel pod tests will leave behind : images. + # Load (create, actually) the pause image. This way, all pod tests will + # have it available. Without this, pod tests run in parallel will leave + # behind : images. + # FIXME: we have to do this always, because there's no way (in bats 1.11) + # to tell if we're running in parallel. See bats-core#998 # FIXME: #23292 -- this should not be necessary. - if [[ -n "$PARALLEL_JOBSLOT" ]]; then - run_podman pod create mypod - run_podman pod rm mypod - # And now, we have a pause image, and each test does not - # need to build their own. - fi + run_podman pod create mypod + run_podman pod rm mypod + # And now, we have a pause image, and each test does not + # need to build their own. } # END setup/teardown tools From 3396dabdf3ed47fa0d9de5364eed2cc0d47c72be Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Wed, 18 Sep 2024 11:32:36 -0600 Subject: [PATCH 3/4] CI: system tests: minor documentation on parallel Only in 000-TEMPLATE. I know I need to write more thorough documentation. I choose to defer that. Signed-off-by: Ed Santiago --- test/system/000-TEMPLATE | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/test/system/000-TEMPLATE b/test/system/000-TEMPLATE index cff5de7263..6d26d7e9a8 100644 --- a/test/system/000-TEMPLATE +++ b/test/system/000-TEMPLATE @@ -129,17 +129,34 @@ size | -\\\?[0-9]\\\+ # Whenever possible, please add the ci:parallel tag to new tests, # not only for speed but for stress testing. # -# This is an example of what NOT to do when enabling parallel tests. +# Some test files have '# bats file_tags=ci:parallel' at the top. +# ^^^^---- instead of test_tags on each test +# This indicates that ALL tests in the file run parallel, and if +# you add new tests, you need to guarantee that your new test +# will also run parallel-safe. +# +# Below is an example of what NOT to do when enabling parallel tests. # # bats test_tags=ci:parallel @test "this test is completely broken in parallel" { - # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead. + # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead: + # cname="c-$(safename)" + # run_podman --name $cname ... # Not only does that guarantee uniqueness, it also gives the test number # in the name, so if there's a leak (at end of tests) it will be possible # to identify the culprit. run_podman --name mycontainer $IMAGE top - # Same thing for build and -t + # Same thing for build and -t: + # imgname="i-$(safename)" + # run_podman build -t $imgname ... + # Ed's convention is "c-$(safename)" for containers, "i-" images, + # "n-" namespaces, "p-" pods, "v-" volumes, "z-" zebras. + # When there are multiple objects needed, it is slightly easier + # to differentiate in front rather than at the tail: + # yes: c1="ctr1-$(safename)" + # c2="ctr2-$(safename)" + # no: c1="ctr-$(safename)-1" run_podman build -t myimage ... # Never assume exact output from podman ps! Many other containers may be running. @@ -149,6 +166,10 @@ size | -\\\?[0-9]\\\+ # Never use "-l". It is meaningless when other processes are running. run_podman container inspect -l + # "userns=auto" can NEVER be parallelized: multiple jobs running + # at once will cause "not enough unused IDs in user namespace" + run_podman run --userns=auto .... + # Never 'rm -a'!!! OMG like seriously just don't. run_podman rm -f -a } From e3af5a38d3416b1df1f996bcc3163bf796aeb79e Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Wed, 18 Sep 2024 10:26:45 -0600 Subject: [PATCH 4/4] CI: rm system test: bump grace period The "rm on stopping containers" test is flaking under high load, probably because I bumped up two timeouts in the healthcheck container that it relies on. Bump up this test's timeout as well. Signed-off-by: Ed Santiago --- test/system/055-rm.bats | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/system/055-rm.bats b/test/system/055-rm.bats index 3673b7dc25..c7ca0707d7 100644 --- a/test/system/055-rm.bats +++ b/test/system/055-rm.bats @@ -150,7 +150,7 @@ function __run_healthcheck_container() { # We have no way to guarantee that we see 'stopping', but at a very # minimum we need to check at least one rm failure local rm_failures=0 - for i in {1..10}; do + for i in {1..20}; do run_podman '?' rm $cname if [[ $status -eq 0 ]]; then break