From ca8ee7bdee5b0f7600874d8cc2483cc2cc447cf4 Mon Sep 17 00:00:00 2001 From: Barak Davidov Date: Sun, 22 Dec 2024 15:49:50 +0200 Subject: [PATCH] atom runner update Signed-off-by: Barak Davidov --- .env | 2 +- .github/workflows/build-container.yml | 21 ++--- tests/atom/atomEnvInit.sh | 85 -------------------- tests/atom/clusterBuildTestsRun.sh | 107 ++++++++++++++++++-------- tests/atom/cpArtifactAndCleanup.sh | 19 +++-- 5 files changed, 96 insertions(+), 138 deletions(-) delete mode 100755 tests/atom/atomEnvInit.sh diff --git a/.env b/.env index fbe9153e..488ad8b7 100644 --- a/.env +++ b/.env @@ -71,7 +71,7 @@ CEPH_SHA=cca9372b693950885619d0b83b1f02aafac7b5ac CEPH_DEVEL_MGR_PATH=../ceph # Atom -ATOM_SHA=3c0b7531fd1022d97d5600a8ead51992e2a40ec0 +ATOM_SHA=fcbc692797c2be76ff531660344374e620ef09c8 # Demo settings RBD_POOL=rbd diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 32c7a26c..0c925899 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -845,34 +845,23 @@ jobs: atom: needs: [build, build-ceph] - if: github.repository == 'ceph/ceph-nvmeof' - runs-on: ibmcloud-1 + # if: github.repository == 'ceph/ceph-nvmeof' + runs-on: atomRunner steps: - name: Checkout code uses: actions/checkout@v4 - - name: Atom env initialization - run: | - . .env - ACTION_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" - ./tests/atom/atomEnvInit.sh $ATOM_SHA $ACTION_URL - - name: Download container images uses: actions/download-artifact@v4 with: pattern: container_images_nvmeof merge-multiple: true - - name: Load container images - run: | - docker load < nvmeof.tar - docker load < nvmeof-cli.tar - - name: Cluster build and Atom tests run - if: always() || failure() run: | . .env - ./tests/atom/clusterBuildTestsRun.sh $NVMEOF_VERSION $CEPH_SHA $ATOM_SHA + ACTION_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./tests/atom/clusterBuildTestsRun.sh $NVMEOF_VERSION $CEPH_SHA $ATOM_SHA $ACTION_URL - name: Atom artifact build if: always() @@ -882,7 +871,7 @@ jobs: if: always() with: name: atom-artifact - path: /tmp/artifact/* + path: /home/cephnvme/artifact.tar.gz push-images-to-ceph-registry: if: github.event_name == 'release' diff --git a/tests/atom/atomEnvInit.sh b/tests/atom/atomEnvInit.sh deleted file mode 100755 index 833857ac..00000000 --- a/tests/atom/atomEnvInit.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash - -ATOM_SHA=$1 -ACTION_URL=$2 -RUNNER_FILDER='/home/cephnvme/actions-runner-ceph' - -cleanup_docker_images() { - local HOST=$1 - ssh -o StrictHostKeyChecking=no root@$HOST << EOF - sudo docker ps -q | xargs -r sudo docker stop - sudo docker ps -q | xargs -r sudo docker rm -f - sudo yes | sudo docker system prune -fa - sudo docker ps - sudo docker images -EOF -} - -# Remove previous run data -rm -rf $RUNNER_FILDER/ceph-nvmeof-atom -sudo rm -rf /root/.ssh/atom_backup/artifact/multiIBMCloudServers_m2/* - -# Check if cluster is busy with another run -while true; do - if [ -f "/home/cephnvme/busyServer.txt" ]; then - echo "The server is busy with another github action job, please wait..." - sleep 90 - else - echo "The server is available for use!" - echo $ACTION_URL > /home/cephnvme/busyServer.txt - chmod +rx /home/cephnvme/busyServer.txt - break - fi -done - -# Cleanup docker images -sudo docker ps -q | xargs -r sudo docker stop; sudo docker ps -q | xargs -r sudo docker rm -f; sudo yes | docker system prune -fa; docker ps; docker images - -# Cloning atom repo -cd $RUNNER_FILDER -git clone git@github.ibm.com:NVME-Over-Fiber/ceph-nvmeof-atom.git -if [ $? -ne 0 ]; then - echo "Error: Failed to clone the atom repository." - exit 1 -fi - -# Switch to given SHA -cd ceph-nvmeof-atom -git checkout $ATOM_SHA -if [ $? -ne 0 ]; then - echo "Error: Failed to checkout the specified SHA." - exit 1 -fi - -# Build atom images based on the cloned repo -docker build -t nvmeof_atom:$ATOM_SHA $RUNNER_FILDER/ceph-nvmeof-atom -if [ $? -ne 0 ]; then - echo "Error: Failed to build Docker image." - exit 1 -fi - -# Remove ceph cluster -docker run -v /root/.ssh:/root/.ssh nvmeof_atom:$ATOM_SHA ansible-playbook -i custom_inventory.ini cephnvmeof_remove_cluster.yaml --extra-vars 'SELECTED_ENV=multiIBMCloudServers_m2' -if [ $? -ne 0 ]; then - echo "Error: Failed to run cephnvmeof_remove_cluster ansible-playbook." - exit 1 -fi - -# Cleanup remain images after ceph cluster removal -HOSTS=("cephnvme-vm9" "cephnvme-vm7" "cephnvme-vm6" "cephnvme-vm1") -for HOST in "${HOSTS[@]}"; do - echo "Cleaning up Docker images on $HOST" - cleanup_docker_images "$HOST" - if [ $? -ne 0 ]; then - echo "Error: Failed to clean up Docker images on $HOST." - fi -done - -echo "Cleaning up Podman containers and images on installer" -sudo podman ps -q | xargs -r sudo podman stop -sudo podman ps -q | xargs -r sudo podman rm -f -sudo podman rmi -f $(sudo podman images -q) -sudo yes | podman system prune -fa -echo "show exist podman images/containers (should be empty)" -sudo podman ps -sudo podman images \ No newline at end of file diff --git a/tests/atom/clusterBuildTestsRun.sh b/tests/atom/clusterBuildTestsRun.sh index 4cf5d863..a7a6119f 100755 --- a/tests/atom/clusterBuildTestsRun.sh +++ b/tests/atom/clusterBuildTestsRun.sh @@ -1,5 +1,11 @@ #!/bin/bash +# if a command fails (returns a non-zero exit code), terminate immediately +# the exit code will be the same as the exit code of the failed command. +# see https://github.com/ceph/ceph-nvmeof/actions/runs/11928539421/job/33246031083 +set -e + + VERSION=$1 if [ "$2" = "latest" ]; then CEPH_SHA=$(curl -s https://shaman.ceph.com/api/repos/ceph/main/latest/centos/9/ | jq -r ".[] | select(.archs[] == \"$(uname -m)\" and .status == \"ready\") | .sha1") @@ -7,38 +13,77 @@ else CEPH_SHA=$2 fi ATOM_SHA=$3 +ACTION_URL=$4 +NIGHTLY=$5 + +RUNNER_FILDER='/home/cephnvme/actions-runner-ceph' + +# Check if cluster is busy with another run +while true; do + if [ -f "/home/cephnvme/busyServer.txt" ]; then + echo "The server is busy with another github action job, please wait..." + sleep 90 + else + echo "The server is available for use!" + echo $ACTION_URL > /home/cephnvme/busyServer.txt + chmod +rx /home/cephnvme/busyServer.txt + break + fi +done + +# Remove previous run data +hostname +rm -rf $RUNNER_FILDER/ceph-nvmeof-atom +sudo rm -rf /root/.ssh/atom_backup/artifact/multiIBMCloudServers_m6/* +sudo ls -lta /root/.ssh/atom_backup/artifact/multiIBMCloudServers_m6 + +# Cloning atom repo +cd $RUNNER_FILDER +git clone git@github.ibm.com:NVME-Over-Fiber/ceph-nvmeof-atom.git + +# Switch to given SHA +cd ceph-nvmeof-atom +git checkout $ATOM_SHA + +# Build atom images based on the cloned repo +docker build -t nvmeof_atom:$ATOM_SHA . -# Atom test script run -# Description of the uncleared flags with their default values -# - Upgrade ceph image target (None) -# - Upgrade nvmeof image target (None) -# - Nvmeof cli image use in target (None) -# - Number of gateways (4) -# - Number of gateways to stop (1) -# - Number of gateways after scale down (1) -# - Number of subsystems (2) -# - Number of namespaces (4) -# - Max namespaces per subsystem (1024) -# - HA failover cycles (2) -# - HA failover cycles after upgrade (2) -# - RBD size (200M) -# - Seed number (0) -# - FIO use (1=run fio, 0=don't run fio) +set -x sudo docker run \ -v /root/.ssh:/root/.ssh \ nvmeof_atom:"$ATOM_SHA" \ - python3 cephnvme_atom.py \ - quay.ceph.io/ceph-ci/ceph:"$CEPH_SHA" \ - quay.io/ceph/nvmeof:"$VERSION" \ - quay.io/ceph/nvmeof-cli:"$VERSION" \ - None None None None None None 4 1 1 2 4 1024 2 2 200M 0 1 20 20 1 \ - --stopNvmeofDaemon \ - --stopNvmeofSystemctl \ - --stopMonLeader \ - --rmNvmeofDaemon \ - --gitHubActionDeployment \ - --dontUseMTLS \ - --skiplbTest \ - --journalctlToConsole \ - --dontPowerOffCloudVMs noKey noKey \ - --multiIBMCloudServers_m2 + python3 atom.py \ + --project=nvmeof \ + --ceph-img=quay.ceph.io/ceph-ci/ceph:"$CEPH_SHA" \ + --gw-img=quay.io/ceph/nvmeof:"$VERSION" \ + --cli-img=quay.io/ceph/nvmeof-cli:"$VERSION" \ + --initiators=1 \ + --gw-group-num=1 \ + --gw-num=4 \ + --gw-to-stop-num=1 \ + --gw-scale-down-num=1 \ + --subsystem-num=2 \ + --ns-num=4 \ + --subsystem-max-ns-num=1024 \ + --failover-num=2 \ + --failover-num-after-upgrade=2 \ + --rbd-size=200M \ + --fio-devices-num=1 \ + --lb-timeout=20 \ + --config-dbg-mon=10 \ + --config-dbg-ms=1 \ + --nvmeof-daemon-stop \ + --nvmeof-systemctl-stop \ + --mon-leader-stop \ + --mon-client-kill \ + --nvmeof-daemon-remove \ + --redeploy-gws \ + --github-action-deployment \ + --skip-di-test \ + --skip-lb-group-change-test \ + --skip-block-list-test \ + --skip-ns-rebalancing-test \ + --journalctl-to-console \ + --dont-power-off-cloud-vms \ + --env=m6 +set +x diff --git a/tests/atom/cpArtifactAndCleanup.sh b/tests/atom/cpArtifactAndCleanup.sh index 30760dd0..0c426ca7 100755 --- a/tests/atom/cpArtifactAndCleanup.sh +++ b/tests/atom/cpArtifactAndCleanup.sh @@ -1,7 +1,16 @@ #!/bin/bash -sudo rm -rf /tmp/artifact/multiIBMCloudServers_m2 -sudo cp -r /root/.ssh/atom_backup/artifact/multiIBMCloudServers_m2 /tmp/artifact -sudo ls -lta /tmp/artifact -sudo chmod -R +rx /tmp/artifact -rm -rf /home/cephnvme/busyServer.txt +sudo rm -rf /home/cephnvme/artifact/* +sudo ls -lta /home/cephnvme/artifact + +sudo rm -rf /home/cephnvme/artifact.tar.gz +sudo ls -lta /home/cephnvme/ + +sudo cp -r /root/.ssh/atom_backup/artifact/multiIBMCloudServers_m6 /home/cephnvme/artifact +sudo ls -lta /home/cephnvme/artifact + +sudo tar -czf /home/cephnvme/artifact.tar.gz -C /home/cephnvme/artifact . +sudo ls -lta /home/cephnvme/artifact +sudo ls -lta /home/cephnvme +sudo chmod +rx /home/cephnvme/artifact.tar.gz +sudo rm -rf /home/cephnvme/busyServer.txt