.github/workflows/orquesta-integration-tests.yaml

# We run orquesta integration tests as part of a separate workflow.
# Orquesta tests have a lot of race conditions which result in intermediate failures and timeouts.
# Utilizing separate workflow allows us to re-run just this workflow / job on failure instead of
# wasting time and resources by needing to re-run all the jobs.
name: Orquesta CI

on:
  push:
    branches:
      # only on merges to master branch
      - master
      # and version branches, which only include minor versions (eg: v3.4)
      - v[0-9]+.[0-9]+
    tags:
      # also version tags, which include bugfix releases (eg: v3.4.0)
      - v[0-9]+.[0-9]+.[0-9]+
  pull_request:
    type: [opened, reopened, edited]
    branches:
      # Only for PRs targeting those branches
      - master
      - v[0-9]+.[0-9]+
  schedule:
    # run every night at midnight
    - cron:  '0 0 * * *'

jobs:
  # TODO: Fix the required checks!
  #       When the pre_job triggers and skips builds, it prevents merging the PR because
  #       the required checks are reported as skipped instead of passed.
  # Special job which automatically cancels old runs for the same branch, prevents runs for the
  # same file set which has already passed, etc.
  pre_job:
    name: Skip Duplicate Jobs Pre Job
    runs-on: ubuntu-20.04
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        uses: fkirc/skip-duplicate-actions@4c656bbdb6906310fa6213604828008bc28fe55d # v3.3.0
        with:
          cancel_others: 'true'
          github_token: ${{ github.token }}

  integration-tests:
    needs: pre_job
    # NOTE: We always want to run job on master since we run some additional checks there (code
    # coverage, etc)
    # if: ${{ needs.pre_job.outputs.should_skip != 'true' || github.ref == 'refs/heads/master' }}
    name: '${{ matrix.name }} - Python ${{ matrix.python-version-short }}'
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # NOTE: We need to use full Python version as part of Python deps cache key otherwise
        # setup virtualenv step will fail.
        include:
          - name: 'Integration Tests (Orquesta)'
            task: 'ci-orquesta'
            nosetests_node_total: 1
            nosetests_node_index: 0
            python-version-short: '3.8'
            python-version: '3.8.10'
          - name: 'Integration Tests (Orquesta)'
            task: 'ci-orquesta'
            nosetests_node_total: 1
            nosetests_node_index: 0
            python-version-short: '3.9'
            python-version: '3.9.14'
    services:
      mongo:
        image: mongo:7.0
        ports:
          - 27017:27017

      rabbitmq:
        image: rabbitmq:3.8-management
        options: >-
          --name rabbitmq
        ports:
          - 5671:5671/tcp   # AMQP SSL port
          - 5672:5672/tcp   # AMQP standard port
          - 15672:15672/tcp # Management: HTTP, CLI

      redis:
        # Docker Hub image
        image: redis
        # Set health checks to wait until redis has started
        options: >-
          --name "redis"
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 6379:6379/tcp

    env:
      TASK: '${{ matrix.task }}'
      NODE_TOTAL: '${{ matrix.nosetests_node_total }}'
      NODE_INDEX: '${{ matrix.nosetests_node_index }}'

      # We need to explicitly specify terminal width otherwise some CLI tests fail on container
      # environments where small terminal size is used.
      COLUMNS: '120'

      # CI st2.conf (with ST2_CI_USER user instead of stanley)
      ST2_CONF: 'conf/st2.ci.conf'

      # Tell StackStorm that we are indeed in CI mode, previously we hard coded a Travis specific
      # environment variable in our test code, making it a PITA when we switch CI providers.
      # Now, we simply set this environment varible here in the CI portion of our testing and
      # it avoids any CI provider type lock-in.
      ST2_CI: 'true'

      # Name of the user who is running the CI (on GitHub Actions this is 'runner')
      ST2_CI_USER: 'runner'

      # GitHub is juggling how to set vars for multiple shells. Protect our PATH assumptions.
      PATH: /home/runner/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Custom Environment Setup
        run: |
          ./scripts/github/setup-environment.sh
      - name: 'Set up Python (${{ matrix.python-version }}) and Cache Deps'
        uses: ./.github/actions/setup-python
        with:
          python-version: '${{ matrix.python-version }}'
      - name: Cache and Install APT Dependencies
        uses: ./.github/actions/apt-packages
      - name: Install virtualenv
        run: |
          ./scripts/github/install-virtualenv.sh
      - name: Install requirements
        run: |
          ./scripts/ci/install-requirements.sh
      - name: Setup Integration Tests
        run: |
          # prep a ci-specific dev conf file that uses runner instead of stanley
          # this user is the username of the user in GitHub actions, used for SSH, etc during
          # integration tests (important)
          cp conf/st2.dev.conf "${ST2_CONF}" ; sed -i -e "s/stanley/${ST2_CI_USER}/" "${ST2_CONF}"

          sudo -E ./scripts/ci/add-itest-user-key.sh
      - name: Permissions Workaround
        run: |
          echo "$ST2_CI_REPO_PATH"
          sudo ST2_CI_REPO_PATH="${ST2_CI_REPO_PATH}" scripts/ci/permissions-workaround.sh
      - name: Print versions
        run: |
          ./scripts/ci/print-versions.sh
      - name: make
        timeout-minutes: 41
        env:
          MAX_ATTEMPTS: 3
          RETRY_DELAY: 5
        # use: script -e -c to print colors
        run: |
          # There is a race in some orequesta integration tests so they tend to fail quite often.
          # To avoid needed to re-run whole workflow in such case, we should try to retry this
          # specific step. This saves us a bunch of time manually re-running the whole workflow.
          # TODO: Try to identify problematic tests (iirc mostly orquesta ones) and only retry /
          # re-run those.
          set +e
          for i in $(seq 1 ${MAX_ATTEMPTS}); do
            echo "Attempt: ${i}/${MAX_ATTEMPTS}"
            script -e -c "timeout 10m make ${TASK}" && exit 0
            exit_code=$?
            echo "Command failed / timed out (exit_code=${exit_code}), will retry in ${RETRY_DELAY} seconds..."
            sleep ${RETRY_DELAY}
          done
          set -e
          echo "Failed after ${MAX_ATTEMPTS} attempts, failing the job."
          exit 1
      - name: Compress Service Logs Before upload
        if: ${{ failure() }}
        run: |
          ./tools/launchdev.sh stop # stop st2 before collecting logs
          tar cvzpf logs.tar.gz logs/*
      - name: Upload StackStorm services Logs
        if: ${{ failure() }}
        uses: actions/upload-artifact@v4
        with:
          name: logs-py${{ matrix.python-version }}
          path: logs.tar.gz
          retention-days: 7

  slack-notification:
    name: Slack notification for failed master builds
    if: always()
    needs:
      - integration-tests
    runs-on: ubuntu-20.04
    steps:
      - name: Workflow conclusion
        # this step creates an environment variable WORKFLOW_CONCLUSION and is the most reliable way to check the status of previous jobs
        uses: technote-space/workflow-conclusion-action@v2
      - name: CI Run Failure Slack Notification
        if: ${{ env.WORKFLOW_CONCLUSION == 'failure' && github.ref == 'refs/heads/master' }}
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        uses: voxmedia/github-action-slack-notify-build@v1
        with:
          channel: development
          status: FAILED
          color: danger

      # HELPER FOR FUTURE DEVELOPERS:
      #  If your GitHub Actions job is failing and you need to debug it, by default there is
      #  no way to SSH into the container.
      #  The step below can be uncommeted and will stop here and allow you to SSH in.
      #  When this step is reached, simply refresh the GitHub Actions output for this build
      #  and this SSH command will be printed every 5 seconds to the output.
      #  Once you are done debugging in your SSH session, simply: touch /continue
      #  and this will continue the build.
      #
      # - name: Setup tmate session for debugging failed jobs (allows SSH into the container)
      #   uses: mxschmitt/action-tmate@v3
      #   if: "${{ failure() }}"
      #