.github/workflows/deploy-infra.yaml

name: Deploy Cellenics infrastructure on AWS
# This workflow is started manually (workflow_dispatch); the inputs below choose
# the target environments and which of the jobs run.
on:
  workflow_dispatch:
    inputs:
      # Passed to the load-config reusable workflow as environment_name.
      environment_name:
        type: string
        description: Select the environment name to run the actions on
        required: true
        default: all
      # Compared in the `if:` conditions of the create-eks-cluster,
      # configure-cluster and deploy-monitoring jobs to decide which of them run.
      workflow_actions:
        type: choice
        description: Select actions to perform
        options:
          - create and configure cluster
          - configure cluster
          - deploy monitoring
        default: configure cluster
      # Passed to the load-config reusable workflow as environment_type.
      environment_type:
        type: choice
        description: Select environment type
        options:
          - staging
          - production
          - staging and production
        default: staging

# A fixed concurrency group: only one workflow run using this group name
# can be in progress at once, which prevents concurrent cluster updates
# from leaving the infrastructure in an undefined state.
concurrency: cluster-update-mutex

# id-token: write allows jobs to request an OIDC token — presumably what the
# configure-aws-credentials steps rely on for role-to-assume (confirm).
# contents: read is enough for checking out the repository.
permissions:
  id-token: write
  contents: read

# After load-config and check-secrets jobs are finished:
#   "create and configure cluster" workflow_actions option runs the create-eks-cluster and configure-cluster jobs
#   "configure cluster" workflow_actions option runs only configure-cluster job
#   "deploy monitoring" workflow_actions option runs only deploy-monitoring job
jobs:
  # Calls the reusable load-config workflow with the selected environment name
  # and type. Later jobs read its `environment_names` and `deployment_matrix`
  # outputs to build their matrices.
  load-config:
    uses: ./.github/workflows/load-config.yaml
    with:
      environment_name: ${{ github.event.inputs.environment_name }}
      environment_type: ${{ github.event.inputs.environment_type }}

  # Fails early, once per environment name returned by load-config, when any of
  # the secrets the later jobs depend on is empty or undefined.
  check-secrets:
    name: Check that sufficient secrets are specified for environment name
    runs-on: ubuntu-20.04
    needs: load-config
    strategy:
      matrix:
        environment_name: ${{ fromJson(needs.load-config.outputs.environment_names) }}
    environment: ${{ matrix.environment_name }}
    steps:
      - id: check-secrets-for-environment
        name: Check if necessary secrets are installed.
        # Secrets are handed to the script as environment variables rather than
        # being expanded into the script text, so a value containing quotes or
        # shell metacharacters cannot alter the script that gets executed.
        env:
          ACM_CERTIFICATE_ARN: ${{ secrets.ACM_CERTIFICATE_ARN }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
          PRIMARY_DOMAIN_NAME: ${{ secrets.PRIMARY_DOMAIN_NAME }}
          DOMAIN_NAME: ${{ secrets.DOMAIN_NAME }}
        run: |-
          echo Checking if secrets are defined in the repository.
          # ERROR stays empty unless at least one secret is missing.
          ERROR=
          if [ -z "$ACM_CERTIFICATE_ARN" ]
          then
            echo AWS certificate ARN is not defined.
            ERROR=true
          fi
          if [ -z "$AWS_ACCOUNT_ID" ]
          then
            echo AWS Account ID is not defined.
            ERROR=true
          fi
          if [ -z "$API_TOKEN_GITHUB" ]
          then
            echo GitHub deploy key access token is not defined.
            ERROR=true
          fi
          if [ -z "$PRIMARY_DOMAIN_NAME" ]
          then
            echo Secret PRIMARY_DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.
            ERROR=true
          fi
          if [ -z "$DOMAIN_NAME" ]
          then
            echo Secret DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.
            ERROR=true
          fi
          # Report every missing secret above before failing, so one run lists them all.
          if [ -n "$ERROR" ]
          then
            echo
            echo This workflow requires some secrets to complete.
            echo Please make sure they are created by adding/rotating them manually.
            exit 1
          fi

  # Creates (or reconciles) one EKS cluster per entry of the deployment matrix.
  # Runs only when the "create and configure cluster" action was selected.
  create-eks-cluster:
    name: Create EKS cluster
    runs-on: ubuntu-20.04
    needs: [check-secrets, load-config]
    if: github.event.inputs.workflow_actions == 'create and configure cluster'
    env:
      # Used by the steps below to build the cluster name (biomage-$CLUSTER_ENV)
      # and the temporary file names.
      CLUSTER_ENV: ${{ matrix.environment.type }}
    strategy:
      # Matrix entries are processed one at a time.
      max-parallel: 1
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
    # Selects the GitHub environment of the current matrix entry.
    environment: ${{ matrix.environment.name }}
    steps:
      # Fetch the repository so the templates under infra/ are available.
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3

      # Assume the ci-iac-role in the target account; credentials last 4800 s (80 min).
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 4800
          aws-region: ${{ secrets.AWS_REGION }}

      # Produces /tmp/cluster-$CLUSTER_ENV.yaml, the eksctl spec used by the
      # following steps, by merging the base cluster template with optional
      # custom-VPC values.
      - id: fill-metadata
        name: Add name and region to the eksctl file.
        run: |-
          export CLUSTER_NAME="biomage-$CLUSTER_ENV"

          # Write the cluster name and region into the base template in place.
          yq -i '
            .metadata.name = strenv(CLUSTER_NAME) |
            .metadata.region = strenv(AWS_REGION)
          ' infra/config/cluster/cluster-template.yaml

          # CELLENICS_VPC_ID is set if using custom cluster deployment. In this case, use the custom template file.
          # If not set, create an empty template file to let eksctl create a new cluster for the deployment.
          if [ -n "$CELLENICS_VPC_ID" ]; then

            # Query each subnet class once, so that the first and second subnet
            # are taken from the same response and are guaranteed to differ.
            # A plain assignment (no `export`) lets a failing aws call abort the step.
            PRIVATE_SUBNETS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false")
            PUBLIC_SUBNETS=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true")

            export PRIVATE_SUBNET_1_ID=$(jq -r '.Subnets[0].SubnetId' <<< "$PRIVATE_SUBNETS")
            export PRIVATE_SUBNET_2_ID=$(jq -r '.Subnets[1].SubnetId' <<< "$PRIVATE_SUBNETS")
            export PUBLIC_SUBNET_1_ID=$(jq -r '.Subnets[0].SubnetId' <<< "$PUBLIC_SUBNETS")
            export PUBLIC_SUBNET_2_ID=$(jq -r '.Subnets[1].SubnetId' <<< "$PUBLIC_SUBNETS")

            # jq prints "null" for a missing array element; stop here instead of
            # writing that string into the cluster spec as a subnet ID.
            for SUBNET_ID in "$PRIVATE_SUBNET_1_ID" "$PRIVATE_SUBNET_2_ID" "$PUBLIC_SUBNET_1_ID" "$PUBLIC_SUBNET_2_ID"; do
              if [ -z "$SUBNET_ID" ] || [ "$SUBNET_ID" = "null" ]; then
                echo "Expected at least two private and two public subnets in the configured VPC."
                exit 1
              fi
            done

            yq '
                .vpc.id = strenv(CELLENICS_VPC_ID) |
                .vpc.subnets.private.private-1 = strenv(PRIVATE_SUBNET_1_ID) |
                .vpc.subnets.private.private-2 = strenv(PRIVATE_SUBNET_2_ID) |
                .vpc.subnets.public.public-1 = strenv(PUBLIC_SUBNET_1_ID) |
                .vpc.subnets.public.public-2 = strenv(PUBLIC_SUBNET_2_ID)
            ' infra/config/cluster/cluster-config-template.yaml > /tmp/cluster-config-values.yaml
          else
            touch /tmp/cluster-config-values.yaml
          fi

          # Deep-merge the base template with the (possibly empty) VPC values.
          yq eval-all '. as $item ireduce ({}; . *d $item)' infra/config/cluster/cluster-template.yaml /tmp/cluster-config-values.yaml > /tmp/cluster-$CLUSTER_ENV.yaml
          cat /tmp/cluster-$CLUSTER_ENV.yaml
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          CELLENICS_VPC_ID: ${{ secrets.CELLENICS_VPC_ID }}

      # Download the latest eksctl release and put it on the PATH.
      - id: install-eksctl
        name: Install eksctl
        run: |-
          # Releases are fetched from the eksctl-io organisation, the same
          # source the configure-cluster job downloads eksctl from.
          curl --silent --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin

      # Runs `eksctl create cluster` with all output mirrored to
      # /tmp/eksctl-$CLUSTER_ENV.log so the next step can inspect it.
      - id: create-clusters
        name: Attempt to create clusters from spec.
        # this step will always pass, irrespective of whether creation was successful or not.
        # this is because the cluster may already exist. we will check for this condition
        # on failure in the next step
        continue-on-error: true
        run: |-
          exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)

          eksctl create cluster -f /tmp/cluster-$CLUSTER_ENV.yaml

          echo "outcome=created" >> $GITHUB_OUTPUT

      # Decides whether a failed `eksctl create cluster` is acceptable: it is only
      # tolerated when the log shows that the cluster stack already exists.
      # Sets the step output `reason` to `already-exists` or `error`.
      - id: check-for-failure
        name: Check for reason of failure if cluster creation failed.
        if: steps.create-clusters.outcome == 'failure'
        run: |-
          # Check if failure was caused by an already exists exception.
          # If not, the job should fail.
          # Any number of matching lines counts: eksctl may print the exception
          # more than once, which must not be mistaken for a different failure.
          if ! grep -q AlreadyExistsException /tmp/eksctl-$CLUSTER_ENV.log
          then
            echo Step failed for reason other than stack already existing.
            echo Job failing...
            echo "reason=error" >> $GITHUB_OUTPUT
            exit 1
          fi

          echo Cluster already exists.
          echo "reason=already-exists" >> $GITHUB_OUTPUT

      # Best-effort addon creation: a failure here does not fail the job.
      # The output is tee'd to the same log file as the cluster creation step,
      # overwriting its previous contents.
      - id: update-addons-for-cluster
        name: Attempt to create addons for cluster.
        continue-on-error: true
        run: |-
          exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)

          eksctl create addon -f /tmp/cluster-$CLUSTER_ENV.yaml

      # Only runs when cluster creation failed because the cluster already exists.
      - id: update-nodegroup
        name: Attempt to update node groups for existing cluster.
        if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
        run: |-
          CLUSTER_SPEC=/tmp/cluster-$CLUSTER_ENV.yaml

          # First create node groups from the spec, then delete the ones
          # selected by --only-missing.
          eksctl create nodegroup --config-file=$CLUSTER_SPEC
          eksctl delete nodegroup --config-file $CLUSTER_SPEC --only-missing --approve

      # note: iam service accounts should really be created from within the helm chart as seen here:
      # https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html
      #
      # Only runs when cluster creation failed because the cluster already exists.
      - id: update-serviceaccounts
        name: Attempt to update IAM service accounts for existing cluster.
        if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
        run: |-
          CLUSTER_SPEC=/tmp/cluster-$CLUSTER_ENV.yaml

          # Associate the OIDC provider, create service accounts from the spec,
          # then delete the ones selected by --only-missing.
          eksctl utils associate-iam-oidc-provider --config-file=$CLUSTER_SPEC --approve
          eksctl create iamserviceaccount --config-file=$CLUSTER_SPEC
          eksctl delete iamserviceaccount --config-file=$CLUSTER_SPEC --only-missing --approve

  # Installs and configures the Kubernetes-side resources on an existing cluster.
  # The condition uses always() so it is still evaluated when create-eks-cluster
  # was skipped: the job runs for the "create and configure cluster" and
  # "configure cluster" actions, provided check-secrets succeeded and
  # create-eks-cluster either succeeded or was skipped.
  configure-cluster:
    name: Configure Kubernetes resources on the EKS cluster
    runs-on: ubuntu-20.04
    needs: [check-secrets, create-eks-cluster, load-config]
    if: always() && (github.event.inputs.workflow_actions == 'create and configure cluster' || github.event.inputs.workflow_actions == 'configure cluster') && (needs.check-secrets.result == 'success') && (needs.create-eks-cluster.result == 'success' || needs.create-eks-cluster.result == 'skipped')
    env:
      # Used by the steps below to build the cluster name (biomage-$CLUSTER_ENV).
      CLUSTER_ENV: ${{ matrix.environment.type }}
      API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
    strategy:
      # Matrix entries are processed one at a time.
      max-parallel: 1
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
    environment: ${{ matrix.environment.name }}
    steps:
      # Fetch the repository so the charts and templates under infra/ are available.
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3

      # Assume the ci-iac-role; later steps read this step's aws-account-id output.
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 4800
          aws-region: ${{ secrets.AWS_REGION }}

      # Point kubectl/helm at the biomage-$CLUSTER_ENV cluster.
      - id: add-kubeconfig
        name: Add k8s config file for existing cluster.
        run: |-
          aws eks update-kubeconfig --name biomage-$CLUSTER_ENV

      # Applies the manifest of the latest metrics-server release (not pinned).
      - id: deploy-metrics-server
        name: Deploy k8s metrics server
        run: |-
          kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml

      # Helm is used by the chart deployments later in this job.
      - id: install-helm
        name: Install Helm
        run: |-
          sudo snap install helm --classic

      # Download the latest eksctl release tarball, unpack it and put the
      # binary on the PATH.
      - id: install-eksctl
        name: Install eksctl
        run: |-
          TARBALL="eksctl_$(uname -s)_amd64.tar.gz"

          curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/$TARBALL"
          # Remove the tarball only if extraction succeeded.
          tar -xzf $TARBALL -C /tmp && rm $TARBALL

          sudo mv /tmp/eksctl /usr/local/bin

      # Creates the IAM policy for the AWS Load Balancer Controller (v2.10.1
      # policy document) and binds it to the controller's service account.
      - id: deploy-load-balancer-role
        name: Deploy permissions for AWS load balancer controller
        run: |-
          POLICY_NAME=AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV

          curl -o iam-policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.10.1/docs/install/iam_policy.json

          # A failure of create-policy (e.g. on a rerun) is deliberately ignored.
          aws iam create-policy --policy-name $POLICY_NAME --policy-document file://iam-policy.json || true

          eksctl create iamserviceaccount \
            --cluster=biomage-$CLUSTER_ENV \
            --namespace=kube-system \
            --name=aws-load-balancer-controller \
            --attach-policy-arn=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:policy/$POLICY_NAME \
            --role-name eksctl-$CLUSTER_ENV-load-balancer-controller-role \
            --override-existing-serviceaccounts \
            --approve

      # we need to retry this due to an active issue with the AWS Load Balancer Controller
      # where there are intermittent failures that are only fixable by retrying
      # see issue at https://github.com/kubernetes-sigs/aws-load-balancer-controller/issues/2071
      #
      # Up to 5 attempts, each limited to 600 s, with a random 5-15 s pause
      # before a retry. The CRDs come from the release-2.10 branch and are
      # applied before the chart is installed/upgraded.
      - id: install-lbc
        name: Deploy AWS Load Balancer Controller
        uses: nick-invision/retry@v2
        with:
          timeout_seconds: 600
          max_attempts: 5
          retry_on: error
          on_retry_command: sleep $(shuf -i 5-15 -n 1)
          command: |-
            helm repo add eks https://aws.github.io/eks-charts
            wget https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/refs/heads/release-2.10/helm/aws-load-balancer-controller/crds/crds.yaml
            kubectl apply -f crds.yaml
            helm repo update
            helm upgrade aws-load-balancer-controller eks/aws-load-balancer-controller \
              --namespace kube-system \
              --set serviceAccount.create=false \
              --set serviceAccount.name=aws-load-balancer-controller \
              --set clusterName=biomage-$CLUSTER_ENV \
              --install --wait

      # Reads the `publicFacing` setting of the current environment from
      # infra/config/github-environments-config.yaml; later steps consume it
      # as steps.platform-public-facing.outputs.result.
      - id: platform-public-facing
        name: Get config for whether platform should be public facing
        uses: mikefarah/yq@master
        with:
          cmd: yq '.[env(ENVIRONMENT_NAME)].publicFacing' 'infra/config/github-environments-config.yaml'
        env:
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}

      # For HMS ACM_CERTIFICATE_ARN_STAGING exists, having different domains for staging and prod
      # so we need to check if it exists, otherwise set it to ACM_CERTIFICATE_ARN
      # same applies for PRIMARY_DOMAIN_NAME_STAGING
      #
      # NOTE: this step must stay ahead of every step that reads
      # steps.setup-domain.outputs.*; the outputs of a step that has not run
      # yet evaluate to an empty string.
      - id: setup-domain
        name: Compile environment-specific domain name
        run: |-
          if [ "${{ matrix.environment.type }}" = "production" ]; then
            PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
            DOMAIN_NAME="${{ secrets.DOMAIN_NAME }}"
            ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
          fi
          if [ "${{ matrix.environment.type }}" = "staging" ]; then
            # Staging-specific values win; fall back to the shared ones when unset.
            PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME_STAGING }}"
            if [ -z "$PRIMARY_DOMAIN_NAME" ]; then
              PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
            fi
            DOMAIN_NAME="${{ secrets.DOMAIN_NAME_STAGING }}"
            ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN_STAGING }}"
            if [ -z "$ACM_CERTIFICATE_ARN" ]; then
              ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
            fi
          fi
          echo "primary-domain-name=$PRIMARY_DOMAIN_NAME" >> $GITHUB_OUTPUT
          echo "domain-name=$DOMAIN_NAME" >> $GITHUB_OUTPUT
          echo "acm-certificate-arn=$ACM_CERTIFICATE_ARN" >> $GITHUB_OUTPUT

      # Installs the aws-elb-503-subscription-endpoint chart with the ACM
      # certificate resolved by setup-domain and the publicFacing setting.
      - id: install-elb-503-subscription-endpoint
        name: Install ELB 503 subscription endpoint
        run: |-
          echo "value of publicFacing: $PUBLIC_FACING"

          # Check that publicFacing is set to true or false
          if [ "$PUBLIC_FACING" != "true" ] && [ "$PUBLIC_FACING" != "false" ]; then
            echo "value of publicFacing in infra/config/github-environments-config.yaml is not set to true or false"
            exit 1
          fi

          # this is needed so SNS does not stop trying to subscribe to not-yet-deployed
          # API staging environments because their endpoints are not yet available.
          helm upgrade aws-elb-503-subscription-endpoint infra/aws-elb-503-subscription-endpoint \
            --namespace default \
            --set clusterEnv=$CLUSTER_ENV \
            --set acmCertificate="$ACM_CERTIFICATE_ARN" \
            --set-string publicFacing="$PUBLIC_FACING" \
            --install --wait
        env:
          PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
          ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}

      # Deploys the CloudFormation stack for the environment's application load balancer.
      - id: deploy-env-loadbalancer
        name: Deploy AWS Application Load Balancer for environment
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }},PublicFacing=${{ steps.platform-public-facing.outputs.result }}"
          name: "biomage-k8s-alb-${{ matrix.environment.type }}"
          template: 'infra/cf-loadbalancer.yaml'
          no-fail-on-empty-changeset: "1"


      # This step should be run only once per deployment. The associated Route 53 records to be created
      # e.g. DOMAIN_NAME and *.DOMAIN_NAME should be deleted before running this step, otherwise it fails.
      # Refer to the new deployment runbook to learn more.
      # - id: deploy-route53
      #   name: Deploy Route 53 DNS records to ELB
      #   uses: aws-actions/aws-cloudformation-github-deploy@v1
      #   with:
      #     parameter-overrides: "Environment=${{ matrix.environment.type }},DNSName=${{ steps.deploy-env-loadbalancer.outputs.DNSName }},HostedZoneId=${{ steps.deploy-env-loadbalancer.outputs.CanonicalHostedZoneID }},PrimaryDomainName=${{ steps.setup-domain.outputs.primary-domain-name }},DomainName=${{ steps.setup-domain.outputs.domain-name }}"
      #     name: "biomage-alb-route53-${{ matrix.environment.type }}"
      #     template: 'infra/cf-route53.yaml'
      #     no-fail-on-empty-changeset: "1"

      # Installs/upgrades the in-repo X-Ray daemon chart, passing the ARN of the
      # xray-daemon-role-$CLUSTER_ENV role as iamRole.
      - id: deploy-xray-daemon
        name: Deploy AWS X-Ray daemon
        run: |-
          helm upgrade "aws-xray-daemon" infra/aws-xray-daemon \
            --namespace default \
            --set iamRole=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/xray-daemon-role-$CLUSTER_ENV \
            --install --wait

      # Installs/upgrades the EBS CSI driver from a pinned chart archive (2.17.2).
      - id: install-ebs-csi-driver
        name: Install AWS EBS Container Storage Interface (CSI) drivers
        run: |-
          helm upgrade \
            aws-ebs-csi-driver https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/download/helm-chart-aws-ebs-csi-driver-2.17.2/aws-ebs-csi-driver-2.17.2.tgz \
            --namespace kube-system \
            --set enableVolumeScheduling=true \
            --set enableVolumeResizing=true \
            --set enableVolumeSnapshot=true \
            --install --wait

      # Installs/upgrades the in-repo biomage-read-only-group chart.
      - id: deploy-read-only-group
        name: Deploy read-only permission definition for cluster
        run: |-
          helm upgrade "biomage-read-only-group" infra/biomage-read-only-group \
            --install --wait

      # Deploys the CloudFormation stack from infra/cf-state-machine-role.yaml.
      # The IAM capabilities are acknowledged explicitly in `capabilities`.
      - id: deploy-state-machine-role
        name: Deploy AWS Step Function (state machine) roles
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }}"
          name: "biomage-state-machine-role-${{ matrix.environment.type }}"
          template: 'infra/cf-state-machine-role.yaml'
          capabilities: 'CAPABILITY_IAM,CAPABILITY_NAMED_IAM'
          no-fail-on-empty-changeset: "1"

      # Clears the existing IAM *user* identity mappings; the admin list is
      # re-added by a later step of this job.
      - id: remove-identitymappings
        name: Remove all previous identity mappings for IAM users
        run: |-
          CLUSTER_NAME=biomage-$CLUSTER_ENV
          USERS_FILE=/tmp/users_to_remove

          # Keep only the mappings that have a user ARN (role mappings have none).
          eksctl get iamidentitymapping --cluster=$CLUSTER_NAME --output=json \
            | jq -r '.[] | select(.userarn != null) | .userarn' > $USERS_FILE

          while IFS= read -r user
          do
            echo "Remove rights of $user"
            eksctl delete iamidentitymapping --cluster=$CLUSTER_NAME --arn $user --all
          done < $USERS_FILE

      # see https://eksctl.io/usage/iam-identity-mappings/
      # Grant login rights to ci-iac-role
      # Maps the role this workflow assumes into the system:masters group.
      - id: add-ci-iac-oidc-cluster-role
        name: Allow the OIDC role to log in to our cluster.
        run: |-
          eksctl create iamidentitymapping \
            --cluster=biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/ci-iac-role \
            --group system:masters \
            --username ci-iac-role

      # SSO access to cluster is only added if accessing AWS and cluster using SSO
      # The step is skipped when the SSO_ROLE secret is empty or undefined.
      - id: allow-sso-roles-to-access-cluster
        env:
          SSO_ROLE: ${{ secrets.SSO_ROLE }}
        if: ${{ env.SSO_ROLE != '' }}
        name: Allow SSO role to log into the cluster
        run: |-
          eksctl create iamidentitymapping \
            --cluster biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/${{ env.SSO_ROLE }} \
            --username sso-cluster-admin \
            --no-duplicate-arns \
            --group system:masters

      # Maps state-machine-role-$CLUSTER_ENV to the state-machine-runner-group group.
      - id: add-state-machine-cluster-role
        name: Grant rights to the state machine IAM role.
        run: |-
          eksctl create iamidentitymapping \
            --cluster=biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/state-machine-role-$CLUSTER_ENV \
            --group state-machine-runner-group \
            --username state-machine-runner

      # NOTE: after updating this step, make sure you apply the updates in other relevant Github Actions workflows
      # The admin user names come from the `admins` list of the current matrix
      # entry, joined with spaces; each one is mapped to system:masters.
      - id: update-identitymapping-admin
        name: Add cluster admin rights to everyone on the admin list.
        run: |-
          echo "Setting cluster admin rights for ${{matrix.environment.name}} in ${{matrix.environment.type}} environment"
          ADMINS="${{ join(matrix.environment.admins, ' ') }}"
          echo $ADMINS
          for user in $ADMINS; do
            echo "Adding cluster admin rights to $user"
            eksctl create iamidentitymapping \
              --cluster=biomage-$CLUSTER_ENV \
              --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:user/$user \
              --group system:masters \
              --username $user
          done

      ###
      ### INSTALL AND CONFIGURE FLUX V2 ###
      ###
      # Reads the `selfSignedCertificate` setting of the current environment from
      # infra/config/github-environments-config.yaml; consumed below as
      # steps.using-self-signed-certificate.outputs.result.
      - id: using-self-signed-certificate
        name: Get config for whether deployment is using self-signed certificate
        uses: mikefarah/yq@master
        with:
          cmd: yq '.[env(ENVIRONMENT_NAME)].selfSignedCertificate' 'infra/config/github-environments-config.yaml'
        env:
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}

      # Writes the account-specific values into infra/config/account-config.yaml,
      # which a later step loads into the account-config ConfigMap.
      - id: fill-account-specific-metadata
        name: Fill in account specific metadata in ConfigMap
        run: |-
          yq -i '
              .myAccount.domainName = strenv(DOMAIN_NAME) |
              .myAccount.region = strenv(AWS_REGION) |
              .myAccount.accountId = strenv(AWS_ACCOUNT_ID) |
              .myAccount.publicFacing = strenv(PUBLIC_FACING) |
              .myAccount.acmCertificate = strenv(ACM_CERTIFICATE_ARN) |
              .myAccount.selfSignedCertificate = strenv(SELF_SIGNED_CERTIFICATE)
            ' infra/config/account-config.yaml

          # The Datadog keys are optional: they are only written when an API key is configured.
          if [[ -n "$DATADOG_API_KEY" ]]
          then
            yq -i '
              .myAccount.datadogAppKey = strenv(DATADOG_APP_KEY) |
              .myAccount.datadogApiKey = strenv(DATADOG_API_KEY)
            ' infra/config/account-config.yaml
          fi

          cat infra/config/account-config.yaml
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
          DOMAIN_NAME: ${{ steps.setup-domain.outputs.domain-name }}
          ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
          PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
          SELF_SIGNED_CERTIFICATE: ${{ steps.using-self-signed-certificate.outputs.result }}
          # Passed through the environment, like every other input of this step,
          # instead of being expanded into the script text.
          DATADOG_API_KEY: ${{ secrets.DATADOG_API_KEY }}
          DATADOG_APP_KEY: ${{ secrets.DATADOG_APP_KEY }}

      # Best effort: the command fails when the namespace already exists, so
      # errors are tolerated.
      - id: create-flux-namespace
        name: Attempt to create flux namespace
        continue-on-error: true
        run: |-
          kubectl create namespace flux-system

      # Renders the ConfigMap manifest locally and pipes it to `kubectl apply`,
      # so the step both creates the ConfigMap and updates an existing one.
      - id: create-account-information-configmap
        name: Create a configmap containing AWS account specific details
        continue-on-error: false
        run: |-
          # --dry-run=client is the explicit form of the deprecated bare --dry-run flag.
          kubectl create configmap account-config --from-file=infra/config/account-config.yaml -n flux-system -o yaml --dry-run=client | kubectl apply -f -

      # Runs the upstream install script with FLUX_VERSION pinned to 2.3.0.
      - id: install-flux-v2
        name: Install flux CLI version 2.3.0
        run: |-
          curl -s https://fluxcd.io/install.sh | sudo FLUX_VERSION=2.3.0 bash

      # Best effort: deleting the secret fails when it does not exist, so
      # errors are tolerated.
      - id: delete-old-flux-github-deploy-key
        name: Attempt to delete previous github flux deploy key
        continue-on-error: true
        run: |-
          kubectl -n flux-system delete secret flux-system

      # Bootstraps Flux on the cluster against the `releases` repository of the
      # repository owner, and exports `flux-full-repo` / `flux-path` through
      # GITHUB_ENV for the steps that push files to that repository.
      - id: install-flux
        name: Install Flux to EKS cluster
        run: |-
          # Refer to https://github.com/fluxcd/flux2/releases
          FLUX_VERSION=v2.3.0
          FLUX_REPO=releases
          FLUX_PATH=deployments/$ENVIRONMENT_NAME-$CLUSTER_ENV
          REPO_FULL_PATH=$GITHUB_REPOSITORY_OWNER/$FLUX_REPO

          echo "flux-full-repo=$REPO_FULL_PATH" >> $GITHUB_ENV
          echo "flux-path=$FLUX_PATH" >> $GITHUB_ENV

          # NOTE: every flag must start with two ASCII hyphens. A typographic
          # dash (e.g. an en dash) is not recognised as a flag prefix.
          args=(
            --version $FLUX_VERSION
            --owner $GITHUB_REPOSITORY_OWNER
            --repository $FLUX_REPO
            --branch master
            --path $FLUX_PATH
            --timeout 40s
            --interval 2m
            --components-extra=image-reflector-controller,image-automation-controller
            --namespace flux-system
            --cluster arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
            --context arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
          )

          # Only staging gets a deploy key with write access.
          if [ "${{ matrix.environment.type }}" = "staging" ]
          then
            echo Flux will be deployed in staging with read and write permissions
            args+=(--read-write-key)
          elif [ "${{ matrix.environment.type }}" = "production" ]
          then
            echo Flux will be deployed in production with read-only permissions
          fi

          flux bootstrap github "${args[@]}"

        env:
          GITHUB_TOKEN: ${{ secrets.API_TOKEN_GITHUB }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}

      # Sets .spec.path in infra/flux/sync.yaml to "./$CLUSTER_ENV" and prints the result.
      - id: fill-in-sync-yaml
        name: Create the sync.yaml file that contains the Kustomization to sync the cluster
        run: |-
          export SPEC_PATH="./$CLUSTER_ENV"
          yq -i '
            .spec.path = strenv(SPEC_PATH)
          ' infra/flux/sync.yaml

          cat infra/flux/sync.yaml

      # Destination repo and folder are the flux-full-repo / flux-path values
      # that the install-flux step wrote to GITHUB_ENV.
      - id: push-sync-yaml
        name: Push the sync.yaml file that was filled in during the previous step
        uses: dmnemec/copy_file_to_another_repo_action@v1.0.4
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        with:
          source_file: infra/flux/sync.yaml
          destination_repo: ${{ env.flux-full-repo }}
          destination_folder: ${{ env.flux-path }}
          user_email: ci@biomage.net
          user_name: 'Biomage CI/CD'

      # Replaces the AWS_ACCOUNT_ID and CLUSTER_ENV placeholders of the template
      # with their values and writes the result to infra/flux/kustomization.yaml.
      - id: fill-kustomization-template
        name: Fill in Kustomization template
        run: |-
          cat infra/flux/kustomization-template.yaml \
            | sed "s/AWS_ACCOUNT_ID/$AWS_ACCOUNT_ID/g" \
            | sed "s/CLUSTER_ENV/$CLUSTER_ENV/g" \
            > infra/flux/kustomization.yaml

          cat infra/flux/kustomization.yaml
        env:
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}

      # Pushed into the flux-system sub-folder of the Flux path exported by install-flux.
      - id: push-kustomization-yaml
        name: Push the kustomization.yaml file to apply our custom config
        uses: dmnemec/copy_file_to_another_repo_action@v1.0.4
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        with:
          source_file: infra/flux/kustomization.yaml
          destination_repo: ${{ env.flux-full-repo }}
          destination_folder: ${{ env.flux-path }}/flux-system
          user_email: ci@biomage.net
          user_name: 'Biomage CI/CD'

      # Installs/upgrades the emberstack reflector chart in the flux-system namespace.
      - id: install-kubernetes-reflector
        name: Install kubernetes reflector
        run: |-
          helm repo add emberstack https://emberstack.github.io/helm-charts
          helm repo update
          helm upgrade --install reflector emberstack/reflector --namespace flux-system

      # Annotates the account-config ConfigMap so that reflector is allowed to
      # mirror it, automatically, into namespaces matching ui-, api-, pipeline-
      # and worker- prefixes.
      - id: add-account-config-configmap-annotations
        name: Add annotations to account-config configmap
        run: |-
          kubectl annotate configmap account-config \
            --overwrite \
            --namespace flux-system \
            reflector.v1.k8s.emberstack.com/reflection-allowed="true" \
            reflector.v1.k8s.emberstack.com/reflection-allowed-namespaces="ui-.*,api-.*,pipeline-.*,worker-.*" \
            reflector.v1.k8s.emberstack.com/reflection-auto-enabled="true"
      ###
      ### END OF INSTALL AND CONFIGURE FLUX V2 ###
      ###

  # Sets up cluster log forwarding. Runs only for the "deploy monitoring"
  # action, provided check-secrets succeeded; always() keeps the condition
  # evaluated even though create-eks-cluster and configure-cluster are
  # skipped for that action.
  deploy-monitoring:
    name: Setup logging and monitoring
    runs-on: ubuntu-20.04
    needs: [check-secrets, create-eks-cluster, configure-cluster, load-config]
    if: always() && (needs.check-secrets.result == 'success') && (github.event.inputs.workflow_actions == 'deploy monitoring')
    env:
      # Used by the steps below to build the cluster name (biomage-$CLUSTER_ENV).
      CLUSTER_ENV: ${{ matrix.environment.type }}
      API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
    strategy:
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix)}}
    environment: ${{ matrix.environment.name }}
    steps:
      # Fetch the repository so the files under infra/cluster-logging are available.
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3

      # Assume the ci-iac-role; credentials last 3600 s (1 hour).
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 3600
          aws-region: ${{ secrets.AWS_REGION }}

      # Point kubectl at the biomage-$CLUSTER_ENV cluster.
      - id: add-kubeconfig
        name: Add k8s config file for existing cluster.
        run: |-
          aws eks update-kubeconfig --name biomage-$CLUSTER_ENV

      # Download the latest eksctl release and put it on the PATH.
      - id: install-eksctl
        name: Install eksctl
        run: |-
          # Releases are fetched from the eksctl-io organisation, the same
          # source the configure-cluster job downloads eksctl from.
          curl --silent --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin

      # Deploys the CloudFormation stack holding the CloudWatch logging policy;
      # later steps read its PolicyARN output.
      - id: setup-cluster-cloudwatch-logging-policy
        name: Setup permissions required for cluster to log to Cloudwatch
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }}"
          name: "cluster-cloudwatch-logging-policy-${{ matrix.environment.type }}"
          template: 'infra/cluster-logging/cf-cluster-log-cloudwatch-policy.yaml'
          no-fail-on-empty-changeset: "1"
          capabilities: "CAPABILITY_IAM,CAPABILITY_NAMED_IAM"

      # Setting up log forwarding for pods hosted in EC2 nodes
      - id: create-fluent-bit-namespace
        name: Create namespace for node FluentBit deployment
        run: kubectl apply -f infra/cluster-logging/node-fluentbit-namespace.yaml

      # Creates the fluent-bit service account in the node-logging namespace and
      # attaches the logging policy to its role (irsa-fluent-bit-$CLUSTER_ENV).
      - id: create-service-account-for-node-fluent-bit
        name: Create service account for node FluentBit
        env:
          LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
        run: |-
          eksctl create iamserviceaccount \
            --name fluent-bit \
            --namespace node-logging \
            --cluster biomage-$CLUSTER_ENV \
            --role-name irsa-fluent-bit-$CLUSTER_ENV \
            --attach-policy-arn $LOGGING_POLICY_ARN \
            --override-existing-serviceaccounts \
            --approve

      # Substitutes the CI_CLUSTER_ENV and CI_AWS_REGION placeholders in every
      # string value of the FluentBit manifest, then applies it to the cluster.
      - id: deploy-node-fluent-bit
        name: Deploy FluentBit for EC2 nodes
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |
          # FluentBit configuration is determined in infra/cluster-logging/node-fluentbit-config.yaml, specifically under [INPUT] > Path
          # We do not want to log everything for costs/security concerns

          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/node-fluentbit-config.yaml
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/node-fluentbit-config.yaml

          kubectl apply -f infra/cluster-logging/node-fluentbit-config.yaml

      # Setting up log forwarding for pods hosted on Fargate nodes
      - id: attach-pod-execution-role-name
        name: Attach logging policy to pod execution role
        env:
          LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
        run: |-
          # Make a failing aws/jq call abort the step instead of silently yielding an empty role name.
          set -o pipefail

          # Pods launched in the same cluster have the same pod execution role, as the pod execution role scope is cluster-wide.
          # See https://eksctl.io/usage/fargate-support/#creating-a-cluster-with-fargate-support
          # Reading the fargate profile of either pipeline or worker in the same cluster therefore returns the same role.

          POD_EXEC_ROLE_ARN=$(aws eks describe-fargate-profile \
            --cluster-name "biomage-$CLUSTER_ENV" \
            --fargate-profile-name pipeline-default | jq -r '.fargateProfile.podExecutionRoleArn // empty')

          # The role name is the last "/"-separated segment of the ARN; anything without a "/"
          # is treated as "no role found".
          POD_EXEC_ROLE_NAME=""
          if [[ "$POD_EXEC_ROLE_ARN" == */* ]]
          then
            POD_EXEC_ROLE_NAME="${POD_EXEC_ROLE_ARN##*/}"
          fi

          if [[ -z "$POD_EXEC_ROLE_NAME" ]]
          then
            echo "Could not determine the pod execution role of fargate profile pipeline-default" >&2
            exit 1
          fi

          aws iam attach-role-policy --role-name "$POD_EXEC_ROLE_NAME" --policy-arn "$LOGGING_POLICY_ARN"

      - id: deploy-fargate-fluent-bit
        name: Deploy FluentBit config for Fargate pods
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |-
          # The FluentBit configuration for Fargate pods lives in this file
          FLUENTBIT_CONFIG=infra/cluster-logging/fargate-fluentbit-config.yaml

          # Substitute the CI_* placeholders in every string value of the config
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" "$FLUENTBIT_CONFIG"
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" "$FLUENTBIT_CONFIG"

          kubectl apply -f "$FLUENTBIT_CONFIG"

      # Setting up Datadog to watch pod metrics for pods hosted on EC2 and Fargate nodes
      - id: setup-datadog-cluster-agent
        name: Setup Datadog cluster agent
        env:
          # The script reads $DATADOG_API_KEY, so the secret is exposed to this step explicitly
          # rather than relying on it being defined elsewhere.
          DATADOG_API_KEY: ${{ secrets.DATADOG_API_KEY }}
        run: |-
          # Install or upgrade the Datadog helm release only when an API key is configured.
          if [[ -n "$DATADOG_API_KEY" ]];
          then
            helm repo add datadog https://helm.datadoghq.com
            helm repo update
            helm upgrade datadog-agent datadog/datadog \
              -f infra/datadog/cluster-agent-values.yaml \
              --set datadog.apiKey="$DATADOG_API_KEY" \
              --set datadog.clusterName="biomage-$CLUSTER_ENV" \
              --install
          else
            echo "Datadog api key missing, skipping datadog setup"
          fi

      - id: setup-datadog-sidecar-permissions
        name: Setup Datadog sidecar permissions
        run: |-
          # Nothing to do when Datadog is not configured for this environment
          if [[ -z "${{ secrets.DATADOG_API_KEY }}" ]]
          then
            exit 0
          fi

          kubectl apply -f infra/datadog/datadog-sidecar-rbac.yaml

      - id: setup-orca-cspm
        name: Setup ORCA CSPM
        # Installs or upgrades the orca-tunnel helm release in the orca-security namespace.
        # Skipped when the ORCA_TUNNEL_ID secret is not configured.
        env:
          # Secrets are handed over as environment variables rather than being pasted into the
          # script text, so characters such as quotes, $ or backticks cannot alter the script.
          ORCA_TUNNEL_ID: ${{ secrets.ORCA_TUNNEL_ID }}
          ORCA_TUNNEL_TOKEN: ${{ secrets.ORCA_TUNNEL_TOKEN }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |-
          if [[ -n "$ORCA_TUNNEL_ID" ]];
          then
            helm upgrade --install orca-tunnel \
              --namespace orca-security --create-namespace \
              oci://public.ecr.aws/orcasecurity/helm-k8s-tunnel \
              --set tunnelAddr=tunnel.production.us-east-1.orcasecurity.net \
              --set tunnelId="$ORCA_TUNNEL_ID" \
              --set tunnelToken="$ORCA_TUNNEL_TOKEN" \
              --set clusterName="biomage-$CLUSTER_ENV" \
              --set cloudVendorId="$AWS_ACCOUNT_ID" \
              --set region="$AWS_REGION" \
              --set clusterType=eks
          else
            echo "ORCA_TUNNEL_ID missing, skipping ORCA CSPM setup."
          fi

      # Logs in to Amazon ECR. The `registry` output of this step is read by
      # setup-falcon-sensor as ECR_REGISTRY.
      - id: login-ecr
        name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@v1

      - id: create-falcon-ecr-registries
        name: Create an ECR repositories for the Falcon Sensor (if needed)
        # Creation fails for a repository that already exists, which is fine. If there is some
        # other error, the `push` step will fail instead.
        continue-on-error: true
        run: |-
          if [[ -n "${{ secrets.FALCON_CID }}" ]];
          then
            # The default shell runs as `bash -e`, so each repository is attempted on its own:
            # an "already exists" failure for the first one must not skip the second one.
            for repository in falcon-container/falcon-sensor falcon-sensor/falcon-sensor
            do
              aws ecr create-repository --repository-name "$repository" --image-tag-mutability MUTABLE \
                || echo "Could not create ECR repository $repository (it may already exist), continuing"
            done
          else
            echo "CrowdStrike CID missing, not creating falcon sensor repos"
          fi

      - id: create-falcon-namespace
        name: Attempt to create falcon namespace
        # Best effort: a failure of this step does not fail the job.
        continue-on-error: true
        run: |-
          if [[ -z "${{ secrets.FALCON_CID }}" ]]
          then
            echo "CrowdStrike CID missing, not creating falcon namespace"
          else
            kubectl create namespace falcon-system
          fi

      - id: setup-falcon-sensor
        name: Setup Falcon Sensor
        # Copies the CrowdStrike Falcon images into ECR and installs three helm releases:
        # the container sensor (Fargate), the node sensor (EC2) and the Kubernetes protection agent.
        # Skipped entirely when the FALCON_CID secret is not configured.
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          # Secrets are handed over as environment variables rather than being pasted into the
          # script text, so characters such as quotes, $ or backticks cannot alter the script.
          # API client ID and secret, and the CID; these are also visible to the pull script
          # through its environment.
          FALCON_CLIENT_ID: ${{ secrets.FALCON_CLIENT_ID }}
          FALCON_CLIENT_SECRET: ${{ secrets.FALCON_CLIENT_SECRET }}
          FALCON_CID: ${{ secrets.FALCON_CID }}
          # Values used only for the Kubernetes protection agent. They get KPA_* names so that
          # they cannot influence what the pull script picks up from its environment.
          KPA_FALCON_ENV: ${{ secrets.FALCON_REGION }}
          KPA_FALCON_CCID: ${{ secrets.FALCON_CCID }}
          KPA_DOCKER_API_TOKEN: ${{ secrets.FALCON_DOCKER_API_TOKEN }}
        run: |-
          if [[ -n "$FALCON_CID" ]];
          then

            # URL of falcon-container-sensor-pull.sh
            # NOTE(review): this tracks the `main` branch, so the script that runs with the API
            # credentials can change without notice - consider pinning a tag or commit.
            PULL_SCRIPT_URL="https://raw.githubusercontent.com/CrowdStrike/falcon-scripts/main/bash/containers/falcon-container-sensor-pull/falcon-container-sensor-pull.sh"

            # Download the pull script from GitHub and save it to the current directory
            # --fail - Abort on HTTP errors instead of saving the error page as the script
            # --silent - Suppresses progress output
            # --show-error - Still prints a message when the download fails
            # --remote-name - Keeps the original filename when saving
            # --location - Follow redirects
            curl --fail --silent --show-error --remote-name --location "$PULL_SCRIPT_URL"

            # make script executable
            chmod +x falcon-container-sensor-pull.sh

            # download latest version of the Falcon Container (for fargate) and copy it to ECR
            ./falcon-container-sensor-pull.sh \
              --client-id "${FALCON_CLIENT_ID}" \
              --client-secret "${FALCON_CLIENT_SECRET}" \
              --type falcon-container \
              --copy "${ECR_REGISTRY}/falcon-container"

            # download latest version of the Falcon Node Sensor (for EC2) and copy it to ECR
            ./falcon-container-sensor-pull.sh \
              --client-id "${FALCON_CLIENT_ID}" \
              --client-secret "${FALCON_CLIENT_SECRET}" \
              --type falcon-sensor \
              --copy "${ECR_REGISTRY}/falcon-sensor"

            # functions to get image names for helm

            # Prints the image path the pull script reports for the given sensor type ($1)
            get_image_path() {
                local container_type=$1
                ./falcon-container-sensor-pull.sh \
                  --client-id "${FALCON_CLIENT_ID}" \
                  --client-secret "${FALCON_CLIENT_SECRET}" \
                  --type "${container_type}" \
                  --get-image-path
            }

            # Prints the second ':'-separated field of that image path; it is used as the helm image tag
            get_image_name() {
                local container_type=$1
                local image_path
                image_path=$(get_image_path "$container_type")

                echo "$image_path" | awk -F':' '{print $2}'
            }

            FALCON_CONTAINER_IMAGE_TAG=$(get_image_name "falcon-container")
            FALCON_SENSOR_IMAGE_TAG=$(get_image_name "falcon-sensor")

            # Refuse to deploy with an empty tag: it would not match the image copied to ECR above
            if [[ -z "$FALCON_CONTAINER_IMAGE_TAG" || -z "$FALCON_SENSOR_IMAGE_TAG" ]]
            then
              echo "Could not determine the Falcon image tags from the pull script output" >&2
              exit 1
            fi

            # install container sensor (for fargate) into a customized namespace
            helm repo add crowdstrike https://crowdstrike.github.io/falcon-helm
            helm repo update
            helm upgrade --install falcon-container-helm crowdstrike/falcon-sensor \
              -n falcon-container-system --create-namespace \
              --set node.enabled=false \
              --set container.enabled=true \
              --set falcon.cid="$FALCON_CID" \
              --set container.image.repository="${ECR_REGISTRY}/falcon-container/falcon-sensor" \
              --set container.image.tag="$FALCON_CONTAINER_IMAGE_TAG"

            # install node sensor (for ec2) with different release name (falcon-sensor-helm)
            helm upgrade --install falcon-sensor-helm crowdstrike/falcon-sensor \
              -n falcon-sensor-system --create-namespace \
              --set falcon.cid="$FALCON_CID" \
              --set node.image.repository="${ECR_REGISTRY}/falcon-sensor/falcon-sensor" \
              --set node.image.tag="$FALCON_SENSOR_IMAGE_TAG"

            # install KPA (kubernetes protection agent)
            helm upgrade --install kpagent crowdstrike/cs-k8s-protection-agent \
              -n falcon-kubernetes-protection --create-namespace \
              --set image.repository="registry.crowdstrike.com/kubernetes_protection/kpagent" \
              --set image.tag="0.2117.0" \
              --set crowdstrikeConfig.clientID="$FALCON_CLIENT_ID" \
              --set crowdstrikeConfig.clientSecret="$FALCON_CLIENT_SECRET" \
              --set crowdstrikeConfig.clusterName="arn:aws:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/biomage-${CLUSTER_ENV}" \
              --set crowdstrikeConfig.env="$KPA_FALCON_ENV" \
              --set crowdstrikeConfig.cid="$KPA_FALCON_CCID" \
              --set crowdstrikeConfig.dockerAPIToken="$KPA_DOCKER_API_TOKEN"

          else
            echo "CrowdStrike CID missing, skipping falcon sensor setup"
          fi

  # Sends a Slack notification when the workflow failed on the master branch.
  report-if-failed:
    name: Report if workflow failed
    needs:
      - load-config
      - check-secrets
      - create-eks-cluster
      - configure-cluster
      - deploy-monitoring
    if: failure() && github.ref == 'refs/heads/master'
    runs-on: ubuntu-20.04
    steps:
      - id: send-to-slack
        name: Send failure notification to Slack on failure
        uses: voxmedia/github-action-slack-notify-build@v1
        with:
          channel: workflow-failures
          status: FAILED
          color: danger
        env:
          SLACK_BOT_TOKEN: ${{ secrets.WORKFLOW_STATUS_BOT_TOKEN }}