Deploy Cellenics infrastructure on AWS #598

Workflow file for this run

name: Deploy Cellenics infrastructure on AWS
on:
workflow_dispatch:
inputs:
environment_name:
type: string
description: Select the environment name to run the actions on
required: true
default: all
workflow_actions:
type: choice
description: Select actions to perform
options:
- create and configure cluster
- configure cluster
- deploy monitoring
default: configure cluster
environment_type:
type: choice
description: Select environment type
options:
- staging
- production
- staging and production
default: staging
# this ensures that only one CI pipeline with the same key
# can run at once in order to prevent undefined states
concurrency: cluster-update-mutex
permissions:
id-token: write
contents: read
# After load-config and check-secrets jobs are finished:
# "create and configure cluster" workflow_actions option runs all jobs.
# "configure cluster" workflow_actions option runs only configure-cluster job
# "deploy monitoring" workflow_actions option runs only deploy-monitoring job
jobs:
load-config:
uses: ./.github/workflows/load-config.yaml
with:
environment_name: ${{ github.event.inputs.environment_name }}
environment_type: ${{ github.event.inputs.environment_type }}
check-secrets:
name: Check that sufficient secrets are specified for environment name
runs-on: ubuntu-20.04
needs: load-config
strategy:
matrix:
environment_name: ${{ fromJson(needs.load-config.outputs.environment_names) }}
environment: ${{ matrix.environment_name }}
steps:
- id: check-secrets-for-environment
name: Check if necessary secrets are installed.
run: |-
echo Checking if secrets are defined in the repository.
if [ -z "${{ secrets.ACM_CERTIFICATE_ARN}}" ]
then
echo AWS certificate ARN is not defined.
ERROR=true
fi
if [ -z "${{ secrets.AWS_ACCOUNT_ID }}" ]
then
echo AWS Account ID is not defined.
ERROR=true
fi
if [ -z "${{ secrets.API_TOKEN_GITHUB }}" ]
then
echo GitHub deploy key access token is not defined.
ERROR=true
fi
if [ -z "${{ secrets.PRIMARY_DOMAIN_NAME }}" ]
then
echo Secret PRIMARY_DOMAIN_NAME is not defined. Make sure it exists in the repository secrets.
ERROR=true
fi
if [ -z "${{ secrets.DOMAIN_NAME }}" ]
then
echo Secret DOMAIN_NAME is not defined. Make sure it exists in the repository secrets.
ERROR=true
fi
if [ -n "$ERROR" ]
then
echo
echo This workflow requires some secrets to complete.
echo Please make sure they are created by adding/rotating them manually.
exit 1
fi
create-eks-cluster:
name: Create EKS cluster
runs-on: ubuntu-20.04
needs: [check-secrets, load-config]
if: github.event.inputs.workflow_actions == 'create and configure cluster'
env:
CLUSTER_ENV: ${{ matrix.environment.type }}
strategy:
max-parallel: 1
matrix:
environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
environment: ${{ matrix.environment.name }}
steps:
- id: checkout
name: Check out source code
uses: actions/checkout@v3
- id: setup-aws
name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
role-duration-seconds: 4800
aws-region: ${{ secrets.AWS_REGION }}
- id: fill-metadata
name: Add name and region to the eksctl file.
run: |-
export CLUSTER_NAME="biomage-$CLUSTER_ENV"
yq -i '
.metadata.name = strenv(CLUSTER_NAME) |
.metadata.region = strenv(AWS_REGION)
' infra/config/cluster/cluster-template.yaml
# CELLENICS_VPC_ID is set if using custom cluster deployment. In this case, use the custom template file.
# If not set, create an empty template file to let eksctl create a new cluster for the deployment.
if [ ! -z "$CELLENICS_VPC_ID" ]; then
export PRIVATE_SUBNET_1_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false" | jq -r '.Subnets[0].SubnetId')
export PRIVATE_SUBNET_2_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false" | jq -r '.Subnets[1].SubnetId')
export PUBLIC_SUBNET_1_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true" | jq -r '.Subnets[0].SubnetId')
export PUBLIC_SUBNET_2_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true" | jq -r '.Subnets[1].SubnetId')
yq '
.vpc.id = strenv(CELLENICS_VPC_ID) |
.vpc.subnets.private.private-1 = strenv(PRIVATE_SUBNET_1_ID) |
.vpc.subnets.private.private-2 = strenv(PRIVATE_SUBNET_2_ID) |
.vpc.subnets.public.public-1 = strenv(PUBLIC_SUBNET_1_ID) |
.vpc.subnets.public.public-2 = strenv(PUBLIC_SUBNET_2_ID)
' infra/config/cluster/cluster-config-template.yaml > /tmp/cluster-config-values.yaml
else
touch /tmp/cluster-config-values.yaml
fi
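# Deep-merge the base template with the (possibly empty) VPC overrides into the final cluster spec.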
yq eval-all '. as $item ireduce ({}; . *d $item)' infra/config/cluster/cluster-template.yaml /tmp/cluster-config-values.yaml > /tmp/cluster-$CLUSTER_ENV.yaml
cat /tmp/cluster-$CLUSTER_ENV.yaml
env:
AWS_REGION: ${{ secrets.AWS_REGION }}
CELLENICS_VPC_ID: ${{ secrets.CELLENICS_VPC_ID }}
- id: install-eksctl
name: Install eksctl
run: |-
curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin
- id: create-clusters
name: Attempt to create clusters from spec.
# this step always succeeds, irrespective of whether creation was successful or not.
# this is because the cluster may already exist; we check for that condition
# on failure in the next step
continue-on-error: true
run: |-
exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)
eksctl create cluster -f /tmp/cluster-$CLUSTER_ENV.yaml
echo "outcome=created" >> $GITHUB_OUTPUT
- id: check-for-failure
name: Check for reason of failure if cluster creation failed.
if: steps.create-clusters.outcome == 'failure'
run: |-
# Check if failure was caused by an already exists exception.
# If not, the job should fail.
ALREADY_EXISTS=$(grep AlreadyExistsException /tmp/eksctl-$CLUSTER_ENV.log | wc -l | xargs)
if [ $ALREADY_EXISTS -ne 1 ]
then
echo Step failed for reason other than stack already existing.
echo Job failing...
echo "reason=error" >> $GITHUB_OUTPUT
false
fi
echo Cluster already exists.
echo "reason=already-exists" >> $GITHUB_OUTPUT
- id: update-addons-for-cluster
name: Attempt to create addons for cluster.
continue-on-error: true
run: |-
exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)
eksctl create addon -f /tmp/cluster-$CLUSTER_ENV.yaml
- id: update-nodegroup
name: Attempt to update node groups for existing cluster.
if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
run: |-
eksctl create nodegroup --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
eksctl delete nodegroup --config-file /tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve
# note: iam service accounts should really be created from within the helm chart as seen here:
# https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html
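# For reference, a minimal sketch of that chart-managed approach: the chart renders a
# ServiceAccount annotated with the IAM role to assume (the name and ARN below are
# illustrative placeholders, not values used by this workflow):
#
#   apiVersion: v1
#   kind: ServiceAccount
#   metadata:
#     name: my-app
#     namespace: default
#     annotations:
#       eks.amazonaws.com/role-arn: arn:aws:iam::111122223333:role/my-app-irsa-role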
- id: update-serviceaccounts
name: Attempt to update IAM service accounts for existing cluster.
if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
run: |-
eksctl utils associate-iam-oidc-provider --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --approve
eksctl create iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
eksctl delete iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve
configure-cluster:
name: Configure Kubernetes resources on the EKS cluster
runs-on: ubuntu-20.04
needs: [check-secrets, create-eks-cluster, load-config]
if: always() && (github.event.inputs.workflow_actions == 'create and configure cluster' || github.event.inputs.workflow_actions == 'configure cluster') && (needs.check-secrets.result == 'success') && (needs.create-eks-cluster.result == 'success' || needs.create-eks-cluster.result == 'skipped')
env:
CLUSTER_ENV: ${{ matrix.environment.type }}
API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
strategy:
max-parallel: 1
matrix:
environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
environment: ${{ matrix.environment.name }}
steps:
- id: checkout
name: Check out source code
uses: actions/checkout@v3
- id: setup-aws
name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
role-duration-seconds: 4800
aws-region: ${{ secrets.AWS_REGION }}
- id: add-kubeconfig
name: Add k8s config file for existing cluster.
run: |-
aws eks update-kubeconfig --name biomage-$CLUSTER_ENV
- id: deploy-metrics-server
name: Deploy k8s metrics server
run: |-
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
- id: install-helm
name: Install Helm
run: |-
sudo snap install helm --classic
- id: install-eksctl
name: Install eksctl
run: |-
ARCH=amd64
PLATFORM=$(uname -s)_$ARCH
curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz"
tar -xzf eksctl_$PLATFORM.tar.gz -C /tmp && rm eksctl_$PLATFORM.tar.gz
sudo mv /tmp/eksctl /usr/local/bin
- id: deploy-load-balancer-role
name: Deploy permissions for AWS load balancer controller
run: |-
curl -o iam-policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.10.1/docs/install/iam_policy.json
aws iam create-policy \
--policy-name AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
--policy-document file://iam-policy.json || true
eksctl create iamserviceaccount \
--cluster=biomage-$CLUSTER_ENV \
--namespace=kube-system \
--name=aws-load-balancer-controller \
--attach-policy-arn=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:policy/AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
--role-name eksctl-$CLUSTER_ENV-load-balancer-controller-role \
--override-existing-serviceaccounts \
--approve
# we need to retry this due to an active issue with the AWS Load Balancer Controller
# where there are intermittent failures that are only fixable by retrying
# see issue at https://github.com/kubernetes-sigs/aws-load-balancer-controller/issues/2071
- id: install-lbc
name: Deploy AWS Load Balancer Controller
uses: nick-invision/retry@v2
with:
timeout_seconds: 600
max_attempts: 5
retry_on: error
on_retry_command: sleep $(shuf -i 5-15 -n 1)
command: |-
helm repo add eks https://aws.github.io/eks-charts
wget https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml
kubectl apply -f crds.yaml
helm repo update
helm upgrade aws-load-balancer-controller eks/aws-load-balancer-controller \
--namespace kube-system \
--set serviceAccount.create=false \
--set serviceAccount.name=aws-load-balancer-controller \
--set clusterName=biomage-$CLUSTER_ENV \
--install --wait
- id: platform-public-facing
name: Get config for whether platform should be public facing
uses: mikefarah/yq@master
with:
cmd: yq '.[env(ENVIRONMENT_NAME)].publicFacing' 'infra/config/github-environments-config.yaml'
env:
ENVIRONMENT_NAME: ${{ matrix.environment.name }}
- id: install-elb-503-subscription-endpoint
name: Install ELB 503 subscription endpoint
run: |-
echo "value of publicFacing: $PUBLIC_FACING"
# Check that publicFacing is set to true or false
if [ "$PUBLIC_FACING" != "true" ] && [ "$PUBLIC_FACING" != "false" ]; then
echo "value of publicFacing in infra/config/github-environments-config.yaml is not set to true or false"
exit 1
fi
# this is needed so SNS does not stop trying to subscribe to not-yet-deployed
# API staging environments because their endpoints are not yet available.
helm upgrade aws-elb-503-subscription-endpoint infra/aws-elb-503-subscription-endpoint \
--namespace default \
--set clusterEnv=$CLUSTER_ENV \
--set acmCertificate="$ACM_CERTIFICATE_ARN" \
--set-string publicFacing="$PUBLIC_FACING" \
--install --wait
env:
PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
- id: deploy-env-loadbalancer
name: Deploy AWS Application Load Balancer for environment
uses: aws-actions/aws-cloudformation-github-deploy@v1
with:
parameter-overrides: "Environment=${{ matrix.environment.type }},PublicFacing=${{ steps.platform-public-facing.outputs.result }}"
name: "biomage-k8s-alb-${{ matrix.environment.type }}"
template: 'infra/cf-loadbalancer.yaml'
no-fail-on-empty-changeset: "1"
# For HMS, ACM_CERTIFICATE_ARN_STAGING exists because staging and production use different domains,
# so we check whether it is set and fall back to ACM_CERTIFICATE_ARN otherwise.
# The same applies to PRIMARY_DOMAIN_NAME_STAGING.
- id: setup-domain
name: Compile environment-specific domain name
run: |-
if [ "${{ matrix.environment.type }}" = "production" ]; then
PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
DOMAIN_NAME="${{ secrets.DOMAIN_NAME }}"
ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
fi
if [ "${{ matrix.environment.type }}" = "staging" ]; then
PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME_STAGING }}"
if [ -z "$PRIMARY_DOMAIN_NAME" ]; then
PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
fi
DOMAIN_NAME="${{ secrets.DOMAIN_NAME_STAGING }}"
ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN_STAGING }}"
if [ -z "$ACM_CERTIFICATE_ARN" ]; then
ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
fi
fi
echo "primary-domain-name=$PRIMARY_DOMAIN_NAME" >> $GITHUB_OUTPUT
echo "domain-name=$DOMAIN_NAME" >> $GITHUB_OUTPUT
echo "acm-certificate-arn=$ACM_CERTIFICATE_ARN" >> $GITHUB_OUTPUT
# This step should be run only once per deployment. The Route 53 records it creates,
# e.g. DOMAIN_NAME and *.DOMAIN_NAME, must be deleted before running this step again, otherwise it fails.
# Refer to the new deployment runbook to learn more.
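# A hedged sketch of how to check for leftover records before re-enabling the step below
# (the hosted zone ID is a placeholder):
#
#   aws route53 list-resource-record-sets \
#     --hosted-zone-id Z0123456789EXAMPLE \
#     --query "ResourceRecordSets[?contains(Name, 'DOMAIN_NAME')]"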
# - id: deploy-route53
# name: Deploy Route 53 DNS records to ELB
# uses: aws-actions/aws-cloudformation-github-deploy@v1
# with:
# parameter-overrides: "Environment=${{ matrix.environment.type }},DNSName=${{ steps.deploy-env-loadbalancer.outputs.DNSName }},HostedZoneId=${{ steps.deploy-env-loadbalancer.outputs.CanonicalHostedZoneID }},PrimaryDomainName=${{ steps.setup-domain.outputs.primary-domain-name }},DomainName=${{ steps.setup-domain.outputs.domain-name }}"
# name: "biomage-alb-route53-${{ matrix.environment.type }}"
# template: 'infra/cf-route53.yaml'
# no-fail-on-empty-changeset: "1"
- id: deploy-xray-daemon
name: Deploy AWS X-Ray daemon
run: |-
helm upgrade "aws-xray-daemon" infra/aws-xray-daemon \
--namespace default \
--set iamRole=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/xray-daemon-role-$CLUSTER_ENV \
--install --wait
- id: install-ebs-csi-driver
name: Install AWS EBS Container Storage Interface (CSI) drivers
run: |-
helm upgrade \
aws-ebs-csi-driver https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/download/helm-chart-aws-ebs-csi-driver-2.17.2/aws-ebs-csi-driver-2.17.2.tgz \
--namespace kube-system \
--set enableVolumeScheduling=true \
--set enableVolumeResizing=true \
--set enableVolumeSnapshot=true \
--install --wait
- id: deploy-read-only-group
name: Deploy read-only permission definition for cluster
run: |-
helm upgrade "biomage-read-only-group" infra/biomage-read-only-group \
--install --wait
- id: deploy-state-machine-role
name: Deploy AWS Step Function (state machine) roles
uses: aws-actions/aws-cloudformation-github-deploy@v1
with:
parameter-overrides: "Environment=${{ matrix.environment.type }}"
name: "biomage-state-machine-role-${{ matrix.environment.type }}"
template: 'infra/cf-state-machine-role.yaml'
capabilities: 'CAPABILITY_IAM,CAPABILITY_NAMED_IAM'
no-fail-on-empty-changeset: "1"
- id: remove-identitymappings
name: Remove all previous identity mappings for IAM users
run: |-
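# Collect the ARNs of every IAM user currently mapped into the cluster so the stale mappings can be removed below.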
eksctl get iamidentitymapping --cluster=biomage-$CLUSTER_ENV --output=json | \
jq -r '.[] | select(.userarn != null) | .userarn' > /tmp/users_to_remove
while IFS= read -r user
do
echo "Remove rights of $user"
eksctl delete iamidentitymapping \
--cluster=biomage-$CLUSTER_ENV \
--arn $user \
--all
done < "/tmp/users_to_remove"
# see https://eksctl.io/usage/iam-identity-mappings/
# Grant login rights to ci-iac-role
- id: add-ci-iac-oidc-cluster-role
name: Allow the OIDC role to log in to our cluster.
run: |-
eksctl create iamidentitymapping \
--cluster=biomage-$CLUSTER_ENV \
--arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/ci-iac-role \
--group system:masters \
--username ci-iac-role
# SSO access to cluster is only added if accessing AWS and cluster using SSO
- id: allow-sso-roles-to-access-cluster
env:
SSO_ROLE: ${{ secrets.SSO_ROLE }}
if: ${{ env.SSO_ROLE != '' }}
name: Allow SSO role to log into the cluster
run: |-
eksctl create iamidentitymapping \
--cluster biomage-$CLUSTER_ENV \
--arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/${{ env.SSO_ROLE }} \
--username sso-cluster-admin \
--no-duplicate-arns \
--group system:masters
- id: add-state-machine-cluster-role
name: Grant rights to the state machine IAM role.
run: |-
eksctl create iamidentitymapping \
--cluster=biomage-$CLUSTER_ENV \
--arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/state-machine-role-$CLUSTER_ENV \
--group state-machine-runner-group \
--username state-machine-runner
# NOTE: after updating this step, make sure you apply the updates in other relevant GitHub Actions workflows
- id: update-identitymapping-admin
name: Add cluster admin rights to everyone on the admin list.
run: |-
echo "Setting cluster admin rights for ${{matrix.environment.name}} in ${{matrix.environment.type}} environment"
ADMINS="${{ join(matrix.environment.admins, ' ') }}"
echo $ADMINS
for user in $ADMINS; do
echo "Adding cluster admin rights to $user"
eksctl create iamidentitymapping \
--cluster=biomage-$CLUSTER_ENV \
--arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:user/$user \
--group system:masters \
--username $user
done
###
### INSTALL AND CONFIGURE FLUX V2 ###
###
- id: using-self-signed-certificate
name: Get config for whether deployment is using self-signed certificate
uses: mikefarah/yq@master
with:
cmd: yq '.[env(ENVIRONMENT_NAME)].selfSignedCertificate' 'infra/config/github-environments-config.yaml'
env:
ENVIRONMENT_NAME: ${{ matrix.environment.name }}
- id: fill-account-specific-metadata
name: Fill in account specific metadata in ConfigMap
run: |-
yq -i '
.myAccount.domainName = strenv(DOMAIN_NAME) |
.myAccount.region = strenv(AWS_REGION) |
.myAccount.accountId = strenv(AWS_ACCOUNT_ID) |
.myAccount.publicFacing = strenv(PUBLIC_FACING) |
.myAccount.acmCertificate = strenv(ACM_CERTIFICATE_ARN) |
.myAccount.selfSignedCertificate = strenv(SELF_SIGNED_CERTIFICATE)
' infra/config/account-config.yaml
if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]]
then
export DATADOG_API_KEY="${{ secrets.DATADOG_API_KEY }}"
export DATADOG_APP_KEY="${{ secrets.DATADOG_APP_KEY }}"
yq -i '
.myAccount.datadogAppKey = strenv(DATADOG_APP_KEY) |
.myAccount.datadogApiKey = strenv(DATADOG_API_KEY)
' infra/config/account-config.yaml
fi
cat infra/config/account-config.yaml
env:
AWS_REGION: ${{ secrets.AWS_REGION }}
AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
DOMAIN_NAME: ${{ steps.setup-domain.outputs.domain-name }}
ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
SELF_SIGNED_CERTIFICATE: ${{ steps.using-self-signed-certificate.outputs.result }}
- id: create-flux-namespace
name: Attempt to create flux namespace
continue-on-error: true
run: |-
kubectl create namespace flux-system
- id: create-account-information-configmap
name: Create a configmap containing AWS account specific details
continue-on-error: false
run: |-
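# Render the ConfigMap locally (client-side dry run) and pipe it to apply so re-runs update it in place.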
kubectl create configmap account-config --from-file=infra/config/account-config.yaml -n flux-system -o yaml --dry-run=client | kubectl apply -f -
- id: install-flux-v2
name: Install flux CLI version 2.3.0
run: |-
curl -s https://fluxcd.io/install.sh | sudo FLUX_VERSION=2.3.0 bash
- id: delete-old-flux-github-deploy-key
name: Attempt to delete previous github flux deploy key
continue-on-error: true
run: |-
kubectl -n flux-system delete secret flux-system
- id: install-flux
name: Install Flux to EKS cluster
run: |-
# Refer to https://github.com/fluxcd/flux2/releases
FLUX_VERSION=v2.3.0
FLUX_REPO=releases
FLUX_PATH=deployments/$ENVIRONMENT_NAME-$CLUSTER_ENV
REPO_FULL_PATH=$GITHUB_REPOSITORY_OWNER/$FLUX_REPO
echo "flux-full-repo=$(echo $REPO_FULL_PATH)" >> $GITHUB_ENV
echo "flux-path=$(echo $FLUX_PATH)" >> $GITHUB_ENV
args=(
--version $FLUX_VERSION
--owner $GITHUB_REPOSITORY_OWNER
--repository $FLUX_REPO
--branch master
--path $FLUX_PATH
--timeout 40s
--interval 2m
--components-extra=image-reflector-controller,image-automation-controller
--namespace flux-system
--cluster arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
--context arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
)
if [ "${{ matrix.environment.type }}" = "staging" ]
then
echo Flux will be deployed in staging with read and write permissions
args+=(--read-write-key)
elif [ "${{ matrix.environment.type }}" = "production" ]
then
echo Flux will be deployed in production with read-only permissions
fi
flux bootstrap github "${args[@]}"
env:
GITHUB_TOKEN: ${{ secrets.API_TOKEN_GITHUB }}
AWS_REGION: ${{ secrets.AWS_REGION }}
AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
ENVIRONMENT_NAME: ${{ matrix.environment.name }}
- id: fill-in-sync-yaml
name: Create the sync.yaml file that contains the Kustomization to sync the cluster
run: |-
export SPEC_PATH="./$CLUSTER_ENV"
yq -i '
.spec.path = strenv(SPEC_PATH)
' infra/flux/sync.yaml
cat infra/flux/sync.yaml
- id: push-sync-yaml
name: Push the sync.yaml file that was filled in during the previous step
uses: dmnemec/[email protected]
env:
API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
with:
source_file: infra/flux/sync.yaml
destination_repo: ${{ env.flux-full-repo }}
destination_folder: ${{ env.flux-path }}
user_email: [email protected]
user_name: 'Biomage CI/CD'
- id: fill-kustomization-template
name: Fill in Kustomization template
run: |-
cat infra/flux/kustomization-template.yaml \
| sed "s/AWS_ACCOUNT_ID/$AWS_ACCOUNT_ID/g" \
| sed "s/CLUSTER_ENV/$CLUSTER_ENV/g" \
> infra/flux/kustomization.yaml
cat infra/flux/kustomization.yaml
env:
AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
- id: push-kustomization-yaml
name: Push the kustomization.yaml file to apply our custom config
uses: dmnemec/[email protected]
env:
API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
with:
source_file: infra/flux/kustomization.yaml
destination_repo: ${{ env.flux-full-repo }}
destination_folder: ${{ env.flux-path }}/flux-system
user_email: [email protected]
user_name: 'Biomage CI/CD'
- id: install-kubernetes-reflector
name: Install kubernetes reflector
run: |-
helm repo add emberstack https://emberstack.github.io/helm-charts
helm repo update
helm upgrade --install reflector emberstack/reflector --namespace flux-system
- id: add-account-config-configmap-annotations
name: Add annotations to account-config configmap
run: |-
kubectl annotate configmap account-config \
--overwrite \
--namespace flux-system \
reflector.v1.k8s.emberstack.com/reflection-allowed="true" \
reflector.v1.k8s.emberstack.com/reflection-allowed-namespaces="ui-.*,api-.*,pipeline-.*,worker-.*" \
reflector.v1.k8s.emberstack.com/reflection-auto-enabled="true"
###
### END OF INSTALL AND CONFIGURE FLUX V2 ###
###
deploy-monitoring:
name: Setup logging and monitoring
runs-on: ubuntu-20.04
needs: [check-secrets, create-eks-cluster, configure-cluster, load-config]
if: always() && (needs.check-secrets.result == 'success') && (github.event.inputs.workflow_actions == 'deploy monitoring')
env:
CLUSTER_ENV: ${{ matrix.environment.type }}
API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
strategy:
matrix:
environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix)}}
environment: ${{ matrix.environment.name }}
steps:
- id: checkout
name: Check out source code
uses: actions/checkout@v3
- id: setup-aws
name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
role-duration-seconds: 3600
aws-region: ${{ secrets.AWS_REGION }}
- id: add-kubeconfig
name: Add k8s config file for existing cluster.
run: |-
aws eks update-kubeconfig --name biomage-$CLUSTER_ENV
- id: install-eksctl
name: Install eksctl
run: |-
curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin
- id: setup-cluster-cloudwatch-logging-policy
name: Setup permissions required for cluster to log to Cloudwatch
uses: aws-actions/aws-cloudformation-github-deploy@v1
with:
parameter-overrides: "Environment=${{ matrix.environment.type }}"
name: "cluster-cloudwatch-logging-policy-${{ matrix.environment.type }}"
template: 'infra/cluster-logging/cf-cluster-log-cloudwatch-policy.yaml'
no-fail-on-empty-changeset: "1"
capabilities: "CAPABILITY_IAM,CAPABILITY_NAMED_IAM"
# Setting up log forwarding for pods hosted in EC2 nodes
- id: create-fluent-bit-namespace
name: Create namespace for node FluentBit deployment
run: kubectl apply -f infra/cluster-logging/node-fluentbit-namespace.yaml
- id: create-service-account-for-node-fluent-bit
name: Create service account for node FluentBit
env:
LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
run: |-
eksctl create iamserviceaccount \
--name fluent-bit \
--namespace node-logging \
--cluster biomage-$CLUSTER_ENV \
--role-name irsa-fluent-bit-$CLUSTER_ENV \
--attach-policy-arn $LOGGING_POLICY_ARN \
--override-existing-serviceaccounts \
--approve
- id: deploy-node-fluent-bit
name: Deploy FluentBit for EC2 nodes
env:
AWS_REGION: ${{ secrets.AWS_REGION }}
run: |
# The FluentBit configuration is defined in infra/cluster-logging/node-fluentbit-config.yaml, specifically under [INPUT] > Path
# We do not log everything, for cost/security reasons
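# (Illustrative only: Fluent Bit's tail input selects files via its Path key, e.g.
#   Path  /var/log/containers/*_pipeline-*.log
#  The actual, narrower pattern lives in the config file above, not here.)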
yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/node-fluentbit-config.yaml
yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/node-fluentbit-config.yaml
kubectl apply -f infra/cluster-logging/node-fluentbit-config.yaml
# Setting up log forwarding for pods hosted on Fargate nodes
- id: attach-pod-execution-role-name
name: Attach logging policy to pod execution role
env:
LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
run: |-
# Pods launched in the same cluster share the same pod execution role, as the pod execution role scope is cluster-wide.
# See https://eksctl.io/usage/fargate-support/#creating-a-cluster-with-fargate-support
# Describing the fargate-profile of either pipeline or worker in the same cluster therefore returns the same pod execution role.
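# The awk call keeps only the text after the final '/', i.e. the role name from the pod execution role ARN.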
POD_EXEC_ROLE_NAME=$(aws eks describe-fargate-profile \
--cluster-name biomage-$CLUSTER_ENV \
--fargate-profile-name pipeline-default | jq -r '.fargateProfile.podExecutionRoleArn' | awk -F"/" '{print (NF>1)? $NF : ""}' )
aws iam attach-role-policy --role-name $POD_EXEC_ROLE_NAME --policy-arn $LOGGING_POLICY_ARN
- id: deploy-fargate-fluent-bit
name: Deploy FluentBit config for Fargate pods
env:
AWS_REGION: ${{ secrets.AWS_REGION }}
run: |-
# The FluentBit configuration is defined in infra/cluster-logging/fargate-fluentbit-config.yaml
yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/fargate-fluentbit-config.yaml
yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/fargate-fluentbit-config.yaml
kubectl apply -f infra/cluster-logging/fargate-fluentbit-config.yaml
# Setting up Datadog to watch pod metrics for pods hosted on EC2 and Fargate nodes
- id: setup-datadog-cluster-agent
name: Setup Datadog cluster agent
run: |-
if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]];
then
helm repo add datadog https://helm.datadoghq.com
helm repo update
helm upgrade datadog-agent datadog/datadog \
-f infra/datadog/cluster-agent-values.yaml \
--set datadog.apiKey=$DATADOG_API_KEY \
--set datadog.clusterName=biomage-$CLUSTER_ENV \
--install
else
echo "Datadog api key missing, skipping datadog setup"
fi
- id: setup-datadog-sidecar-permissions
name: Setup Datadog sidecar permissions
run: |-
if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]];
then
kubectl apply -f infra/datadog/datadog-sidecar-rbac.yaml
fi
- id: setup-orca-cspm
name: Setup ORCA CSPM
run: |-
if [[ -n "${{ secrets.ORCA_TUNNEL_ID }}" ]];
then
helm upgrade --install orca-tunnel \
--namespace orca-security --create-namespace \
oci://public.ecr.aws/orcasecurity/helm-k8s-tunnel \
--set tunnelAddr=tunnel.production.us-east-1.orcasecurity.net \
--set tunnelId="${{ secrets.ORCA_TUNNEL_ID }}" \
--set tunnelToken="${{ secrets.ORCA_TUNNEL_TOKEN }}" \
--set clusterName="biomage-$CLUSTER_ENV" \
--set cloudVendorId="${{ secrets.AWS_ACCOUNT_ID }}" \
--set region="${{ secrets.AWS_REGION }}" \
--set clusterType=eks
else
echo "ORCA_TUNNEL_ID missing, skipping ORCA CSPM setup."
fi
- id: login-ecr
name: Login to Amazon ECR
uses: aws-actions/amazon-ecr-login@v1
- id: create-falcon-ecr-registries
name: Create ECR repositories for the Falcon Sensor (if needed)
# This will fail if the repositories already exist, which is fine. If there is some other
# error, the `push` step will fail instead.
continue-on-error: true
run: |-
if [[ -n "${{ secrets.FALCON_CID }}" ]];
then
aws ecr create-repository --repository-name falcon-container/falcon-sensor --image-tag-mutability MUTABLE
aws ecr create-repository --repository-name falcon-sensor/falcon-sensor --image-tag-mutability MUTABLE
else
echo "CrowdStrike CID missing, not creating falcon sensor repos"
fi
- id: create-falcon-namespace
name: Attempt to create falcon namespace
continue-on-error: true
run: |-
if [[ -n "${{ secrets.FALCON_CID }}" ]];
then
kubectl create namespace falcon-system
else
echo "CrowdStrike CID missing, not creating falcon namespace"
fi
- id: setup-falcon-sensor
name: Setup Falcon Sensor
run: |-
if [[ -n "${{ secrets.FALCON_CID }}" ]];
then
# configure the API client ID and password
export FALCON_CLIENT_ID="${{ secrets.FALCON_CLIENT_ID }}"
export FALCON_CLIENT_SECRET="${{ secrets.FALCON_CLIENT_SECRET }}"
# configure the CID
export FALCON_CID="${{ secrets.FALCON_CID }}"
# URL of falcon-container-sensor-pull.sh
PULL_SCRIPT_URL="https://raw.githubusercontent.com/CrowdStrike/falcon-scripts/main/bash/containers/falcon-container-sensor-pull/falcon-container-sensor-pull.sh"
# Download the pull script from GitHub and save it to the current directory
# --silent - Suppresses progress and error output
# --remote-name - Keeps the original filename when saving
# --location - Follow redirects
curl --silent --remote-name --location "$PULL_SCRIPT_URL"
# make script executable
chmod +x falcon-container-sensor-pull.sh
# download the latest version of the Falcon Container (for Fargate) and copy it to ECR
./falcon-container-sensor-pull.sh \
--client-id ${FALCON_CLIENT_ID} \
--client-secret ${FALCON_CLIENT_SECRET} \
--type falcon-container \
--copy "${ECR_REGISTRY}/falcon-container"
# download the latest version of the Falcon Node Sensor (for EC2) and copy it to ECR
./falcon-container-sensor-pull.sh \
--client-id ${FALCON_CLIENT_ID} \
--client-secret ${FALCON_CLIENT_SECRET} \
--type falcon-sensor \
--copy "${ECR_REGISTRY}/falcon-sensor"
# functions to get image names for helm
get_image_path() {
local container_type=$1
./falcon-container-sensor-pull.sh \
--client-id ${FALCON_CLIENT_ID} \
--client-secret ${FALCON_CLIENT_SECRET} \
--type ${container_type} \
--get-image-path
}
get_image_name() {
local container_type=$1
local image_path=$(get_image_path "$container_type")
# Extract the image name using awk
local image_name=$(echo "$image_path" | awk -F':' '{print $2}')
echo "$image_name"
}
FALCON_CONTAINER_IMAGE_TAG=$(get_image_name "falcon-container")
FALCON_SENSOR_IMAGE_TAG=$(get_image_name "falcon-sensor")
# install container sensor (for fargate) into a customized namespace
helm repo add crowdstrike https://crowdstrike.github.io/falcon-helm
helm repo update
helm upgrade --install falcon-container-helm crowdstrike/falcon-sensor \
-n falcon-container-system --create-namespace \
--set node.enabled=false \
--set container.enabled=true \
--set falcon.cid="$FALCON_CID" \
--set container.image.repository="${ECR_REGISTRY}/falcon-container/falcon-sensor" \
--set container.image.tag="$FALCON_CONTAINER_IMAGE_TAG"
# install node sensor (for ec2) with different release name (falcon-sensor-helm)
helm upgrade --install falcon-sensor-helm crowdstrike/falcon-sensor \
-n falcon-sensor-system --create-namespace \
--set falcon.cid="$FALCON_CID" \
--set node.image.repository="${ECR_REGISTRY}/falcon-sensor/falcon-sensor" \
--set node.image.tag="$FALCON_SENSOR_IMAGE_TAG"
# install KPA (kubernetes protection agent)
helm upgrade --install kpagent crowdstrike/cs-k8s-protection-agent \
-n falcon-kubernetes-protection --create-namespace \
--set image.repository="registry.crowdstrike.com/kubernetes_protection/kpagent" \
--set image.tag="0.2117.0" \
--set crowdstrikeConfig.clientID="$FALCON_CLIENT_ID" \
--set crowdstrikeConfig.clientSecret="$FALCON_CLIENT_SECRET" \
--set crowdstrikeConfig.clusterName="arn:aws:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/biomage-${CLUSTER_ENV}" \
--set crowdstrikeConfig.env="${{ secrets.FALCON_REGION }}" \
--set crowdstrikeConfig.cid="${{ secrets.FALCON_CCID }}" \
--set crowdstrikeConfig.dockerAPIToken="${{ secrets.FALCON_DOCKER_API_TOKEN }}"
else
echo "CrowdStrike CID missing, skipping falcon sensor setup"
fi
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
AWS_REGION: ${{ secrets.AWS_REGION }}
AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
report-if-failed:
name: Report if workflow failed
runs-on: ubuntu-20.04
needs: [load-config, check-secrets, create-eks-cluster, configure-cluster, deploy-monitoring]
if: failure() && github.ref == 'refs/heads/master'
steps:
- id: send-to-slack
name: Send failure notification to Slack on failure
env:
SLACK_BOT_TOKEN: ${{ secrets.WORKFLOW_STATUS_BOT_TOKEN }}
uses: voxmedia/github-action-slack-notify-build@v1
with:
channel: workflow-failures
status: FAILED
color: danger