Deploy Cellenics infrastructure on AWS #592
name: Deploy Cellenics infrastructure on AWS
on:
  workflow_dispatch:
    inputs:
      environment_name:
        type: string
        description: Select the environment name to run the actions on
        required: true
        default: all
      workflow_actions:
        type: choice
        description: Select actions to perform
        options:
          - create and configure cluster
          - configure cluster
          - deploy monitoring
        default: configure cluster
      environment_type:
        type: choice
        description: Select environment type
        options:
          - staging
          - production
          - staging and production
        default: staging
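# Illustrative example (not part of this workflow): a run can also be started from the GitHub CLI
# with the same inputs, e.g.
#   gh workflow run "Deploy Cellenics infrastructure on AWS" \
#     -f environment_name=all -f workflow_actions="configure cluster" -f environment_type=staging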
# this ensures that only one CI pipeline with the same key
# can run at once in order to prevent undefined states
concurrency: cluster-update-mutex
permissions:
  id-token: write
  contents: read
# After load-config and check-secrets jobs are finished:
# "create and configure cluster" workflow_actions option runs all jobs.
# "configure cluster" workflow_actions option runs only configure-cluster job
# "deploy monitoring" workflow_actions option runs only deploy-monitoring job
jobs:
  load-config:
    uses: ./.github/workflows/load-config.yaml
    with:
      environment_name: ${{ github.event.inputs.environment_name }}
      environment_type: ${{ github.event.inputs.environment_type }}
  check-secrets:
    name: Check that sufficient secrets are specified for environment name
    runs-on: ubuntu-20.04
    needs: load-config
    strategy:
      matrix:
        environment_name: ${{ fromJson(needs.load-config.outputs.environment_names) }}
    environment: ${{ matrix.environment_name }}
    steps:
      - id: check-secrets-for-environment
        name: Check if necessary secrets are installed.
        run: |-
          echo Checking if secrets are defined in the repository.
          if [ -z "${{ secrets.ACM_CERTIFICATE_ARN}}" ]
          then
            echo AWS certificate ARN is not defined.
            ERROR=true
          fi
          if [ -z "${{ secrets.AWS_ACCOUNT_ID }}" ]
          then
            echo AWS Account ID is not defined.
            ERROR=true
          fi
          if [ -z "${{ secrets.API_TOKEN_GITHUB }}" ]
          then
            echo GitHub deploy key access token is not defined.
            ERROR=true
          fi
          if [ -z "${{ secrets.PRIMARY_DOMAIN_NAME }}" ]
          then
            echo Secret PRIMARY_DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.
            ERROR=true
          fi
          if [ -z "${{ secrets.DOMAIN_NAME }}" ]
          then
            echo Secret DOMAIN_NAME is not set in repository secrets. Make sure this secret exists in the repository secrets.
            ERROR=true
          fi
          if [ -n "$ERROR" ]
          then
            echo
            echo This workflow requires some secrets to complete.
            echo Please make sure they are created by adding/rotating them manually.
            exit 1
          fi
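    # The secrets checked above can be added or rotated by hand before re-running the workflow.
    # A minimal sketch, assuming the GitHub CLI is authenticated against this repository
    # (the value is a placeholder):
    #   gh secret set AWS_ACCOUNT_ID --body "123456789012"
    # The secret name must match the one checked above exactly.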
  create-eks-cluster:
    name: Create EKS cluster
    runs-on: ubuntu-20.04
    needs: [check-secrets, load-config]
    if: github.event.inputs.workflow_actions == 'create and configure cluster'
    env:
      CLUSTER_ENV: ${{ matrix.environment.type }}
    strategy:
      max-parallel: 1
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
    environment: ${{ matrix.environment.name }}
    steps:
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 4800
          aws-region: ${{ secrets.AWS_REGION }}
      - id: fill-metadata
        name: Add name and region to the eksctl file.
        run: |-
          export CLUSTER_NAME="biomage-$CLUSTER_ENV"
          yq -i '
            .metadata.name = strenv(CLUSTER_NAME) |
            .metadata.region = strenv(AWS_REGION)
          ' infra/config/cluster/cluster-template.yaml
          # CELLENICS_VPC_ID is set if using custom cluster deployment. In this case, use the custom template file.
          # If not set, create an empty template file to let eksctl create a new cluster for the deployment.
          if [ ! -z "$CELLENICS_VPC_ID" ]; then
            export PRIVATE_SUBNET_1_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false" | jq -r '.Subnets[0].SubnetId')
            export PRIVATE_SUBNET_2_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=false" | jq -r '.Subnets[1].SubnetId')
            export PUBLIC_SUBNET_1_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true" | jq -r '.Subnets[0].SubnetId')
            export PUBLIC_SUBNET_2_ID=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$CELLENICS_VPC_ID" "Name=map-public-ip-on-launch,Values=true" | jq -r '.Subnets[1].SubnetId')
            yq '
              .vpc.id = strenv(CELLENICS_VPC_ID) |
              .vpc.subnets.private.private-1 = strenv(PRIVATE_SUBNET_1_ID) |
              .vpc.subnets.private.private-2 = strenv(PRIVATE_SUBNET_2_ID) |
              .vpc.subnets.public.public-1 = strenv(PUBLIC_SUBNET_1_ID) |
              .vpc.subnets.public.public-2 = strenv(PUBLIC_SUBNET_2_ID)
            ' infra/config/cluster/cluster-config-template.yaml > /tmp/cluster-config-values.yaml
          else
            touch /tmp/cluster-config-values.yaml
          fi
          yq eval-all '. as $item ireduce ({}; . *d $item)' infra/config/cluster/cluster-template.yaml /tmp/cluster-config-values.yaml > /tmp/cluster-$CLUSTER_ENV.yaml
          cat /tmp/cluster-$CLUSTER_ENV.yaml
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          CELLENICS_VPC_ID: ${{ secrets.CELLENICS_VPC_ID }}
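        # Illustrative sketch of the merged spec printed above; the field values here are
        # placeholders, not taken from the real templates:
        #   metadata:
        #     name: biomage-staging
        #     region: us-east-1
        #   vpc:
        #     id: vpc-0123456789abcdef0      # only present when CELLENICS_VPC_ID is set
        #     subnets:
        #       private: { private-1: subnet-aaa, private-2: subnet-bbb }
        #       public: { public-1: subnet-ccc, public-2: subnet-ddd }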
      - id: install-eksctl
        name: Install eksctl
        run: |-
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
      - id: create-clusters
        name: Attempt to create clusters from spec.
        # this step will not fail the job, irrespective of whether creation was successful or not.
        # this is because the cluster may already exist. we will check for this condition
        # on failure in the next step
        continue-on-error: true
        run: |-
          exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)
          eksctl create cluster -f /tmp/cluster-$CLUSTER_ENV.yaml
          echo "outcome=created" >> $GITHUB_OUTPUT
      - id: check-for-failure
        name: Check for reason of failure if cluster creation failed.
        if: steps.create-clusters.outcome == 'failure'
        run: |-
          # Check if failure was caused by an already exists exception.
          # If not, the job should fail.
          ALREADY_EXISTS=$(grep AlreadyExistsException /tmp/eksctl-$CLUSTER_ENV.log | wc -l | xargs)
          if [ $ALREADY_EXISTS -ne 1 ]
          then
            echo Step failed for reason other than stack already existing.
            echo Job failing...
            echo "reason=error" >> $GITHUB_OUTPUT
            false
          fi
          echo Cluster already exists.
          echo "reason=already-exists" >> $GITHUB_OUTPUT
      - id: update-addons-for-cluster
        name: Attempt to create addons for cluster.
        continue-on-error: true
        run: |-
          exec &> >(tee /tmp/eksctl-$CLUSTER_ENV.log)
          eksctl create addon -f /tmp/cluster-$CLUSTER_ENV.yaml
      - id: update-nodegroup
        name: Attempt to update node groups for existing cluster.
        if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
        run: |-
          eksctl create nodegroup --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
          eksctl delete nodegroup --config-file /tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve
      # note: iam service accounts should really be created from within the helm chart as seen here:
      # https://docs.aws.amazon.com/eks/latest/userguide/specify-service-account-role.html
      - id: update-serviceaccounts
        name: Attempt to update IAM service accounts for existing cluster.
        if: steps.create-clusters.outcome == 'failure' && steps.check-for-failure.outputs.reason == 'already-exists'
        run: |-
          eksctl utils associate-iam-oidc-provider --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --approve
          eksctl create iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml
          eksctl delete iamserviceaccount --config-file=/tmp/cluster-$CLUSTER_ENV.yaml --only-missing --approve
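    # A quick manual check that the cluster created above exists (illustrative, not run by CI):
    #   eksctl get cluster --name biomage-staging --region <aws-region>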
  configure-cluster:
    name: Configure Kubernetes resources on the EKS cluster
    runs-on: ubuntu-20.04
    needs: [check-secrets, create-eks-cluster, load-config]
    if: always() && (github.event.inputs.workflow_actions == 'create and configure cluster' || github.event.inputs.workflow_actions == 'configure cluster') && (needs.check-secrets.result == 'success') && (needs.create-eks-cluster.result == 'success' || needs.create-eks-cluster.result == 'skipped')
    env:
      CLUSTER_ENV: ${{ matrix.environment.type }}
      API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
    strategy:
      max-parallel: 1
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix) }}
    environment: ${{ matrix.environment.name }}
    steps:
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 4800
          aws-region: ${{ secrets.AWS_REGION }}
      - id: add-kubeconfig
        name: Add k8s config file for existing cluster.
        run: |-
          aws eks update-kubeconfig --name biomage-$CLUSTER_ENV
      - id: deploy-metrics-server
        name: Deploy k8s metrics server
        run: |-
          kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
      - id: install-helm
        name: Install Helm
        run: |-
          sudo snap install helm --classic
      - id: install-eksctl
        name: Install eksctl
        run: |-
          ARCH=amd64
          PLATFORM=$(uname -s)_$ARCH
          curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz"
          tar -xzf eksctl_$PLATFORM.tar.gz -C /tmp && rm eksctl_$PLATFORM.tar.gz
          sudo mv /tmp/eksctl /usr/local/bin
      - id: deploy-load-balancer-role
        name: Deploy permissions for AWS load balancer controller
        run: |-
          curl -o iam-policy.json https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.10.1/docs/install/iam_policy.json
          aws iam create-policy \
            --policy-name AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
            --policy-document file://iam-policy.json || true
          eksctl create iamserviceaccount \
            --cluster=biomage-$CLUSTER_ENV \
            --namespace=kube-system \
            --name=aws-load-balancer-controller \
            --attach-policy-arn=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:policy/AWSLoadBalancerControllerIAMPolicy-$CLUSTER_ENV \
            --role-name eksctl-$CLUSTER_ENV-load-balancer-controller-role \
            --override-existing-serviceaccounts \
            --approve
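        # The iamserviceaccount created above links the IAM policy to the controller's Kubernetes
        # service account through IRSA. A hand-run sanity check (illustrative) is to look for the
        # eks.amazonaws.com/role-arn annotation:
        #   kubectl -n kube-system get serviceaccount aws-load-balancer-controller -o yaml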
      # we need to retry this due to an active issue with the AWS Load Balancer Controller
      # where there are intermittent failures that are only fixable by retrying
      # see issue at https://github.com/kubernetes-sigs/aws-load-balancer-controller/issues/2071
      - id: install-lbc
        name: Deploy AWS Load Balancer Controller
        uses: nick-invision/retry@v2
        with:
          timeout_seconds: 600
          max_attempts: 5
          retry_on: error
          on_retry_command: sleep $(shuf -i 5-15 -n 1)
          command: |-
            helm repo add eks https://aws.github.io/eks-charts
            wget https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml
            kubectl apply -f crds.yaml
            helm repo update
            helm upgrade aws-load-balancer-controller eks/aws-load-balancer-controller \
              --namespace kube-system \
              --set serviceAccount.create=false \
              --set serviceAccount.name=aws-load-balancer-controller \
              --set clusterName=biomage-$CLUSTER_ENV \
              --install --wait
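        # If the retries above are exhausted, the controller state can be inspected manually
        # (illustrative commands, not part of the workflow):
        #   kubectl -n kube-system rollout status deployment/aws-load-balancer-controller
        #   kubectl -n kube-system logs deployment/aws-load-balancer-controller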
      - id: platform-public-facing
        name: Get config for whether platform should be public facing
        uses: mikefarah/yq@master
        with:
          cmd: yq '.[env(ENVIRONMENT_NAME)].publicFacing' 'infra/config/github-environments-config.yaml'
        env:
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}
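        # The yq lookup above assumes infra/config/github-environments-config.yaml is keyed by
        # environment name. A minimal sketch of the expected shape (values are illustrative):
        #   my-environment:
        #     publicFacing: true
        #     selfSignedCertificate: false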
      - id: install-elb-503-subscription-endpoint
        name: Install ELB 503 subscription endpoint
        run: |-
          echo "value of publicFacing: $PUBLIC_FACING"
          # Check that publicFacing is set to true or false
          if [ "$PUBLIC_FACING" != "true" ] && [ "$PUBLIC_FACING" != "false" ]; then
            echo "value of publicFacing in infra/config/github-environments-config.yaml is not set to true or false"
            exit 1
          fi
          # this endpoint is needed so SNS does not stop trying to subscribe to not-yet-deployed
          # API staging environments whose endpoints are not yet available.
          helm upgrade aws-elb-503-subscription-endpoint infra/aws-elb-503-subscription-endpoint \
            --namespace default \
            --set clusterEnv=$CLUSTER_ENV \
            --set acmCertificate="$ACM_CERTIFICATE_ARN" \
            --set-string publicFacing="$PUBLIC_FACING" \
            --install --wait
        env:
          PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
          ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
      - id: deploy-env-loadbalancer
        name: Deploy AWS Application Load Balancer for environment
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }},PublicFacing=${{ steps.platform-public-facing.outputs.result }}"
          name: "biomage-k8s-alb-${{ matrix.environment.type }}"
          template: 'infra/cf-loadbalancer.yaml'
          no-fail-on-empty-changeset: "1"
      # For HMS, ACM_CERTIFICATE_ARN_STAGING exists because staging and production use different domains,
      # so we need to check whether it exists and fall back to ACM_CERTIFICATE_ARN otherwise.
      # The same applies to PRIMARY_DOMAIN_NAME_STAGING.
      - id: setup-domain
        name: Compile environment-specific domain name
        run: |-
          if [ "${{ matrix.environment.type }}" = "production" ]; then
            PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
            DOMAIN_NAME="${{ secrets.DOMAIN_NAME }}"
            ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
          fi
          if [ "${{ matrix.environment.type }}" = "staging" ]; then
            PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME_STAGING }}"
            if [ -z "$PRIMARY_DOMAIN_NAME" ]; then
              PRIMARY_DOMAIN_NAME="${{ secrets.PRIMARY_DOMAIN_NAME }}"
            fi
            DOMAIN_NAME="${{ secrets.DOMAIN_NAME_STAGING }}"
            ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN_STAGING }}"
            if [ -z "$ACM_CERTIFICATE_ARN" ]; then
              ACM_CERTIFICATE_ARN="${{ secrets.ACM_CERTIFICATE_ARN }}"
            fi
          fi
          echo "primary-domain-name=$PRIMARY_DOMAIN_NAME" >> $GITHUB_OUTPUT
          echo "domain-name=$DOMAIN_NAME" >> $GITHUB_OUTPUT
          echo "acm-certificate-arn=$ACM_CERTIFICATE_ARN" >> $GITHUB_OUTPUT
      # This step should be run only once per deployment. The associated Route 53 records to be created,
      # e.g. DOMAIN_NAME and *.DOMAIN_NAME, should be deleted before running this step, otherwise it fails.
      # Refer to the new deployment runbook to learn more.
      # - id: deploy-route53
      #   name: Deploy Route 53 DNS records to ELB
      #   uses: aws-actions/aws-cloudformation-github-deploy@v1
      #   with:
      #     parameter-overrides: "Environment=${{ matrix.environment.type }},DNSName=${{ steps.deploy-env-loadbalancer.outputs.DNSName }},HostedZoneId=${{ steps.deploy-env-loadbalancer.outputs.CanonicalHostedZoneID }},PrimaryDomainName=${{ steps.setup-domain.outputs.primary-domain-name }},DomainName=${{ steps.setup-domain.outputs.domain-name }}"
      #     name: "biomage-alb-route53-${{ matrix.environment.type }}"
      #     template: 'infra/cf-route53.yaml'
      #     no-fail-on-empty-changeset: "1"
      - id: deploy-xray-daemon
        name: Deploy AWS X-Ray daemon
        run: |-
          helm upgrade "aws-xray-daemon" infra/aws-xray-daemon \
            --namespace default \
            --set iamRole=arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/xray-daemon-role-$CLUSTER_ENV \
            --install --wait
      - id: install-ebs-csi-driver
        name: Install AWS EBS Container Storage Interface (CSI) drivers
        run: |-
          helm upgrade \
            aws-ebs-csi-driver https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/download/helm-chart-aws-ebs-csi-driver-2.17.2/aws-ebs-csi-driver-2.17.2.tgz \
            --namespace kube-system \
            --set enableVolumeScheduling=true \
            --set enableVolumeResizing=true \
            --set enableVolumeSnapshot=true \
            --install --wait
      - id: deploy-read-only-group
        name: Deploy read-only permission definition for cluster
        run: |-
          helm upgrade "biomage-read-only-group" infra/biomage-read-only-group \
            --install --wait
      - id: deploy-state-machine-role
        name: Deploy AWS Step Function (state machine) roles
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }}"
          name: "biomage-state-machine-role-${{ matrix.environment.type }}"
          template: 'infra/cf-state-machine-role.yaml'
          capabilities: 'CAPABILITY_IAM,CAPABILITY_NAMED_IAM'
          no-fail-on-empty-changeset: "1"
      - id: remove-identitymappings
        name: Remove all previous identity mappings for IAM users
        run: |-
          eksctl get iamidentitymapping --cluster=biomage-$CLUSTER_ENV --output=json | \
            jq -r '.[] | select(.userarn != null) | .userarn' > /tmp/users_to_remove
          while IFS= read -r user
          do
            echo "Remove rights of $user"
            eksctl delete iamidentitymapping \
              --cluster=biomage-$CLUSTER_ENV \
              --arn $user \
              --all
          done < "/tmp/users_to_remove"
      # see https://eksctl.io/usage/iam-identity-mappings/
      # Grant login rights to ci-iac-role
      - id: add-ci-iac-oidc-cluster-role
        name: Allow the OIDC role to log in to our cluster.
        run: |-
          eksctl create iamidentitymapping \
            --cluster=biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/ci-iac-role \
            --group system:masters \
            --username ci-iac-role
      # SSO access to cluster is only added if accessing AWS and cluster using SSO
      - id: allow-sso-roles-to-access-cluster
        env:
          SSO_ROLE: ${{ secrets.SSO_ROLE }}
        if: ${{ env.SSO_ROLE != '' }}
        name: Allow SSO role to log into the cluster
        run: |-
          eksctl create iamidentitymapping \
            --cluster biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/${{ env.SSO_ROLE }} \
            --username sso-cluster-admin \
            --no-duplicate-arns \
            --group system:masters
      - id: add-state-machine-cluster-role
        name: Grant rights to the state machine IAM role.
        run: |-
          eksctl create iamidentitymapping \
            --cluster=biomage-$CLUSTER_ENV \
            --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:role/state-machine-role-$CLUSTER_ENV \
            --group state-machine-runner-group \
            --username state-machine-runner
      # NOTE: after updating this step, make sure you apply the updates in other relevant Github Actions workflows
      - id: update-identitymapping-admin
        name: Add cluster admin rights to everyone on the admin list.
        run: |-
          echo "Setting cluster admin rights for ${{matrix.environment.name}} in ${{matrix.environment.type}} environment"
          ADMINS="${{ join(matrix.environment.admins, ' ') }}"
          echo $ADMINS
          for user in $ADMINS; do
            echo "Adding cluster admin rights to $user"
            eksctl create iamidentitymapping \
              --cluster=biomage-$CLUSTER_ENV \
              --arn arn:aws:iam::${{ steps.setup-aws.outputs.aws-account-id }}:user/$user \
              --group system:masters \
              --username $user
          done
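        # The resulting aws-auth mappings can be reviewed by hand afterwards (illustrative):
        #   eksctl get iamidentitymapping --cluster=biomage-$CLUSTER_ENV --output=json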
      ###
      ### INSTALL AND CONFIGURE FLUX V2 ###
      ###
      - id: using-self-signed-certificate
        name: Get config for whether deployment is using self-signed certificate
        uses: mikefarah/yq@master
        with:
          cmd: yq '.[env(ENVIRONMENT_NAME)].selfSignedCertificate' 'infra/config/github-environments-config.yaml'
        env:
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}
      - id: fill-account-specific-metadata
        name: Fill in account specific metadata in ConfigMap
        run: |-
          yq -i '
            .myAccount.domainName = strenv(DOMAIN_NAME) |
            .myAccount.region = strenv(AWS_REGION) |
            .myAccount.accountId = strenv(AWS_ACCOUNT_ID) |
            .myAccount.publicFacing = strenv(PUBLIC_FACING) |
            .myAccount.acmCertificate = strenv(ACM_CERTIFICATE_ARN) |
            .myAccount.selfSignedCertificate = strenv(SELF_SIGNED_CERTIFICATE)
          ' infra/config/account-config.yaml
          if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]]
          then
            export DATADOG_API_KEY="${{ secrets.DATADOG_API_KEY }}"
            export DATADOG_APP_KEY="${{ secrets.DATADOG_APP_KEY }}"
            yq -i '
              .myAccount.datadogAppKey = strenv(DATADOG_APP_KEY) |
              .myAccount.datadogApiKey = strenv(DATADOG_API_KEY)
            ' infra/config/account-config.yaml
          fi
          cat infra/config/account-config.yaml
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
          DOMAIN_NAME: ${{ steps.setup-domain.outputs.domain-name }}
          ACM_CERTIFICATE_ARN: ${{ steps.setup-domain.outputs.acm-certificate-arn }}
          PUBLIC_FACING: ${{ steps.platform-public-facing.outputs.result }}
          SELF_SIGNED_CERTIFICATE: ${{ steps.using-self-signed-certificate.outputs.result }}
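        # Rough shape of infra/config/account-config.yaml after the edits above; the keys follow
        # the yq paths used, the values are placeholders:
        #   myAccount:
        #     domainName: cellenics.example.org
        #     region: us-east-1
        #     accountId: "000000000000"
        #     publicFacing: "true"
        #     acmCertificate: arn:aws:acm:us-east-1:000000000000:certificate/example
        #     selfSignedCertificate: "false"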
      - id: create-flux-namespace
        name: Attempt to create flux namespace
        continue-on-error: true
        run: |-
          kubectl create namespace flux-system
      - id: create-account-information-configmap
        name: Create a configmap containing AWS account specific details
        continue-on-error: false
        run: |-
          kubectl create configmap account-config --from-file=infra/config/account-config.yaml -n flux-system -o yaml --dry-run=client | kubectl apply -f -
      - id: install-flux-v2
        name: Install flux CLI version 2.3.0
        run: |-
          curl -s https://fluxcd.io/install.sh | sudo FLUX_VERSION=2.3.0 bash
      - id: delete-old-flux-github-deploy-key
        name: Attempt to delete previous github flux deploy key
        continue-on-error: true
        run: |-
          kubectl -n flux-system delete secret flux-system
      - id: install-flux
        name: Install Flux to EKS cluster
        run: |-
          # Refer to https://github.com/fluxcd/flux2/releases
          FLUX_VERSION=v2.3.0
          FLUX_REPO=releases
          FLUX_PATH=deployments/$ENVIRONMENT_NAME-$CLUSTER_ENV
          REPO_FULL_PATH=$GITHUB_REPOSITORY_OWNER/$FLUX_REPO
          echo "flux-full-repo=$(echo $REPO_FULL_PATH)" >> $GITHUB_ENV
          echo "flux-path=$(echo $FLUX_PATH)" >> $GITHUB_ENV
          args=(
            --version $FLUX_VERSION
            --owner $GITHUB_REPOSITORY_OWNER
            --repository $FLUX_REPO
            --branch master
            --path $FLUX_PATH
            --timeout 40s
            --interval 2m
            --components-extra=image-reflector-controller,image-automation-controller
            --namespace flux-system
            --cluster arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
            --context arn:aws:eks:$AWS_REGION:$AWS_ACCOUNT_ID:cluster/biomage-$CLUSTER_ENV
          )
          if [ "${{ matrix.environment.type }}" = "staging" ]
          then
            echo Flux will be deployed in staging with read and write permissions
            args+=(--read-write-key)
          elif [ "${{ matrix.environment.type }}" = "production" ]
          then
            echo Flux will be deployed in production with read-only permissions
          fi
          flux bootstrap github "${args[@]}"
        env:
          GITHUB_TOKEN: ${{ secrets.API_TOKEN_GITHUB }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
          ENVIRONMENT_NAME: ${{ matrix.environment.name }}
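        # After bootstrap, Flux health can be checked by hand (illustrative, not part of CI):
        #   flux check
        #   flux get kustomizations --namespace flux-system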
      - id: fill-in-sync-yaml
        name: Create the sync.yaml file that contains the Kustomization to sync the cluster
        run: |-
          export SPEC_PATH="./$CLUSTER_ENV"
          yq -i '
            .spec.path = strenv(SPEC_PATH)
          ' infra/flux/sync.yaml
          cat infra/flux/sync.yaml
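        # Sketch of what infra/flux/sync.yaml is expected to contain after this edit; it is a Flux
        # Kustomization, and everything except spec.path below is an assumption:
        #   apiVersion: kustomize.toolkit.fluxcd.io/v1
        #   kind: Kustomization
        #   spec:
        #     path: ./staging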
      - id: push-sync-yaml
        name: Push the sync.yaml file that was filled in during the previous step
        uses: dmnemec/[email protected]
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        with:
          source_file: infra/flux/sync.yaml
          destination_repo: ${{ env.flux-full-repo }}
          destination_folder: ${{ env.flux-path }}
          user_email: [email protected]
          user_name: 'Biomage CI/CD'
      - id: fill-kustomization-template
        name: Fill in Kustomization template
        run: |-
          cat infra/flux/kustomization-template.yaml \
            | sed "s/AWS_ACCOUNT_ID/$AWS_ACCOUNT_ID/g" \
            | sed "s/CLUSTER_ENV/$CLUSTER_ENV/g" \
            > infra/flux/kustomization.yaml
          cat infra/flux/kustomization.yaml
        env:
          AWS_ACCOUNT_ID: ${{ steps.setup-aws.outputs.aws-account-id }}
      - id: push-kustomization-yaml
        name: Push the kustomization.yaml file to apply our custom config
        uses: dmnemec/[email protected]
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        with:
          source_file: infra/flux/kustomization.yaml
          destination_repo: ${{ env.flux-full-repo }}
          destination_folder: ${{ env.flux-path }}/flux-system
          user_email: [email protected]
          user_name: 'Biomage CI/CD'
      - id: install-kubernetes-reflector
        name: Install kubernetes reflector
        run: |-
          helm repo add emberstack https://emberstack.github.io/helm-charts
          helm repo update
          helm upgrade --install reflector emberstack/reflector --namespace flux-system
      - id: add-account-config-configmap-annotations
        name: Add annotations to account-config configmap
        run: |-
          kubectl annotate configmap account-config \
            --overwrite \
            --namespace flux-system \
            reflector.v1.k8s.emberstack.com/reflection-allowed="true" \
            reflector.v1.k8s.emberstack.com/reflection-allowed-namespaces="ui-.*,api-.*,pipeline-.*,worker-.*" \
            reflector.v1.k8s.emberstack.com/reflection-auto-enabled="true"
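        # With these annotations, reflector mirrors the account-config configmap into any namespace
        # matching ui-.*, api-.*, pipeline-.* or worker-.* as those namespaces appear, so the
        # deployed charts can read the account details without the configmap being recreated there.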
      ###
      ### END OF INSTALL AND CONFIGURE FLUX V2 ###
      ###
  deploy-monitoring:
    name: Setup logging and monitoring
    runs-on: ubuntu-20.04
    needs: [check-secrets, create-eks-cluster, configure-cluster, load-config]
    if: always() && (needs.check-secrets.result == 'success') && (github.event.inputs.workflow_actions == 'deploy monitoring')
    env:
      CLUSTER_ENV: ${{ matrix.environment.type }}
      API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
    strategy:
      matrix:
        environment: ${{ fromJson(needs.load-config.outputs.deployment_matrix)}}
    environment: ${{ matrix.environment.name }}
    steps:
      - id: checkout
        name: Check out source code
        uses: actions/checkout@v3
      - id: setup-aws
        name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/ci-iac-role
          role-duration-seconds: 3600
          aws-region: ${{ secrets.AWS_REGION }}
      - id: add-kubeconfig
        name: Add k8s config file for existing cluster.
        run: |-
          aws eks update-kubeconfig --name biomage-$CLUSTER_ENV
      - id: install-eksctl
        name: Install eksctl
        run: |-
          curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
          sudo mv /tmp/eksctl /usr/local/bin
      - id: setup-cluster-cloudwatch-logging-policy
        name: Setup permissions required for cluster to log to Cloudwatch
        uses: aws-actions/aws-cloudformation-github-deploy@v1
        with:
          parameter-overrides: "Environment=${{ matrix.environment.type }}"
          name: "cluster-cloudwatch-logging-policy-${{ matrix.environment.type }}"
          template: 'infra/cluster-logging/cf-cluster-log-cloudwatch-policy.yaml'
          no-fail-on-empty-changeset: "1"
          capabilities: "CAPABILITY_IAM,CAPABILITY_NAMED_IAM"
      # Setting up log forwarding for pods hosted in EC2 nodes
      - id: create-fluent-bit-namespace
        name: Create namespace for node FluentBit deployment
        run: kubectl apply -f infra/cluster-logging/node-fluentbit-namespace.yaml
      - id: create-service-account-for-node-fluent-bit
        name: Create service account for node FluentBit
        env:
          LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
        run: |-
          eksctl create iamserviceaccount \
            --name fluent-bit \
            --namespace node-logging \
            --cluster biomage-$CLUSTER_ENV \
            --role-name irsa-fluent-bit-$CLUSTER_ENV \
            --attach-policy-arn $LOGGING_POLICY_ARN \
            --override-existing-serviceaccounts \
            --approve
      - id: deploy-node-fluent-bit
        name: Deploy FluentBit for EC2 nodes
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |
          # FluentBit configuration is defined in infra/cluster-logging/node-fluentbit-config.yaml, specifically under [INPUT] > Path.
          # We do not want to log everything, for cost and security reasons.
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/node-fluentbit-config.yaml
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/node-fluentbit-config.yaml
          kubectl apply -f infra/cluster-logging/node-fluentbit-config.yaml
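        # The yq edits above only substitute the CI_CLUSTER_ENV / CI_AWS_REGION placeholders inside
        # the FluentBit manifest before it is applied, e.g. (illustrative) a value such as
        # "biomage-CI_CLUSTER_ENV" becomes "biomage-staging".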
      # Setting up log forwarding for pods hosted on Fargate nodes
      - id: attach-pod-execution-role-name
        name: Attach logging policy to pod execution role
        env:
          LOGGING_POLICY_ARN: ${{ steps.setup-cluster-cloudwatch-logging-policy.outputs.PolicyARN }}
        run: |-
          # Pods launched in the same cluster have the same pod execution role, as the pod execution role's scope is cluster-wide.
          # See https://eksctl.io/usage/fargate-support/#creating-a-cluster-with-fargate-support
          # Describing the fargate-profile of either pipeline or worker in the same cluster therefore returns the same pod execution role.
          POD_EXEC_ROLE_NAME=$(aws eks describe-fargate-profile \
            --cluster-name biomage-$CLUSTER_ENV \
            --fargate-profile-name pipeline-default | jq -r '.fargateProfile.podExecutionRoleArn' | awk -F"/" '{print (NF>1)? $NF : ""}' )
          aws iam attach-role-policy --role-name $POD_EXEC_ROLE_NAME --policy-arn $LOGGING_POLICY_ARN
      - id: deploy-fargate-fluent-bit
        name: Deploy FluentBit config for Fargate pods
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
        run: |-
          # FluentBit configuration is determined in infra/cluster-logging/fargate-fluentbit-config.yaml
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_CLUSTER_ENV\", \"$CLUSTER_ENV\")" infra/cluster-logging/fargate-fluentbit-config.yaml
          yq -i "(.. | select(type == \"!!str\")) |= sub(\"CI_AWS_REGION\", \"$AWS_REGION\")" infra/cluster-logging/fargate-fluentbit-config.yaml
          kubectl apply -f infra/cluster-logging/fargate-fluentbit-config.yaml
      # Setting up Datadog to watch pod metrics for pods hosted on EC2 and Fargate nodes
      - id: setup-datadog-cluster-agent
        name: Setup Datadog cluster agent
        run: |-
          if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]];
          then
            helm repo add datadog https://helm.datadoghq.com
            helm repo update
            helm upgrade datadog-agent datadog/datadog \
              -f infra/datadog/cluster-agent-values.yaml \
              --set datadog.apiKey=$DATADOG_API_KEY \
              --set datadog.clusterName=biomage-$CLUSTER_ENV \
              --install
          else
            echo "Datadog api key missing, skipping datadog setup"
          fi
      - id: setup-datadog-sidecar-permissions
        name: Setup Datadog sidecar permissions
        run: |-
          if [[ -n "${{ secrets.DATADOG_API_KEY }}" ]];
          then
            kubectl apply -f infra/datadog/datadog-sidecar-rbac.yaml
          fi
      - id: setup-orca-cspm
        name: Setup ORCA CSPM
        run: |-
          if [[ -n "${{ secrets.ORCA_TUNNEL_ID }}" ]];
          then
            helm upgrade --install orca-tunnel \
              --namespace orca-security --create-namespace \
              oci://public.ecr.aws/orcasecurity/helm-k8s-tunnel \
              --set tunnelAddr=tunnel.production.us-east-1.orcasecurity.net \
              --set tunnelId="${{ secrets.ORCA_TUNNEL_ID }}" \
              --set tunnelToken="${{ secrets.ORCA_TUNNEL_TOKEN }}" \
              --set clusterName="biomage-$CLUSTER_ENV" \
              --set cloudVendorId="${{ secrets.AWS_ACCOUNT_ID }}" \
              --set region="${{ secrets.AWS_REGION }}" \
              --set clusterType=eks
          else
            echo "ORCA_TUNNEL_ID missing, skipping ORCA CSPM setup."
          fi
      - id: login-ecr
        name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@v1
      - id: create-falcon-ecr-registries
        name: Create ECR repositories for the Falcon Sensor (if needed)
        # This will fail if the repositories already exist, which is fine. If there is some other
        # error, the `push` step will fail instead.
        continue-on-error: true
        run: |-
          if [[ -n "${{ secrets.FALCON_CID }}" ]];
          then
            aws ecr create-repository --repository-name falcon-container/falcon-sensor --image-tag-mutability MUTABLE
            aws ecr create-repository --repository-name falcon-sensor/falcon-sensor --image-tag-mutability MUTABLE
          else
            echo "CrowdStrike CID missing, not creating falcon sensor repos"
          fi
      - id: create-falcon-namespace
        name: Attempt to create falcon namespace
        continue-on-error: true
        run: |-
          if [[ -n "${{ secrets.FALCON_CID }}" ]];
          then
            kubectl create namespace falcon-system
          else
            echo "CrowdStrike CID missing, not creating falcon namespace"
          fi
      - id: setup-falcon-sensor
        name: Setup Falcon Sensor
        run: |-
          if [[ -n "${{ secrets.FALCON_CID }}" ]];
          then
            # configure the API client ID and password
            export FALCON_CLIENT_ID="${{ secrets.FALCON_CLIENT_ID }}"
            export FALCON_CLIENT_SECRET="${{ secrets.FALCON_CLIENT_SECRET }}"
            # configure the CID
            export FALCON_CID="${{ secrets.FALCON_CID }}"
            # URL of falcon-container-sensor-pull.sh
            PULL_SCRIPT_URL="https://raw.githubusercontent.com/CrowdStrike/falcon-scripts/main/bash/containers/falcon-container-sensor-pull/falcon-container-sensor-pull.sh"
            # Download the pull script from GitHub and save it to the current directory
            # --silent - Suppresses standard/error output
            # --remote-name - Keeps the original filename when saving
            # --location - Follow redirects
            curl --silent --remote-name --location "$PULL_SCRIPT_URL"
            # make script executable
            chmod +x falcon-container-sensor-pull.sh
            # download latest version of the Falcon Container (for fargate) and copy it to ECR
            ./falcon-container-sensor-pull.sh \
              --client-id ${FALCON_CLIENT_ID} \
              --client-secret ${FALCON_CLIENT_SECRET} \
              --type falcon-container \
              --copy "${ECR_REGISTRY}/falcon-container"
            # download latest version of the Falcon Node Sensor (for EC2) and copy it to ECR
            ./falcon-container-sensor-pull.sh \
              --client-id ${FALCON_CLIENT_ID} \
              --client-secret ${FALCON_CLIENT_SECRET} \
              --type falcon-sensor \
              --copy "${ECR_REGISTRY}/falcon-sensor"
            # functions to get image names for helm
            get_image_path() {
              local container_type=$1
              ./falcon-container-sensor-pull.sh \
                --client-id ${FALCON_CLIENT_ID} \
                --client-secret ${FALCON_CLIENT_SECRET} \
                --type ${container_type} \
                --get-image-path
            }
            get_image_name() {
              local container_type=$1
              local image_path=$(get_image_path "$container_type")
              # Extract the image name using awk
              local image_name=$(echo "$image_path" | awk -F':' '{print $2}')
              echo "$image_name"
            }
            FALCON_CONTAINER_IMAGE_TAG=$(get_image_name "falcon-container")
            FALCON_SENSOR_IMAGE_TAG=$(get_image_name "falcon-sensor")
            # install container sensor (for fargate) into a customized namespace
            helm repo add crowdstrike https://crowdstrike.github.io/falcon-helm
            helm repo update
            helm upgrade --install falcon-container-helm crowdstrike/falcon-sensor \
              -n falcon-container-system --create-namespace \
              --set node.enabled=false \
              --set container.enabled=true \
              --set falcon.cid="$FALCON_CID" \
              --set container.image.repository="${ECR_REGISTRY}/falcon-container/falcon-sensor" \
              --set container.image.tag="$FALCON_CONTAINER_IMAGE_TAG"
            # install node sensor (for ec2) with different release name (falcon-sensor-helm)
            helm upgrade --install falcon-sensor-helm crowdstrike/falcon-sensor \
              -n falcon-sensor-system --create-namespace \
              --set falcon.cid="$FALCON_CID" \
              --set node.image.repository="${ECR_REGISTRY}/falcon-sensor/falcon-sensor" \
              --set node.image.tag="$FALCON_SENSOR_IMAGE_TAG"
            # install KPA (kubernetes protection agent)
            helm upgrade --install kpagent crowdstrike/cs-k8s-protection-agent \
              -n falcon-kubernetes-protection --create-namespace \
              --set image.repository="registry.crowdstrike.com/kubernetes_protection/kpagent" \
              --set image.tag="0.2117.0" \
              --set crowdstrikeConfig.clientID="$FALCON_CLIENT_ID" \
              --set crowdstrikeConfig.clientSecret="$FALCON_CLIENT_SECRET" \
              --set crowdstrikeConfig.clusterName="arn:aws:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/biomage-${CLUSTER_ENV}" \
              --set crowdstrikeConfig.env="${{ secrets.FALCON_REGION }}" \
              --set crowdstrikeConfig.cid="${{ secrets.FALCON_CCID }}" \
              --set crowdstrikeConfig.dockerAPIToken="${{ secrets.FALCON_DOCKER_API_TOKEN }}"
          else
            echo "CrowdStrike CID missing, skipping falcon sensor setup"
          fi
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
  report-if-failed:
    name: Report if workflow failed
    runs-on: ubuntu-20.04
    needs: [load-config, check-secrets, create-eks-cluster, configure-cluster, deploy-monitoring]
    if: failure() && github.ref == 'refs/heads/master'
    steps:
      - id: send-to-slack
        name: Send failure notification to Slack on failure
        env:
          SLACK_BOT_TOKEN: ${{ secrets.WORKFLOW_STATUS_BOT_TOKEN }}
        uses: voxmedia/github-action-slack-notify-build@v1
        with:
          channel: workflow-failures
          status: FAILED
          color: danger