fix: Resiliency changes testing changes (#1149)
Co-authored-by: Sai Vennam <[email protected]>
vishdivg and svennam92 authored Oct 30, 2024
1 parent 59a0984 commit 6a6f9a3
Showing 10 changed files with 61 additions and 85 deletions.
4 changes: 4 additions & 0 deletions lab/iam/policies/iam.yaml
@@ -15,9 +15,12 @@ Statement:
- iam:TagRole
- iam:PassRole
- sts:AssumeRole
- iam:DeleteServiceLinkedRole
- iam:GetServiceLinkedRoleDeletionStatus
Resource:
- !Sub arn:aws:iam::${AWS::AccountId}:role/${Env}*
- !Sub arn:aws:iam::${AWS::AccountId}:role/eksctl-${Env}*
- !Sub arn:aws:iam::${AWS::AccountId}:role/aws-service-role/fis*
- Effect: Allow
Action:
- iam:CreatePolicy
@@ -82,3 +85,4 @@ Statement:
- eks-fargate.amazonaws.com
- guardduty.amazonaws.com
- spot.amazonaws.com
- fis.amazonaws.com
@@ -35,7 +35,7 @@ done

# Delete IAM Roles and Policies
echo "Cleaning up IAM roles and policies..."
for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do
for role_prefix in "eks-workshop-fis-role" "eks-workshop-canary-role"; do
for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do
echo "Processing role: $role"
for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do
@@ -48,7 +48,11 @@ for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-
done
done

for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do
# Delete fis service role

safe_delete "aws iam delete-service-linked-role --role-name AWSServiceRoleForFIS" "IAM role AWSServiceRoleForFIS"

for policy_prefix in "eks-workshop-resiliency-fis-policy" "eks-workshop-resiliency-canary-policy"; do
for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do
safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn"
done
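The new `iam:DeleteServiceLinkedRole` and `iam:GetServiceLinkedRoleDeletionStatus` permissions in `iam.yaml` back the `safe_delete` call above: deleting a service-linked role such as `AWSServiceRoleForFIS` is asynchronous and returns a deletion task id that can be polled. A minimal sketch of that pattern, assuming the role exists and the AWS CLI is configured (the polling loop is illustrative, not part of this commit):

```bash
# Illustrative only: delete the FIS service-linked role and wait for the deletion task to finish.
TASK_ID=$(aws iam delete-service-linked-role \
  --role-name AWSServiceRoleForFIS \
  --query 'DeletionTaskId' --output text)

while true; do
  STATUS=$(aws iam get-service-linked-role-deletion-status \
    --deletion-task-id "$TASK_ID" \
    --query 'Status' --output text)
  echo "Deletion status: $STATUS"
  case "$STATUS" in
    SUCCEEDED|FAILED) break ;;
  esac
  sleep 5
done
```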
@@ -137,7 +137,7 @@ resource "random_id" "suffix" {
}

resource "aws_iam_role" "fis_role" {
name = "${var.addon_context.eks_cluster_id}-fis_role-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-fis-role-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
@@ -211,7 +211,7 @@ resource "aws_iam_role_policy_attachment" "fis_cni_policy" {

# Policy for creating FIS experiment templates
resource "aws_iam_policy" "eks_resiliency_fis_policy" {
name = "${var.addon_context.eks_cluster_id}-resiliency_fis_policy-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-resiliency-fis-policy-${random_id.suffix.hex}"
path = "/"
description = "Custom policy for EKS resiliency FIS experiments"

@@ -276,7 +276,7 @@ resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment"

# Canary IAM role
resource "aws_iam_role" "canary_role" {
name = "${var.addon_context.eks_cluster_id}-canary_role-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-canary-role-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
@@ -307,7 +307,7 @@ resource "aws_iam_role_policy_attachment" "canary_lambda_basic_execution" {

# Policy for Canary
resource "aws_iam_policy" "eks_resiliency_canary_policy" {
name = "${var.addon_context.eks_cluster_id}-resiliency_canary_policy-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-resiliency-canary-policy-${random_id.suffix.hex}"
path = "/"
description = "Custom policy for EKS resiliency Canary"

@@ -360,39 +360,6 @@ resource "aws_iam_role_policy_attachment" "eks_resiliency_canary_policy_attachme
role = aws_iam_role.canary_role.name
}

# EKS Cluster IAM Role
resource "aws_iam_role" "eks_cluster_role" {
name = "eks-workshop-cluster-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Principal = {
Service = "eks.amazonaws.com"
}
Action = "sts:AssumeRole"
}
]
})

lifecycle {
create_before_destroy = true
}
}

# Attach required policies to EKS Cluster role
resource "aws_iam_role_policy_attachment" "eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster_role.name
}

resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
role = aws_iam_role.eks_cluster_role.name
}

# Executable Scripts
resource "null_resource" "chmod_all_scripts_bash" {
provisioner "local-exec" {
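The renames in this file (underscores to hyphens in `fis-role`, `resiliency-fis-policy`, `canary-role`, and `resiliency-canary-policy`) line up with the new prefixes the cleanup script greps for, assuming `var.addon_context.eks_cluster_id` resolves to `eks-workshop`. A quick sanity check after an apply could reuse the same CLI queries as the cleanup script (a sketch only; the actual names end in the random hex suffix):

```bash
# Sketch: confirm the renamed role and policy exist under the prefixes the cleanup script expects.
aws iam list-roles \
  --query "Roles[?starts_with(RoleName, 'eks-workshop-fis-role')].RoleName" --output text
aws iam list-policies --scope Local \
  --query "Policies[?starts_with(PolicyName, 'eks-workshop-resiliency-fis-policy')].PolicyName" --output text
```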
@@ -1,13 +1,12 @@
output "environment_variables" {
description = "Environment variables to be added to the IDE shell"
value = {
LBC_CHART_VERSION = var.load_balancer_controller_chart_version
LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn
FIS_ROLE_ARN = aws_iam_role.fis_role.arn
RANDOM_SUFFIX = random_id.suffix.hex
SCRIPT_DIR = var.script_dir
CANARY_ROLE_ARN = aws_iam_role.canary_role.arn
EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn
AWS_REGION = data.aws_region.current.name
LBC_CHART_VERSION = var.load_balancer_controller_chart_version
LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn
FIS_ROLE_ARN = aws_iam_role.fis_role.arn
RANDOM_SUFFIX = random_id.suffix.hex
SCRIPT_DIR = var.script_dir
CANARY_ROLE_ARN = aws_iam_role.canary_role.arn
AWS_REGION = data.aws_region.current.name
}
}
@@ -37,15 +37,17 @@ generate_output() {
# Initial clear screen
# clear_screen

trap 'rm -rf ~/environment/eks-workshop/temp_output.txt; exit 0' SIGTERM INT
# Main loop
while true; do

# Generate output to a temporary file
generate_output > temp_output.txt
generate_output > ~/environment/eks-workshop/temp_output.txt

#generate_output
# Clear screen and display the new output
# clear_screen
cat temp_output.txt
cat ~/environment/eks-workshop/temp_output.txt
# clear_screen

# Wait before next update
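The `trap ... exit 0` on SIGTERM/INT added above works together with the `timeout --preserve-status` flag introduced in the docs that follow: plain `timeout` exits with status 124 when the time limit expires, which would break a `command && timeout ...` chain, whereas `--preserve-status` returns the monitored script's own exit code, which is 0 here because the trap exits cleanly. A standalone sketch of the difference (illustrative, not part of the commit):

```bash
# Without --preserve-status, timeout itself reports the expiry (exit code 124).
timeout 2s sleep 10; echo "exit: $?"

# With --preserve-status and a command that traps SIGTERM and exits 0,
# a chained `aws fis start-experiment ... && timeout ...` stays at exit 0.
timeout --preserve-status 2s bash -c 'trap "exit 0" TERM INT; while true; do sleep 1; done'
echo "exit: $?"
```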
@@ -21,7 +21,7 @@ It's important to note that this experiment is repeatable, allowing you to run i
To simulate the node failure and monitor its effects, run the following command:

```bash timeout=240
$ ~/$SCRIPT_DIR/node-failure.sh && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
$ ~/$SCRIPT_DIR/node-failure.sh && timeout --preserve-status 180s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-127-82.us-west-2.compute.internal:
@@ -63,18 +63,18 @@ This command counts the total number of nodes in the `Ready` state and continuou

Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes:

```bash timeout=900 wait=60
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
```bash timeout=900 wait=30
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
@@ -55,8 +55,8 @@ $ export NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"de

Execute the FIS experiment to simulate the node failure and monitor the response:

```bash timeout=240
$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout 240s ~/$SCRIPT_DIR/get-pods-by-az.sh
```bash timeout=300
$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout --preserve-status 240s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-127-82.us-west-2.compute.internal:
@@ -86,19 +86,19 @@ Your retail url should stay operational unlike the node failure without FIS.
:::note
To verify nodes and re-balance pods, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
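Because this page now lets the monitoring script exit via `timeout --preserve-status`, it can be useful to check the experiment state directly rather than relying on the monitor output alone. A sketch, capturing the experiment id from the `start-experiment` response (the `EXPERIMENT_ID` variable is illustrative and not defined by the workshop):

```bash
# Illustrative: start the experiment, capture its id, and query its state.
EXPERIMENT_ID=$(aws fis start-experiment \
  --experiment-template-id $NODE_EXP_ID \
  --query 'experiment.id' --output text)

aws fis get-experiment --id "$EXPERIMENT_ID" \
  --query 'experiment.state' --output json
```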
@@ -29,7 +29,7 @@ $ export FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json
Execute the FIS experiment and monitor the cluster's response:

```bash timeout=420
$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout --preserve-status 360s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-106-250.us-west-2.compute.internal:
@@ -59,19 +59,19 @@ Due to the severity of the experiment, the retail store url will not stay operat
:::note
To verify nodes and pods redistribution, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
4 changes: 2 additions & 2 deletions website/docs/observability/high-availability/06-az-setup.md
@@ -8,8 +8,8 @@ description: "Scale your application to two instances and prepare for an AZ fail

To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9:

```bash timeout=120
$ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
```bash timeout=120 wait=30
$ export ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
$ aws autoscaling update-auto-scaling-group \
--auto-scaling-group-name $ASG_NAME \
--desired-capacity 6 \
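After the `update-auto-scaling-group` call above (which continues beyond the visible hunk), a quick check that the new desired capacity took effect and that six nodes registered could look like this (a sketch; the describe query mirrors the one used to find `ASG_NAME`):

```bash
# Sketch: confirm the ASG was scaled and count the Ready nodes in the cluster.
aws autoscaling describe-auto-scaling-groups \
  --auto-scaling-group-names $ASG_NAME \
  --query "AutoScalingGroups[0].[DesiredCapacity,MinSize,MaxSize]" --output text

kubectl get nodes --no-headers | grep -c " Ready"
```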
18 changes: 9 additions & 9 deletions website/docs/observability/high-availability/07-az-failure.md
@@ -20,8 +20,8 @@ $ export ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"de

Execute the FIS experiment to simulate the AZ failure:

```bash timeout=560
$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
```bash timeout=540
$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout --preserve-status 480s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-100-4.us-west-2.compute.internal:
@@ -61,19 +61,19 @@ During this time, the retail url will stay available showing how resilient EKS i
:::note
To verify nodes and pods redistribution, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
