fix: Resiliency changes testing changes (#1149)
Co-authored-by: Sai Vennam <[email protected]>
vishdivg and svennam92 authored Oct 30, 2024
1 parent 59a0984 commit 6a6f9a3
Showing 10 changed files with 61 additions and 85 deletions.
4 changes: 4 additions & 0 deletions lab/iam/policies/iam.yaml
@@ -15,9 +15,12 @@ Statement:
- iam:TagRole
- iam:PassRole
- sts:AssumeRole
- iam:DeleteServiceLinkedRole
- iam:GetServiceLinkedRoleDeletionStatus
Resource:
- !Sub arn:aws:iam::${AWS::AccountId}:role/${Env}*
- !Sub arn:aws:iam::${AWS::AccountId}:role/eksctl-${Env}*
- !Sub arn:aws:iam::${AWS::AccountId}:role/aws-service-role/fis*
- Effect: Allow
Action:
- iam:CreatePolicy
@@ -82,3 +85,4 @@ Statement:
- eks-fargate.amazonaws.com
- guardduty.amazonaws.com
- spot.amazonaws.com
- fis.amazonaws.com
@@ -35,7 +35,7 @@ done

# Delete IAM Roles and Policies
echo "Cleaning up IAM roles and policies..."
for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-workshop"; do
for role_prefix in "eks-workshop-fis-role" "eks-workshop-canary-role"; do
for role in $(aws iam list-roles --query "Roles[?starts_with(RoleName, '${role_prefix}')].RoleName" --output text); do
echo "Processing role: $role"
for policy in $(aws iam list-attached-role-policies --role-name $role --query "AttachedPolicies[*].PolicyArn" --output text); do
@@ -48,7 +48,11 @@ for role_prefix in "fis-execution-role-eks-workshop" "canary-execution-role-eks-
done
done

for policy_prefix in "eks-resiliency-fis-policy" "eks-resiliency-canary-policy"; do
# Delete fis service role

safe_delete "aws iam delete-service-linked-role --role-name AWSServiceRoleForFIS" "IAM role AWSServiceRoleForFIS"

for policy_prefix in "eks-workshop-resiliency-fis-policy" "eks-workshop-resiliency-canary-policy"; do
for policy_arn in $(aws iam list-policies --scope Local --query "Policies[?starts_with(PolicyName, '${policy_prefix}')].Arn" --output text); do
safe_delete "aws iam delete-policy --policy-arn $policy_arn" "IAM policy $policy_arn"
done
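The new `iam:DeleteServiceLinkedRole` and `iam:GetServiceLinkedRoleDeletionStatus` permissions in `iam.yaml` back the `safe_delete` call above: deleting a service-linked role such as `AWSServiceRoleForFIS` is asynchronous and returns a deletion task id that can be polled. A minimal sketch of that pattern, assuming the role exists and the AWS CLI is configured (the polling loop is illustrative, not part of this commit):

```bash
# Illustrative only: delete the FIS service-linked role and wait for the deletion task to finish.
TASK_ID=$(aws iam delete-service-linked-role \
  --role-name AWSServiceRoleForFIS \
  --query 'DeletionTaskId' --output text)

while true; do
  STATUS=$(aws iam get-service-linked-role-deletion-status \
    --deletion-task-id "$TASK_ID" \
    --query 'Status' --output text)
  echo "Deletion status: $STATUS"
  case "$STATUS" in
    SUCCEEDED|FAILED) break ;;
  esac
  sleep 5
done
```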
@@ -137,7 +137,7 @@ resource "random_id" "suffix" {
}

resource "aws_iam_role" "fis_role" {
name = "${var.addon_context.eks_cluster_id}-fis_role-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-fis-role-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
@@ -211,7 +211,7 @@ resource "aws_iam_role_policy_attachment" "fis_cni_policy" {

# Policy for creating FIS experiment templates
resource "aws_iam_policy" "eks_resiliency_fis_policy" {
name = "${var.addon_context.eks_cluster_id}-resiliency_fis_policy-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-resiliency-fis-policy-${random_id.suffix.hex}"
path = "/"
description = "Custom policy for EKS resiliency FIS experiments"

@@ -276,7 +276,7 @@ resource "aws_iam_role_policy_attachment" "eks_resiliency_fis_policy_attachment"

# Canary IAM role
resource "aws_iam_role" "canary_role" {
name = "${var.addon_context.eks_cluster_id}-canary_role-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-canary-role-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
@@ -307,7 +307,7 @@ resource "aws_iam_role_policy_attachment" "canary_lambda_basic_execution" {

# Policy for Canary
resource "aws_iam_policy" "eks_resiliency_canary_policy" {
name = "${var.addon_context.eks_cluster_id}-resiliency_canary_policy-${random_id.suffix.hex}"
name = "${var.addon_context.eks_cluster_id}-resiliency-canary-policy-${random_id.suffix.hex}"
path = "/"
description = "Custom policy for EKS resiliency Canary"

@@ -360,39 +360,6 @@ resource "aws_iam_role_policy_attachment" "eks_resiliency_canary_policy_attachme
role = aws_iam_role.canary_role.name
}

# EKS Cluster IAM Role
resource "aws_iam_role" "eks_cluster_role" {
name = "eks-workshop-cluster-role-${var.addon_context.eks_cluster_id}-${random_id.suffix.hex}"

assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Effect = "Allow"
Principal = {
Service = "eks.amazonaws.com"
}
Action = "sts:AssumeRole"
}
]
})

lifecycle {
create_before_destroy = true
}
}

# Attach required policies to EKS Cluster role
resource "aws_iam_role_policy_attachment" "eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster_role.name
}

resource "aws_iam_role_policy_attachment" "eks_vpc_resource_controller" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
role = aws_iam_role.eks_cluster_role.name
}

# Executable Scripts
resource "null_resource" "chmod_all_scripts_bash" {
provisioner "local-exec" {
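The renames in this file (underscores to hyphens in `fis-role`, `resiliency-fis-policy`, `canary-role`, and `resiliency-canary-policy`) line up with the new prefixes the cleanup script greps for, assuming `var.addon_context.eks_cluster_id` resolves to `eks-workshop`. A quick sanity check after an apply could reuse the same CLI queries as the cleanup script (a sketch only; the actual names end in the random hex suffix):

```bash
# Sketch: confirm the renamed role and policy exist under the prefixes the cleanup script expects.
aws iam list-roles \
  --query "Roles[?starts_with(RoleName, 'eks-workshop-fis-role')].RoleName" --output text
aws iam list-policies --scope Local \
  --query "Policies[?starts_with(PolicyName, 'eks-workshop-resiliency-fis-policy')].PolicyName" --output text
```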
@@ -1,13 +1,12 @@
output "environment_variables" {
description = "Environment variables to be added to the IDE shell"
value = {
LBC_CHART_VERSION = var.load_balancer_controller_chart_version
LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn
FIS_ROLE_ARN = aws_iam_role.fis_role.arn
RANDOM_SUFFIX = random_id.suffix.hex
SCRIPT_DIR = var.script_dir
CANARY_ROLE_ARN = aws_iam_role.canary_role.arn
EKS_CLUSTER_ROLE_ARN = aws_iam_role.eks_cluster_role.arn
AWS_REGION = data.aws_region.current.name
LBC_CHART_VERSION = var.load_balancer_controller_chart_version
LBC_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn
FIS_ROLE_ARN = aws_iam_role.fis_role.arn
RANDOM_SUFFIX = random_id.suffix.hex
SCRIPT_DIR = var.script_dir
CANARY_ROLE_ARN = aws_iam_role.canary_role.arn
AWS_REGION = data.aws_region.current.name
}
}
@@ -37,15 +37,17 @@ generate_output() {
# Initial clear screen
# clear_screen

trap 'rm -rf ~/environment/eks-workshop/temp_output.txt; exit 0' SIGTERM INT
# Main loop
while true; do

# Generate output to a temporary file
generate_output > temp_output.txt
generate_output > ~/environment/eks-workshop/temp_output.txt

#generate_output
# Clear screen and display the new output
# clear_screen
cat temp_output.txt
cat ~/environment/eks-workshop/temp_output.txt
# clear_screen

# Wait before next update
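The `trap ... exit 0` on SIGTERM/INT added above works together with the `timeout --preserve-status` flag introduced in the docs that follow: plain `timeout` exits with status 124 when the time limit expires, which would break a `command && timeout ...` chain, whereas `--preserve-status` returns the monitored script's own exit code, which is 0 here because the trap exits cleanly. A standalone sketch of the difference (illustrative, not part of the commit):

```bash
# Without --preserve-status, timeout itself reports the expiry (exit code 124).
timeout 2s sleep 10; echo "exit: $?"

# With --preserve-status and a command that traps SIGTERM and exits 0,
# a chained `aws fis start-experiment ... && timeout ...` stays at exit 0.
timeout --preserve-status 2s bash -c 'trap "exit 0" TERM INT; while true; do sleep 1; done'
echo "exit: $?"
```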
@@ -21,7 +21,7 @@ It's important to note that this experiment is repeatable, allowing you to run i
To simulate the node failure and monitor its effects, run the following command:

```bash timeout=240
$ ~/$SCRIPT_DIR/node-failure.sh && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
$ ~/$SCRIPT_DIR/node-failure.sh && timeout --preserve-status 180s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-127-82.us-west-2.compute.internal:
@@ -63,18 +63,18 @@ This command counts the total number of nodes in the `Ready` state and continuou

Once all nodes are ready, we'll redeploy the pods to ensure they are balanced across the nodes:

```bash timeout=900 wait=60
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
```bash timeout=900 wait=30
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
@@ -55,8 +55,8 @@ $ export NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"de

Execute the FIS experiment to simulate the node failure and monitor the response:

```bash timeout=240
$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout 240s ~/$SCRIPT_DIR/get-pods-by-az.sh
```bash timeout=300
$ aws fis start-experiment --experiment-template-id $NODE_EXP_ID --output json && timeout --preserve-status 240s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-127-82.us-west-2.compute.internal:
@@ -86,19 +86,19 @@ Your retail url should stay operational unlike the node failure without FIS.
:::note
To verify nodes and re-balance pods, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
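Because this page now lets the monitoring script exit via `timeout --preserve-status`, it can be useful to check the experiment state directly rather than relying on the monitor output alone. A sketch, capturing the experiment id from the `start-experiment` response (the `EXPERIMENT_ID` variable is illustrative and not defined by the workshop):

```bash
# Illustrative: start the experiment, capture its id, and query its state.
EXPERIMENT_ID=$(aws fis start-experiment \
  --experiment-template-id $NODE_EXP_ID \
  --query 'experiment.id' --output text)

aws fis get-experiment --id "$EXPERIMENT_ID" \
  --query 'experiment.state' --output json
```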
@@ -29,7 +29,7 @@ $ export FULL_NODE_EXP_ID=$(aws fis create-experiment-template --cli-input-json
Execute the FIS experiment and monitor the cluster's response:

```bash timeout=420
$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
$ aws fis start-experiment --experiment-template-id $FULL_NODE_EXP_ID --output json && timeout --preserve-status 360s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-106-250.us-west-2.compute.internal:
@@ -59,19 +59,19 @@ Due to the severity of the experiment, the retail store url will not stay operat
:::note
To verify nodes and pods redistribution, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=3 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
4 changes: 2 additions & 2 deletions website/docs/observability/high-availability/06-az-setup.md
@@ -8,8 +8,8 @@ description: "Scale your application to two instances and prepare for an AZ fail

To see the full impact of an Availability Zone (AZ) failure, let's first scale up to two instances per AZ as well as increase the number of pods up to 9:

```bash timeout=120
$ ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
```bash timeout=120 wait=30
$ export ASG_NAME=$(aws autoscaling describe-auto-scaling-groups --query "AutoScalingGroups[? Tags[? (Key=='eks:cluster-name') && Value=='eks-workshop']].AutoScalingGroupName" --output text)
$ aws autoscaling update-auto-scaling-group \
--auto-scaling-group-name $ASG_NAME \
--desired-capacity 6 \
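After the `update-auto-scaling-group` call above (which continues beyond the visible hunk), a quick check that the new desired capacity took effect and that six nodes registered could look like this (a sketch; the describe query mirrors the one used to find `ASG_NAME`):

```bash
# Sketch: confirm the ASG was scaled and count the Ready nodes in the cluster.
aws autoscaling describe-auto-scaling-groups \
  --auto-scaling-group-names $ASG_NAME \
  --query "AutoScalingGroups[0].[DesiredCapacity,MinSize,MaxSize]" --output text

kubectl get nodes --no-headers | grep -c " Ready"
```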
18 changes: 9 additions & 9 deletions website/docs/observability/high-availability/07-az-failure.md
@@ -20,8 +20,8 @@ $ export ZONE_EXP_ID=$(aws fis create-experiment-template --cli-input-json '{"de

Execute the FIS experiment to simulate the AZ failure:

```bash timeout=560
$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout 180s ~/$SCRIPT_DIR/get-pods-by-az.sh
```bash timeout=540
$ aws fis start-experiment --experiment-template-id $ZONE_EXP_ID --output json && timeout --preserve-status 480s ~/$SCRIPT_DIR/get-pods-by-az.sh

------us-west-2a------
ip-10-42-100-4.us-west-2.compute.internal:
@@ -61,19 +61,19 @@ During this time, the retail url will stay available showing how resilient EKS i
:::note
To verify nodes and pods redistribution, you can run:

```bash timeout=900 wait=60
```bash timeout=900 wait=30
$ EXPECTED_NODES=6 && while true; do ready_nodes=$(kubectl get nodes --no-headers | grep " Ready" | wc -l); if [ "$ready_nodes" -eq "$EXPECTED_NODES" ]; then echo "All $EXPECTED_NODES expected nodes are ready."; echo "Listing the ready nodes:"; kubectl get nodes | grep " Ready"; break; else echo "Waiting for all $EXPECTED_NODES nodes to be ready... (Currently $ready_nodes are ready)"; sleep 10; fi; done
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=dynamodb
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=redis
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n orders -l app.kubernetes.io/component=mysql
$ kubectl delete pod --grace-period=0 --force -n ui -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n catalog -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n carts -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n checkout -l app.kubernetes.io/component=service
$ kubectl delete pod --grace-period=0 --force -n assets -l app.kubernetes.io/component=service
$ sleep 180
$ sleep 90
$ kubectl rollout status -n ui deployment/ui --timeout 180s
$ timeout 10s ~/$SCRIPT_DIR/get-pods-by-az.sh | head -n 30
```
