From beb5b81695c2ea2b2b89eab40c085fdd937e5437 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 9 Jun 2024 01:56:07 -0400 Subject: [PATCH 01/13] fix inte-test: Correctly exclude existing files and directories during data sync and integrity check - Add logic to generate a list of existing files in /home before performing the rsync operation. - Ensure that files and directories present in /home but not in /tmp/home are properly excluded during the diff check. - Handle cases where the same filename exists in both /home and /tmp/home but with different content, by including these files in the exclude list. - Update the rsync and diff commands to ensure accurate synchronization and verification of data integrity. - This fix addresses issues where the /home directory contains pre-existing files such as slurm-*.out generated by running SLURM jobs and the automatically created lost+found directory. --- .../recipes/init/restore_home_shared_data.rb | 50 ++++++++++++++++++- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index df13ca3c3..5c31e8d6e 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -18,19 +18,65 @@ # Restore the shared storage home data if it doesn't already exist # This is necessary to preserve any data in these directories that was # generated during the build of ParallelCluster AMIs after converting to - # shared storage and backed up to a temporary location previously + # shared storage and backed up to a temporary location previously. # Before removing the backup, ensure the data in the new home is the same # as the original to avoid any data loss or inconsistency. This is done # by using rsync to copy the data and diff to check for differences. + # The diff command excludes files that were originally in /home to focus on + # the newly synchronized files. This approach is necessary because the /home + # directory may contain pre-existing files such as slurm-*.out generated by + # running SLURM jobs and the automatically created lost+found directory. # Remove the backup after the copy is done and the data integrity is verified. bash "Restore /home" do user 'root' group 'root' code <<-EOH + # Generate a list of existing files in /home before the sync + find /home -type f > /tmp/home_existing_files.txt + + # Initialize an empty set for exclude options and directories to exclude + exclude_options="" + declare -A exclude_dirs + + # Process each file in the list + while IFS= read -r file; do + # Remove the /home/ prefix + relative_path=${file#/home/} + + # Initialize current path + current_path="/tmp/home" + + # Split the relative path by / + IFS='/' read -ra parts <<< "$relative_path" + + for part in "${parts[@]}"; do + current_path="$current_path/$part" + if [ ! -e "$current_path" ]; then + # If the path does not exist in /tmp/home, add the top-level directory to the exclude list + if [ -z "${exclude_dirs[$part]}" ]; then + exclude_dirs[$part]=1 + exclude_options="$exclude_options --exclude=$part" + fi + break + else + if [ -f "$current_path" ]; then + # If the path is a file, add it to the exclude list + exclude_options="$exclude_options --exclude=$part" + break + fi + # If the path is a directory, continue checking subdirectories + fi + done + done < /tmp/home_existing_files.txt + + # Sync data from /tmp/home to /home rsync -a --ignore-existing /tmp/home/ /home - diff_output=$(diff -r /tmp/home/ /home) + + # Perform the diff check, excluding the original files + diff_output=$(eval diff -r $exclude_options /tmp/home /home) if [ $? -eq 0 ]; then rm -rf /tmp/home/ + rm -rf /tmp/home_existing_files.txt else echo "Data integrity check failed comparing /home and /tmp/home: $diff_output" exit 1 From ee8d286fb125c11bf5f6d2425748c6cd6edf60ea Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 9 Jun 2024 08:14:27 -0400 Subject: [PATCH 02/13] add a 1 min sleep waiting for files generation in /home --- .../recipes/init/restore_home_shared_data.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 5c31e8d6e..2f52e36f7 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -31,6 +31,8 @@ user 'root' group 'root' code <<-EOH + # Sleep 1 min waiting for files generation in /home + sleep 60 # Generate a list of existing files in /home before the sync find /home -type f > /tmp/home_existing_files.txt From 63fcc41cfec6f8314967e1d5e777d8bf4c116d9e Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Sun, 9 Jun 2024 09:35:20 -0400 Subject: [PATCH 03/13] Modify the logic to generate a list of existing files and dirs in /home before the sync not only the files --- .../recipes/init/restore_home_shared_data.rb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 2f52e36f7..74c2e35a2 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -31,10 +31,8 @@ user 'root' group 'root' code <<-EOH - # Sleep 1 min waiting for files generation in /home - sleep 60 - # Generate a list of existing files in /home before the sync - find /home -type f > /tmp/home_existing_files.txt + # Generate a list of existing files and dirs in /home before the sync + find /home -mindepth 1 > /tmp/home_existing_files.txt # Initialize an empty set for exclude options and directories to exclude exclude_options="" From db3c7221c984f756643df3f40ac63f2afa0f368a Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 11 Jun 2024 13:24:57 -0400 Subject: [PATCH 04/13] Modify comments --- .../recipes/init/restore_home_shared_data.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 74c2e35a2..3ba42eab8 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -52,7 +52,7 @@ for part in "${parts[@]}"; do current_path="$current_path/$part" if [ ! -e "$current_path" ]; then - # If the path does not exist in /tmp/home, add the top-level directory to the exclude list + # If the path does not exist in /tmp/home, add the last part of path to the exclude list if [ -z "${exclude_dirs[$part]}" ]; then exclude_dirs[$part]=1 exclude_options="$exclude_options --exclude=$part" From ec699ca0c433ed89e983d2d43d7c2cf9d01641d4 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 13 Jun 2024 11:47:45 -0400 Subject: [PATCH 05/13] Address comments: 1. use --exclude-from instead of per file exclusion. 2. Minor changes of comments. --- .../recipes/init/restore_home_shared_data.rb | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 3ba42eab8..1fdc3255a 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -17,8 +17,7 @@ if node['cluster']['node_type'] == 'HeadNode' # Restore the shared storage home data if it doesn't already exist # This is necessary to preserve any data in these directories that was - # generated during the build of ParallelCluster AMIs after converting to - # shared storage and backed up to a temporary location previously. + # generated during the node bootstrap after converting to shared storage. # Before removing the backup, ensure the data in the new home is the same # as the original to avoid any data loss or inconsistency. This is done # by using rsync to copy the data and diff to check for differences. @@ -35,15 +34,12 @@ find /home -mindepth 1 > /tmp/home_existing_files.txt # Initialize an empty set for exclude options and directories to exclude - exclude_options="" declare -A exclude_dirs - # Process each file in the list + # Process each file and directory in the list to determine which paths should be excluded from the diff check while IFS= read -r file; do # Remove the /home/ prefix relative_path=${file#/home/} - - # Initialize current path current_path="/tmp/home" # Split the relative path by / @@ -55,13 +51,13 @@ # If the path does not exist in /tmp/home, add the last part of path to the exclude list if [ -z "${exclude_dirs[$part]}" ]; then exclude_dirs[$part]=1 - exclude_options="$exclude_options --exclude=$part" + echo $part >> /tmp/exclude_options.txt fi break else if [ -f "$current_path" ]; then # If the path is a file, add it to the exclude list - exclude_options="$exclude_options --exclude=$part" + echo $part >> /tmp/exclude_options.txt break fi # If the path is a directory, continue checking subdirectories @@ -73,10 +69,11 @@ rsync -a --ignore-existing /tmp/home/ /home # Perform the diff check, excluding the original files - diff_output=$(eval diff -r $exclude_options /tmp/home /home) + diff_output=$(diff -r --exclude-from=/tmp/exclude_options.txt /tmp/home /home) if [ $? -eq 0 ]; then rm -rf /tmp/home/ rm -rf /tmp/home_existing_files.txt + rm -rf /tmp/exclude_options.txt else echo "Data integrity check failed comparing /home and /tmp/home: $diff_output" exit 1 From 15f8592516dd956e311f7990c465dc6deb1f51a4 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 13 Jun 2024 14:49:53 -0400 Subject: [PATCH 06/13] Create /tmp/exclude_options.txt in advanced, preventing /tmp/exclude_options.txt not found error --- .../recipes/init/restore_home_shared_data.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 1fdc3255a..043cff945 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -35,6 +35,8 @@ # Initialize an empty set for exclude options and directories to exclude declare -A exclude_dirs + + touch /tmp/exclude_options.txt # Process each file and directory in the list to determine which paths should be excluded from the diff check while IFS= read -r file; do From 4fef7a7aa771b98e5e521b6b29c4f74bef79857b Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 13 Jun 2024 15:39:29 -0400 Subject: [PATCH 07/13] Minor change on format --- .../recipes/init/restore_home_shared_data.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index 043cff945..cf2e8af1c 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -35,7 +35,7 @@ # Initialize an empty set for exclude options and directories to exclude declare -A exclude_dirs - + touch /tmp/exclude_options.txt # Process each file and directory in the list to determine which paths should be excluded from the diff check From 2e6ea90ade8f455f0d76348d6806193e0441510a Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Fri, 14 Jun 2024 14:56:23 -0400 Subject: [PATCH 08/13] Check if the diff_output does not contain 'Only in /tmp/home', instead of using overcomplicating loop. Apply the same logic to restore_internal_use_shared_data.rb and config_default_user_home.rb --- .../recipes/init/config_default_user_home.rb | 6 +- .../recipes/init/restore_home_shared_data.rb | 57 +++---------------- .../init/restore_internal_use_shared_data.rb | 6 +- .../recipes/config_default_user_home_spec.rb | 6 +- .../recipes/mount_internal_use_efs_spec.rb | 6 +- 5 files changed, 23 insertions(+), 58 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index 9607007b0..d81fbd18e 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -71,11 +71,13 @@ group 'root' code <<-EOH diff_output=$(diff -r #{node['cluster']['cluster_user_home']} #{node['cluster']['cluster_user_local_home']}) - if [ $? -eq 0 ]; then + if [[ $diff_output != *"Only in #{node['cluster']['cluster_user_home']}"* ]]; then rm -rf /tmp#{node['cluster']['cluster_user_home']} rm -rf #{node['cluster']['cluster_user_home']} else - echo "Data integrity check failed comparing #{node['cluster']['cluster_user_local_home']} and #{node['cluster']['cluster_user_home']}: $diff_output" >&2 + only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{node['cluster']['cluster_user_home']}") + echo "Data integrity check failed comparing #{node['cluster']['cluster_user_local_home']} and #{node['cluster']['cluster_user_home']}. Differences:" + echo "$only_in_cluster_user_home" systemctl start sshd exit 1 fi diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index cf2e8af1c..e66a0b7ac 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -17,67 +17,24 @@ if node['cluster']['node_type'] == 'HeadNode' # Restore the shared storage home data if it doesn't already exist # This is necessary to preserve any data in these directories that was - # generated during the node bootstrap after converting to shared storage. + # generated during the node bootstrap after converting to shared storage + # and backed up to a temporary location previously. # Before removing the backup, ensure the data in the new home is the same # as the original to avoid any data loss or inconsistency. This is done # by using rsync to copy the data and diff to check for differences. - # The diff command excludes files that were originally in /home to focus on - # the newly synchronized files. This approach is necessary because the /home - # directory may contain pre-existing files such as slurm-*.out generated by - # running SLURM jobs and the automatically created lost+found directory. # Remove the backup after the copy is done and the data integrity is verified. bash "Restore /home" do user 'root' group 'root' code <<-EOH - # Generate a list of existing files and dirs in /home before the sync - find /home -mindepth 1 > /tmp/home_existing_files.txt - - # Initialize an empty set for exclude options and directories to exclude - declare -A exclude_dirs - - touch /tmp/exclude_options.txt - - # Process each file and directory in the list to determine which paths should be excluded from the diff check - while IFS= read -r file; do - # Remove the /home/ prefix - relative_path=${file#/home/} - current_path="/tmp/home" - - # Split the relative path by / - IFS='/' read -ra parts <<< "$relative_path" - - for part in "${parts[@]}"; do - current_path="$current_path/$part" - if [ ! -e "$current_path" ]; then - # If the path does not exist in /tmp/home, add the last part of path to the exclude list - if [ -z "${exclude_dirs[$part]}" ]; then - exclude_dirs[$part]=1 - echo $part >> /tmp/exclude_options.txt - fi - break - else - if [ -f "$current_path" ]; then - # If the path is a file, add it to the exclude list - echo $part >> /tmp/exclude_options.txt - break - fi - # If the path is a directory, continue checking subdirectories - fi - done - done < /tmp/home_existing_files.txt - - # Sync data from /tmp/home to /home rsync -a --ignore-existing /tmp/home/ /home - - # Perform the diff check, excluding the original files - diff_output=$(diff -r --exclude-from=/tmp/exclude_options.txt /tmp/home /home) - if [ $? -eq 0 ]; then + diff_output=$(diff -r /tmp/home/ /home) + if [[ $diff_output != *"Only in /tmp/home"* ]]; then rm -rf /tmp/home/ - rm -rf /tmp/home_existing_files.txt - rm -rf /tmp/exclude_options.txt else - echo "Data integrity check failed comparing /home and /tmp/home: $diff_output" + only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp/home") + echo "Data integrity check failed comparing /home and /tmp/home. Differences:" + echo "$only_in_tmp" exit 1 fi EOH diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb index 39dc93d9c..b2332d1d0 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -30,10 +30,12 @@ code <<-EOH rsync -a --ignore-existing /tmp#{dir}/ #{dir} diff_output=$(diff -r /tmp#{dir}/ #{dir}) - if [ $? -eq 0 ]; then + if [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then rm -rf /tmp#{dir}/ else - echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output" + only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp#{dir}") + echo "Data integrity check failed comparing #{dir} and /tmp#{dir}. Differences:" + echo "$only_in_tmp" exit 1 fi EOH diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb index 20ee258d2..6931fbd8b 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb @@ -29,11 +29,13 @@ expect(chef_run).to run_bash("Verify data integrity for #{user_home}").with( code: <<-CODE diff_output=$(diff -r #{user_home} #{user_local_home}) - if [ $? -eq 0 ]; then + if [[ $diff_output != *"Only in #{user_home}"* ]]; then rm -rf /tmp#{user_home} rm -rf #{user_home} else - echo "Data integrity check failed comparing #{user_local_home} and #{user_home}: $diff_output" >&2 + only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{user_home}") + echo "Data integrity check failed comparing #{user_local_home} and #{user_home}. Differences:" + echo "$only_in_cluster_user_home" systemctl start sshd exit 1 fi diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb index 34123f56b..c417d351b 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb @@ -37,10 +37,12 @@ code: <<-CODE rsync -a --ignore-existing /tmp#{dir}/ #{dir} diff_output=$(diff -r /tmp#{dir}/ #{dir}) - if [ $? -eq 0 ]; then + if [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then rm -rf /tmp#{dir}/ else - echo "Data integrity check failed comparing #{dir} and /tmp#{dir}: $diff_output" + only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp#{dir}") + echo "Data integrity check failed comparing #{dir} and /tmp#{dir}. Differences:" + echo "$only_in_tmp" exit 1 fi CODE From 5a16e245c242482cf1c334206e5827aaa22c25ae Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Fri, 14 Jun 2024 15:02:47 -0400 Subject: [PATCH 09/13] Minor changes on comments --- .../recipes/init/config_default_user_home.rb | 2 +- .../recipes/init/restore_home_shared_data.rb | 6 +++--- .../recipes/init/restore_internal_use_shared_data.rb | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index d81fbd18e..cf5643804 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -63,7 +63,7 @@ end # Data integrity check and cleanup for temporary backup and original home directory -# 1. Verifies data integrity by comparing the temporary backup directory and the new local home directory. +# 1. Verifies data integrity by comparing the original home directory and the new local home directory. # 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory. # 3. If the data integrity check fails, it outputs an error message and exits with an error code 1. bash "Verify data integrity for #{node['cluster']['cluster_user_home']}" do diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index e66a0b7ac..eca904b6c 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -17,9 +17,9 @@ if node['cluster']['node_type'] == 'HeadNode' # Restore the shared storage home data if it doesn't already exist # This is necessary to preserve any data in these directories that was - # generated during the node bootstrap after converting to shared storage - # and backed up to a temporary location previously. - # Before removing the backup, ensure the data in the new home is the same + # generated during the build of ParallelCluster AMIs after converting to + # shared storage and backed up to a temporary location previously + # Before removing the backup, ensure the new data in the new home is the same # as the original to avoid any data loss or inconsistency. This is done # by using rsync to copy the data and diff to check for differences. # Remove the backup after the copy is done and the data integrity is verified. diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb index b2332d1d0..11c1344c8 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -19,7 +19,7 @@ # This is necessary to preserve any data in these directories that was # generated during the build of ParallelCluster AMIs after converting to # shared storage and backed up to a temporary location previously - # Before removing the backup, ensure the data in the new directory is the same + # Before removing the backup, ensure the new data in the new directory is the same # as the original to avoid any data loss or inconsistency. This is done # by using rsync to copy the data and diff to check for differences. # Remove the backup after the copy is done and the data integrity is verified. From 4ceb301e6357ea6c2bf552c55bf199f3e5bee214 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 13:04:07 -0400 Subject: [PATCH 10/13] Minor changes on comments to avoid any confusion --- .../recipes/init/config_default_user_home.rb | 1 + .../spec/unit/recipes/config_default_user_home_spec.rb | 1 + 2 files changed, 2 insertions(+) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index cf5643804..955a50da0 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -66,6 +66,7 @@ # 1. Verifies data integrity by comparing the original home directory and the new local home directory. # 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory. # 3. If the data integrity check fails, it outputs an error message and exits with an error code 1. +# To avoid any confusion, ['cluster_user_home'] is the original dir, ['cluster_user_local_home'] is the new replaced dir bash "Verify data integrity for #{node['cluster']['cluster_user_home']}" do user 'root' group 'root' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb index 6931fbd8b..fe80ada92 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb @@ -24,6 +24,7 @@ end it 'moves the cluster user home directory with data integrity check' do + # To avoid any confusion, user_home is the original dir, user_local_home is the new replaced dir user_home = "/home/user" user_local_home = "/local/home/user" expect(chef_run).to run_bash("Verify data integrity for #{user_home}").with( From c12e053109641a837bb80b0641942acb5a8d0eca Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 13:17:28 -0400 Subject: [PATCH 11/13] Minor changes on comments and variable names. --- .../recipes/init/config_default_user_home.rb | 4 ++-- .../recipes/config_default_user_home_spec.rb | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index 955a50da0..9116262c2 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -66,8 +66,8 @@ # 1. Verifies data integrity by comparing the original home directory and the new local home directory. # 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory. # 3. If the data integrity check fails, it outputs an error message and exits with an error code 1. -# To avoid any confusion, ['cluster_user_home'] is the original dir, ['cluster_user_local_home'] is the new replaced dir -bash "Verify data integrity for #{node['cluster']['cluster_user_home']}" do +# To avoid any confusion, ['cluster_user_home'] is the original dir, ['cluster_user_local_home'] is the destination dir. +bash "Verify data integrity for #{node['cluster']['cluster_user_local_home']}" do user 'root' group 'root' code <<-EOH diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb index fe80ada92..76af46035 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb @@ -24,18 +24,18 @@ end it 'moves the cluster user home directory with data integrity check' do - # To avoid any confusion, user_home is the original dir, user_local_home is the new replaced dir - user_home = "/home/user" - user_local_home = "/local/home/user" - expect(chef_run).to run_bash("Verify data integrity for #{user_home}").with( + # To avoid any confusion, user_home is the original dir, user_local_home is the destination dir. + original_user_home = "/home/user" + destination_user_local_home = "/local/home/user" + expect(chef_run).to run_bash("Verify data integrity for #{destination_user_local_home}").with( code: <<-CODE - diff_output=$(diff -r #{user_home} #{user_local_home}) - if [[ $diff_output != *"Only in #{user_home}"* ]]; then - rm -rf /tmp#{user_home} - rm -rf #{user_home} + diff_output=$(diff -r #{original_user_home} #{destination_user_local_home}) + if [[ $diff_output != *"Only in #{original_user_home}"* ]]; then + rm -rf /tmp#{original_user_home} + rm -rf #{original_user_home} else - only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{user_home}") - echo "Data integrity check failed comparing #{user_local_home} and #{user_home}. Differences:" + only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{original_user_home}") + echo "Data integrity check failed comparing #{destination_user_local_home} and #{original_user_home}. Differences:" echo "$only_in_cluster_user_home" systemctl start sshd exit 1 From 8a5a40768098e3022ea8a4d47eef61f6857a99dd Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 13:36:58 -0400 Subject: [PATCH 12/13] Remove the original cluster_user_home deletion logic to avoid any file system data loss risks. --- .../recipes/init/config_default_user_home.rb | 4 ++-- .../spec/unit/recipes/config_default_user_home_spec.rb | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index 9116262c2..2ab529d47 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -62,11 +62,12 @@ EOH end -# Data integrity check and cleanup for temporary backup and original home directory +# Data integrity check and cleanup for temporary backup directory # 1. Verifies data integrity by comparing the original home directory and the new local home directory. # 2. If the data integrity check passes, it removes both the temporary backup directory and the original home directory. # 3. If the data integrity check fails, it outputs an error message and exits with an error code 1. # To avoid any confusion, ['cluster_user_home'] is the original dir, ['cluster_user_local_home'] is the destination dir. +# To avoid any potential file system data loss risks, we decided to keep node['cluster']['cluster_user_home']. bash "Verify data integrity for #{node['cluster']['cluster_user_local_home']}" do user 'root' group 'root' @@ -74,7 +75,6 @@ diff_output=$(diff -r #{node['cluster']['cluster_user_home']} #{node['cluster']['cluster_user_local_home']}) if [[ $diff_output != *"Only in #{node['cluster']['cluster_user_home']}"* ]]; then rm -rf /tmp#{node['cluster']['cluster_user_home']} - rm -rf #{node['cluster']['cluster_user_home']} else only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{node['cluster']['cluster_user_home']}") echo "Data integrity check failed comparing #{node['cluster']['cluster_user_local_home']} and #{node['cluster']['cluster_user_home']}. Differences:" diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb index 76af46035..5711544d1 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb @@ -32,7 +32,6 @@ diff_output=$(diff -r #{original_user_home} #{destination_user_local_home}) if [[ $diff_output != *"Only in #{original_user_home}"* ]]; then rm -rf /tmp#{original_user_home} - rm -rf #{original_user_home} else only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{original_user_home}") echo "Data integrity check failed comparing #{destination_user_local_home} and #{original_user_home}. Differences:" From bc4d7f88d07e3a750a6a7359780b6c9ca2f93d47 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 11:11:40 -0400 Subject: [PATCH 13/13] Add a new echo message when data integrity check succeeded --- .../recipes/init/config_default_user_home.rb | 1 + .../recipes/init/restore_home_shared_data.rb | 1 + .../recipes/init/restore_internal_use_shared_data.rb | 3 ++- .../spec/unit/recipes/config_default_user_home_spec.rb | 1 + .../spec/unit/recipes/mount_internal_use_efs_spec.rb | 3 ++- 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb index 2ab529d47..39d1cf587 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/config_default_user_home.rb @@ -74,6 +74,7 @@ code <<-EOH diff_output=$(diff -r #{node['cluster']['cluster_user_home']} #{node['cluster']['cluster_user_local_home']}) if [[ $diff_output != *"Only in #{node['cluster']['cluster_user_home']}"* ]]; then + echo "Data integrity check succeeded, removing temporary directory /tmp#{node['cluster']['cluster_user_home']}" rm -rf /tmp#{node['cluster']['cluster_user_home']} else only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{node['cluster']['cluster_user_home']}") diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb index eca904b6c..ab694587b 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_home_shared_data.rb @@ -30,6 +30,7 @@ rsync -a --ignore-existing /tmp/home/ /home diff_output=$(diff -r /tmp/home/ /home) if [[ $diff_output != *"Only in /tmp/home"* ]]; then + echo "Data integrity check succeeded, removing temporary directory /tmp/home" rm -rf /tmp/home/ else only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp/home") diff --git a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb index 11c1344c8..5351d0274 100644 --- a/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb +++ b/cookbooks/aws-parallelcluster-environment/recipes/init/restore_internal_use_shared_data.rb @@ -31,7 +31,8 @@ rsync -a --ignore-existing /tmp#{dir}/ #{dir} diff_output=$(diff -r /tmp#{dir}/ #{dir}) if [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then - rm -rf /tmp#{dir}/ + echo "Data integrity check succeeded, removing temporary directory /tmp#{dir}" + rm -rf /tmp#{dir} else only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp#{dir}") echo "Data integrity check failed comparing #{dir} and /tmp#{dir}. Differences:" diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb index 5711544d1..141baed4e 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/config_default_user_home_spec.rb @@ -31,6 +31,7 @@ code: <<-CODE diff_output=$(diff -r #{original_user_home} #{destination_user_local_home}) if [[ $diff_output != *"Only in #{original_user_home}"* ]]; then + echo "Data integrity check succeeded, removing temporary directory /tmp#{original_user_home}" rm -rf /tmp#{original_user_home} else only_in_cluster_user_home=$(echo "$diff_output" | grep "Only in #{original_user_home}") diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb index c417d351b..17e690de5 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/recipes/mount_internal_use_efs_spec.rb @@ -38,7 +38,8 @@ rsync -a --ignore-existing /tmp#{dir}/ #{dir} diff_output=$(diff -r /tmp#{dir}/ #{dir}) if [[ $diff_output != *"Only in /tmp#{dir}"* ]]; then - rm -rf /tmp#{dir}/ + echo "Data integrity check succeeded, removing temporary directory /tmp#{dir}" + rm -rf /tmp#{dir} else only_in_tmp=$(echo "$diff_output" | grep "Only in /tmp#{dir}") echo "Data integrity check failed comparing #{dir} and /tmp#{dir}. Differences:"