From 99e0d1bddcfdf9030e94d23a41f4750b62570561 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Thu, 2 May 2024 16:25:43 -0400 Subject: [PATCH] Add a log message describing the active processes using a mount point during unmounting (#2719) This includes refactoring the init, finalize, and update entrypoints to include PATH dirs --- .../recipes/finalize.rb | 2 ++ .../recipes/init.rb | 2 ++ .../recipes/update.rb | 1 + .../spec/unit/recipes/finalize_spec.rb | 3 ++ .../spec/unit/recipes/update_spec.rb | 2 ++ .../resources/efs/partial/_mount_umount.rb | 4 +++ .../resources/file_utils.rb | 28 +++++++++++++++++++ .../lustre/partial/_mount_unmount.rb | 5 +++- .../resources/volume.rb | 5 +++- .../spec/unit/resources/efs_spec.rb | 9 +++++- .../spec/unit/resources/lustre_mount_spec.rb | 4 ++- .../spec/unit/resources/volume_spec.rb | 10 +++++++ 12 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-environment/resources/file_utils.rb diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/finalize.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/finalize.rb index 2c15e11fc..7532147bf 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/finalize.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/finalize.rb @@ -14,6 +14,8 @@ include_recipe "aws-parallelcluster-platform::enable_chef_error_handler" +include_recipe "aws-parallelcluster-shared::setup_envars" + fetch_config 'Fetch and load cluster configs' if is_custom_node? 
diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb index 538eeaf37..664228d96 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/init.rb @@ -14,6 +14,8 @@ include_recipe "aws-parallelcluster-platform::enable_chef_error_handler" +include_recipe "aws-parallelcluster-shared::setup_envars" + os_type 'Validate OS type specified by the user is the same as the OS identified by Ohai' # Validate init system diff --git a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb index 213a0e861..f69aa2453 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/recipes/update.rb @@ -11,6 +11,7 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. 
+include_recipe "aws-parallelcluster-shared::setup_envars" # Fetch and load cluster configs include_recipe 'aws-parallelcluster-platform::update' diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/finalize_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/finalize_spec.rb index ef96601ee..1564a87a1 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/finalize_spec.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/finalize_spec.rb @@ -18,6 +18,7 @@ @included_recipes = [] %w( aws-parallelcluster-platform::enable_chef_error_handler + aws-parallelcluster-shared::setup_envars aws-parallelcluster-computefleet::custom_parallelcluster_node aws-parallelcluster-platform::finalize aws-parallelcluster-slurm::finalize @@ -50,6 +51,7 @@ expected_recipes = if is_custom_node %w( aws-parallelcluster-platform::enable_chef_error_handler + aws-parallelcluster-shared::setup_envars aws-parallelcluster-computefleet::custom_parallelcluster_node aws-parallelcluster-platform::finalize aws-parallelcluster-slurm::finalize @@ -58,6 +60,7 @@ else %w( aws-parallelcluster-platform::enable_chef_error_handler + aws-parallelcluster-shared::setup_envars aws-parallelcluster-platform::finalize aws-parallelcluster-slurm::finalize aws-parallelcluster-environment::finalize diff --git a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb index 5bdc57954..c3df20518 100644 --- a/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb +++ b/cookbooks/aws-parallelcluster-entrypoints/spec/unit/recipes/update_spec.rb @@ -17,6 +17,7 @@ before do @included_recipes = [] %w( + aws-parallelcluster-shared::setup_envars aws-parallelcluster-platform::update aws-parallelcluster-environment::update aws-parallelcluster-slurm::update @@ -47,6 +48,7 @@ cached(:node) { chef_run.node } expected_recipes = %w( + 
aws-parallelcluster-shared::setup_envars aws-parallelcluster-platform::update aws-parallelcluster-environment::update aws-parallelcluster-slurm::update diff --git a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb index 29149a6a6..841b68229 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/efs/partial/_mount_umount.rb @@ -101,6 +101,10 @@ # Path needs to be fully qualified, for example "shared/temp" becomes "/shared/temp" efs_shared_dir = "/#{efs_shared_dir}" unless efs_shared_dir.start_with?('/') # Unmount EFS + file_utils "check active processes on #{efs_shared_dir}" do + file efs_shared_dir + action :check_active_processes + end execute 'unmount efs' do command "umount -fl #{efs_shared_dir}" retries 10 diff --git a/cookbooks/aws-parallelcluster-environment/resources/file_utils.rb b/cookbooks/aws-parallelcluster-environment/resources/file_utils.rb new file mode 100644 index 000000000..10c7c70c9 --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/resources/file_utils.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +# Copyright:: 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+
+provides :file_utils
+unified_mode true
+property :file, String, required: %i(check_active_processes) # path handed to `fuser -m`
+default_action :check_active_processes
+# Logs (via `fuser -mv`) the processes using the filesystem that holds `file`; informational only.
+action :check_active_processes do
+  file = new_resource.file
+  Chef::Log.info("The following processes are using #{file}")
+  execute "active processes" do
+    retries 3
+    retry_delay 3
+    timeout 10
+    live_stream true
+    command "fuser -mv #{file}"
+    returns [0, 1] # fuser exits non-zero when no process is accessing the file
+    ignore_failure true # a timeout or missing fuser must not abort the Chef run
+  end
+end
diff --git a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_mount_unmount.rb b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_mount_unmount.rb index 58ee34ee8..03d5269ac 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_mount_unmount.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_mount_unmount.rb @@ -71,7 +71,10 @@ return if on_docker? new_resource.fsx_fs_id_array.dup.each_with_index do |_fsx_fs_id, index| fsx = FSx.new(node, new_resource, index) - + file_utils "check active processes on #{fsx.shared_dir}" do + file fsx.shared_dir + action :check_active_processes + end execute "unmount fsx #{fsx.shared_dir}" do command "umount -fl #{fsx.shared_dir}" retries 10 diff --git a/cookbooks/aws-parallelcluster-environment/resources/volume.rb b/cookbooks/aws-parallelcluster-environment/resources/volume.rb index 4f166a39e..884968a53 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/volume.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/volume.rb @@ -94,7 +94,10 @@ action :unmount do shared_dir = format_directory(new_resource.shared_dir) - + file_utils "check active processes on #{shared_dir}" do + file shared_dir + action :check_active_processes + end # TODO: can we use mount resource to unmount and disable (see raid) execute "unmount volume #{shared_dir}" do command "umount -fl #{shared_dir}" diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb 
b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb index b9ffbf38e..1fc4c19ce 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efs_spec.rb @@ -95,7 +95,7 @@ def mock_already_installed(package, expected_version, installed) cd efs-utils-#{utils_version} ./build-deb.sh apt-get -y install ./build/amazon-efs-utils*deb - EFSUTILSINSTALL + EFSUTILSINSTALL end context "utils package not yet installed" do @@ -365,6 +365,13 @@ def mock_already_installed(package, expected_version, installed) is_expected.to unmount_efs('unmount') end + it 'checks active processes' do + is_expected.to check_active_processes_file_utils('check active processes on /shared_dir_1') + .with(file: '/shared_dir_1') + is_expected.to check_active_processes_file_utils('check active processes on /shared_dir_2') + .with(file: '/shared_dir_2') + end + it 'unmounts efs only if mounted' do is_expected.not_to run_execute('unmount efs') .with(command: 'umount -fl /shared_dir_1') diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_mount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_mount_spec.rb index 75db5fa1e..e0082fbb6 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_mount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/lustre_mount_spec.rb @@ -370,7 +370,9 @@ it 'unmounts fsx only if mounted' do is_expected.not_to run_execute('unmount fsx /shared_dir_1') - + is_expected.to check_active_processes_file_utils('check active processes on /shared_dir_1') + is_expected.to check_active_processes_file_utils('check active processes on /shared_dir_2') + .with(file: '/shared_dir_2') is_expected.to run_execute('unmount fsx /shared_dir_2') .with(command: "umount -fl /shared_dir_2") .with(retries: 10) diff --git 
a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/volume_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/volume_spec.rb index 03e9ac468..a160423d3 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/volume_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/volume_spec.rb @@ -142,6 +142,11 @@ is_expected.not_to run_execute('unmount volume') end + it 'checks active processes' do + is_expected.to check_active_processes_file_utils('check active processes on /SHARED_DIR') + .with(file: '/SHARED_DIR') + end + it "removes volume /SHARED_DIR from /etc/fstab" do is_expected.to edit_delete_lines("remove volume /SHARED_DIR from /etc/fstab") .with(path: "/etc/fstab") @@ -179,6 +184,11 @@ allow(Dir).to receive(:empty?).with("/SHARED_DIR").and_return(is_dir_empty) end + it 'checks active processes' do + is_expected.to check_active_processes_file_utils('check active processes on /SHARED_DIR') + .with(file: '/SHARED_DIR') + end + it 'unmounts volume' do is_expected.to unmount_volume('unmount') is_expected.to run_execute('unmount volume /SHARED_DIR')