diff --git a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb index 936a15edd9..5a702b9782 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/recipes/awsbatch_virtualenv.rb @@ -25,7 +25,6 @@ activate_virtual_env virtualenv_name do pyenv_path virtualenv_path python_version python_version - not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb index 063acb04b4..3b831f0bd5 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/install/parallelcluster_node.rb @@ -30,7 +30,6 @@ activate_virtual_env node_virtualenv_name do pyenv_path node_virtualenv_path python_version node_python_version - not_if { ::File.exist?("#{virtualenv_path}/bin/activate") } end if is_custom_node? diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb index 9f6fb6aacf..db78a5647c 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/awscli.rb @@ -34,6 +34,10 @@ overwrite true end +alinux_extras_topic 'python 3.8' do + topic 'python3.8' +end + bash 'install awscli' do - code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" + code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws" end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb index afca952e0b..fdf4fe2202 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cookbook_virtualenv.rb @@ -21,9 +21,36 @@ install_pyenv 'pyenv for default python version' +# alinux_extras_topic 'python 3.8' do +# topic 'python3.8' +# end + activate_virtual_env cookbook_virtualenv_name do pyenv_path cookbook_virtualenv_path python_version cookbook_python_version - requirements_path "cookbook_virtualenv/requirements.txt" - not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } end + +cookbook_file "#{virtualenv_path}/requirements.txt" do + source "cookbook_virtualenv/requirements.txt" + mode '0755' +end + +bash 'pip install' do + user 'root' + group 'root' + cwd "#{node['cluster']['base_dir']}" + code <<-REQ + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/PyPi/dependencies.tar.gz dependencies.tar.gz + tar xzf dependencies.tar.gz + cd dependencies + #{virtualenv_path}/bin/pip install * -f ./ --no-index + REQ +end + +# activate_virtual_env cookbook_virtualenv_name do +# pyenv_path cookbook_virtualenv_path +# python_version cookbook_python_version +# requirements_path "cookbook_virtualenv/requirements.txt" +# # not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") } +# end diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index 65c366b649..99ba89e80d 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -35,12 +35,25 @@ node_attributes 'Save cuda and cuda samples versions for InSpec tests' # Get CUDA run file -remote_file tmp_cuda_run do - source cuda_url - mode '0755' - retries 3 - retry_delay 5 - not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}") } +# remote_file tmp_cuda_run do +# source cuda_url +# mode '0755' +# retries 3 +# retry_delay 5 +# not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}") } +# end + +bash 'get cuda and cuda samples from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-CUDA + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run #{tmp_cuda_run} + chmod 755 #{tmp_cuda_run} + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz #{tmp_cuda_sample_archive} + chmod 644 #{tmp_cuda_sample_archive} + CUDA end # Install CUDA driver @@ -57,13 +70,13 @@ end # Get CUDA Sample Files -remote_file tmp_cuda_sample_archive do - source cuda_samples_url - mode '0644' - retries 3 - retry_delay 5 - not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}/samples") } -end +# remote_file tmp_cuda_sample_archive do +# source cuda_samples_url +# mode '0644' +# retries 3 +# retry_delay 5 +# not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}/samples") } +# end # Unpack CUDA Samples bash 'cuda.sample install' do diff --git a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb index 284d86179f..0cb1edfe5e 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/arm_pl/partial/_arm_pl_common.rb @@ -115,13 +115,23 @@ gcc_tarball = "#{new_resource.sources_dir}/gcc-#{gcc_version}.tar.gz" # Get gcc tarball - remote_file gcc_tarball do - source gcc_url - mode '0644' - retries 5 - retry_delay 10 - ssl_verify_mode :verify_none - action :create_if_missing + # remote_file gcc_tarball do + # source gcc_url + # mode '0644' + # retries 5 + # retry_delay 10 + # ssl_verify_mode :verify_none + # action :create_if_missing + # end + + bash 'get gcc from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-GCC + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/gcc/gcc-#{gcc_version}.tar.gz #{gcc_tarball} + GCC end # Install gcc diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb index 5856bebff4..375dcb02ce 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb @@ -24,3 +24,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + 'rhel7' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb index 5fcddd3761..e66bea4c2d 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_centos7.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + 'rhel7' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb index 1eb5216da0..223cabaf89 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_redhat8.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb index 8d12f10331..c0d76676c2 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_rocky8.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version _nvidia_driver_version end + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb index c01265485d..ac6de0b145 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb @@ -26,3 +26,7 @@ def fabric_manager_package def fabric_manager_version "#{_nvidia_driver_version}*" end + +def platform + "ubuntu#{node['platform_version'].delete('.')}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index bf1c45750b..7cdc7e203b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -27,15 +27,15 @@ node_attributes "dump node attributes" # Add NVIDIA repo for fabric manager and datacenter-gpu-manager - nvidia_repo 'add nvidia repository' do - action :add - end + # nvidia_repo 'add nvidia repository' do + # action :add + # end action_install_package - nvidia_repo 'remove nvidia repository' do - action :remove - end + # nvidia_repo 'remove nvidia repository' do + # action :remove + # end end action :configure do diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb index 79629f4998..5157d3ba0b 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb @@ -15,9 +15,27 @@ action :install_package do # For ubuntu, CINC17 apt-package resources need full versions for `version` execute "install_fabricmanager_for_ubuntu" do - command "apt -y install #{fabric_manager_package}=#{fabric_manager_version} "\ + bash "Install #{fabric_manager_package}" do + user 'root' + code <<-FABRIC_MANAGER + set -e + aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb + FABRIC_MANAGER + retries 3 + retry_delay 5 + end + + command "apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb "\ "&& apt-mark hold #{fabric_manager_package}" retries 3 retry_delay 5 end end + +def arch_suffix + arm_instance? ? 'arm64' : 'amd64' +end + +def fabric_manager_url + "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb index 4339622f4b..5e3e1b82b8 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb @@ -18,10 +18,19 @@ user 'root' code <<-FABRIC_MANAGER_INSTALL set -e - yum install -y #{fabric_manager_package}-#{fabric_manager_version} + aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm + yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm yum versionlock #{fabric_manager_package} FABRIC_MANAGER_INSTALL retries 3 retry_delay 5 end end + +def arch_suffix + arm_instance? ? 'aarch64' : 'x86_64' +end + +def fabric_manager_url + "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb index db58f83591..a4544625a1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/gdrcopy/partial/_gdrcopy_common.rb @@ -38,13 +38,23 @@ def gdrcopy_checksum recursive true end - remote_file gdrcopy_tarball do - source gdrcopy_url - mode '0644' - retries 3 - retry_delay 5 - checksum gdrcopy_checksum - action :create_if_missing + # remote_file gdrcopy_tarball do + # source gdrcopy_url + # mode '0644' + # retries 3 + # retry_delay 5 + # checksum gdrcopy_checksum + # action :create_if_missing + # end + + bash 'get gdrcopy from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-GDR + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz #{gdrcopy_tarball} + GDR end package_repos 'update package repos' do diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb index 261128cb3b..293fc1bc78 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_amazon2.rb @@ -15,7 +15,12 @@ provides :nvidia_dcgm, platform: 'amazon', platform_version: '2' use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled !arm_instance? && _nvidia_enabled end + +def platform + 'rhel7' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb index 00d5c18ea7..2170999305 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_centos7.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled !arm_instance? && _nvidia_enabled end + +def platform + 'rhel7' +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb index 88a2e98e71..247ef2fcad 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_redhat8.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled _nvidia_enabled end + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb index b56aa2cf5b..f8630ef2da 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_rocky8.rb @@ -17,7 +17,12 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_rhel.rb' def _nvidia_dcgm_enabled _nvidia_enabled end + +def platform + "rhel#{node['platform_version'].to_i}" +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb index 520c655e37..e4e54bb40c 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/nvidia_dcgm_ubuntu20+.rb @@ -17,11 +17,16 @@ end use 'partial/_nvidia_dcgm_common.rb' +use 'partial/_nvidia_dcgm_debian.rb' def _nvidia_dcgm_enabled _nvidia_enabled end -def package_version - "1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver -end +# def package_version +# "1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver +# end + +def platform + "ubuntu#{node['platform_version'].delete('.')}" +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb deleted file mode 100644 index 8543561c23..0000000000 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_alinux2_centos7.rb +++ /dev/null @@ -1,19 +0,0 @@ -# frozen_string_literal: true -# -# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. - -action :setup do - return if arm_instance? || !_nvidia_enabled - - action_install_package -end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb index b02b5476f9..795d90a6cb 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb @@ -21,25 +21,23 @@ return unless _nvidia_dcgm_enabled # Add NVIDIA repo for fabric manager and datacenter-gpu-manager - nvidia_repo 'add nvidia repository' do - action :add - end - - package 'datacenter-gpu-manager' do - retries 3 - retry_delay 5 - version package_version - end - - nvidia_repo 'remove nvidia repository' do - action :remove - end + # nvidia_repo 'add nvidia repository' do + # action :add + # end + + # package 'datacenter-gpu-manager' do + # retries 3 + # retry_delay 5 + # version package_version + # end + + action_install_package + + # nvidia_repo 'remove nvidia repository' do + # action :remove + # end end def _nvidia_enabled nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled -end - -def package_version - node['cluster']['nvidia']['dcgm_version'] -end +end \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb new file mode 100644 index 0000000000..97b54300cb --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true +# +# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +action :install_package do + + bash "Install #{dcgm_package}" do + user 'root' + code <<-DCGM_INSTALL + set -e + aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb + DCGM_INSTALL + retries 3 + retry_delay 5 + end + + command "apt -y install #{dcgm_package}-#{package_version}.deb " + retries 3 + retry_delay 5 + +end + +def dcgm_url + "s3://hgreebe-dependencies/archives/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" +end + +def dcgm_package + 'datacenter-gpu-manager' +end + +def arch_suffix + arm_instance? ? 'arm64' : 'amd64' +end + +def package_version + node['cluster']['nvidia']['dcgm_version'] +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb new file mode 100644 index 0000000000..2aaf01bb3c --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true +# +# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +action :install_package do + + bash "Install #{dcgm_package}" do + user 'root' + code <<-DCGM_INSTALL + set -e + aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.rpm + yum install -y #{dcgm_package}-#{package_version}.rpm + DCGM_INSTALL + retries 3 + retry_delay 5 + end + +end + +def dcgm_url + "s3://hgreebe-dependencies/archives/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" +end + +def dcgm_package + 'datacenter-gpu-manager' +end + +def arch_suffix + arm_instance? ? 'aarch64' : 'x86_64' +end + +def package_version + node['cluster']['nvidia']['dcgm_version'] +end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb index fe6f149402..8ecd0c1284 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb @@ -27,11 +27,23 @@ node.default['cluster']['nvidia']['driver_version'] = _nvidia_driver_version node_attributes "Save Nvidia driver version for Inspec tests" - remote_file tmp_nvidia_run do - source nvidia_driver_url - mode '0755' - retries 3 - retry_delay 5 + # remote_file tmp_nvidia_run do + # source nvidia_driver_url + # mode '0755' + # retries 3 + # retry_delay 5 + # not_if { ::File.exist?('/usr/bin/nvidia-smi') } + # end + + bash 'get nvidia driver from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-NVIDIA + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{_nvidia_driver_version}.run #{tmp_nvidia_run} + chmod 755 #{tmp_nvidia_run} + NVIDIA not_if { ::File.exist?('/usr/bin/nvidia-smi') } end diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 9192895e9c..87ca20c00d 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -1,5 +1,6 @@ # Python Version default['cluster']['python-version'] = '3.9.17' +default['cluster']['python-major-minor-version'] = '3.9' # ParallelCluster versions default['cluster']['parallelcluster-version'] = '3.10.0' diff --git a/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb b/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb index 22da62c1c2..32b29cdb14 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/activate_virtual_env.rb @@ -15,29 +15,14 @@ default_action :run action :run do - pyenv_script "pyenv virtualenv #{new_resource.pyenv_name}" do - code "pyenv virtualenv #{new_resource.python_version} #{new_resource.pyenv_name}" - user new_resource.user if new_resource.user - end - - pyenv_pip "pip" do - virtualenv new_resource.pyenv_path - user new_resource.user if new_resource.user - action :upgrade - end - - unless new_resource.requirements_path.empty? - # Copy requirements file - cookbook_file "#{new_resource.pyenv_path}/requirements.txt" do - source new_resource.requirements_path - mode '0755' - end - - # Install given requirements in the virtual environment - pyenv_pip "#{new_resource.pyenv_path}/requirements.txt" do - virtualenv new_resource.pyenv_path - user new_resource.user if new_resource.user - requirement true - end + bash 'create venv' do + user 'root' + group 'root' + cwd "#{node['cluster']['system_pyenv_root']}" + code <<-VENV + set -e + versions/#{new_resource.python_version}/bin/python#{node['cluster']['python-major-minor-version']} -m venv #{new_resource.pyenv_path} + source #{new_resource.pyenv_path}/bin/activate + VENV end end diff --git a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb index 1d9f86bded..2a8f0cea09 100644 --- a/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb +++ b/cookbooks/aws-parallelcluster-shared/resources/install_pyenv.rb @@ -13,8 +13,14 @@ default_action :run action :run do + #sudo /usr/bin/python3.8 -m venv /opt/parallelcluster/test-venv python_version = new_resource.python_version || node['cluster']['python-version'] + # alinux_extras_topic 'python 3.8' do + # topic 'python3.8' + # end + + if new_resource.user_only raise "user property is required for resource install_pyenv when user_only is set to true" unless new_resource.user @@ -29,24 +35,39 @@ recursive true end - pyenv_install 'system' do - prefix prefix + bash "install python #{python_version}" do + user 'root' + group 'root' + cwd "#{prefix}" + code <<-VENV + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/python/Python-#{python_version}.tgz Python-#{python_version}.tgz + tar -xzf Python-#{python_version}.tgz + cd Python-#{python_version} + ./configure --prefix=#{prefix}/versions/#{python_version} + make + make install + VENV end + # pyenv_install 'system' do + # prefix prefix + # end + # Remove the profile.d script that the pyenv cookbook writes. # This is done in order to avoid exposing the ParallelCluster pyenv installation to customers # on login. - file '/etc/profile.d/pyenv.sh' do - action :delete - end + # file '/etc/profile.d/pyenv.sh' do + # action :delete + # end end - pyenv_python python_version do - user new_resource.user if new_resource.user_only - end - - pyenv_plugin 'virtualenv' do - git_url 'https://github.com/pyenv/pyenv-virtualenv' - user new_resource.user if new_resource.user_only - end + # pyenv_python python_version do + # user new_resource.user if new_resource.user_only + # end + # + # pyenv_plugin 'virtualenv' do + # git_url 'https://github.com/pyenv/pyenv-virtualenv' + # user new_resource.user if new_resource.user_only + # end end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb index d75daf60cb..adde16739d 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_jwt.rb @@ -20,13 +20,23 @@ jwt_tarball = "#{node['cluster']['sources_dir']}/libjwt-#{jwt_version}.tar.gz" jwt_sha256 = 'cb2fd95123689e7d209a3a8c060e02f68341c9a5ded524c0cd881a8cd20d711f' -remote_file jwt_tarball do - source jwt_url - mode '0644' - retries 3 - retry_delay 5 - checksum jwt_sha256 - action :create_if_missing +# remote_file jwt_tarball do +# source jwt_url +# mode '0644' +# retries 3 +# retry_delay 5 +# checksum jwt_sha256 +# action :create_if_missing +# end + +bash 'get jwt from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-JWT + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/jwt/v#{jwt_version}.tar.gz #{jwt_tarball} + JWT end jwt_dependencies 'Install jwt dependencies' diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb index ad6f2d3c73..469df4700d 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_pmix.rb @@ -21,13 +21,23 @@ pmix_sha256 = node['cluster']['pmix']['sha256'] pmix_tarball = "#{node['cluster']['sources_dir']}/pmix-#{pmix_version}.tar.gz" -remote_file pmix_tarball do - source pmix_url - mode '0644' - retries 3 - retry_delay 5 - checksum pmix_sha256 - action :create_if_missing +# remote_file pmix_tarball do +# source pmix_url +# mode '0644' +# retries 3 +# retry_delay 5 +# checksum pmix_sha256 +# action :create_if_missing +# end + +bash 'get pmix from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-PMIX + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/pmix/pmix-#{pmix_version}.tar.gz #{pmix_tarball} + PMIX end bash 'Install PMIx' do diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_slurm.rb b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_slurm.rb index 656c5b33ae..74929274d6 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/install/install_slurm.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/install/install_slurm.rb @@ -38,13 +38,23 @@ include_recipe 'aws-parallelcluster-slurm::slurm_users' # Get slurm tarball -remote_file slurm_tarball do - source slurm_url - mode '0644' - retries 3 - retry_delay 5 - checksum slurm_sha256 - action :create_if_missing +# remote_file slurm_tarball do +# source slurm_url +# mode '0644' +# retries 3 +# retry_delay 5 +# checksum slurm_sha256 +# action :create_if_missing +# end + +bash 'get slurm from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-SLURM + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/slurm/#{slurm_tar_name}.tar.gz #{slurm_tarball} + SLURM end # Copy Slurm patches diff --git a/cookbooks/aws-parallelcluster-slurm/resources/munge/partial/_munge_actions.rb b/cookbooks/aws-parallelcluster-slurm/resources/munge/partial/_munge_actions.rb index b97e037eaf..cac3e8cb22 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/munge/partial/_munge_actions.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/munge/partial/_munge_actions.rb @@ -62,14 +62,23 @@ action :download_source_code do # Get munge tarball - remote_file munge_tarball do - source munge_url - mode '0644' - retries 3 - retry_delay 5 - checksum munge_sha256 - action :create_if_missing + bash 'get munge from s3' do + user 'root' + group 'root' + cwd "#{node['cluster']['sources_dir']}" + code <<-MUNGE + set -e + aws s3 cp s3://hgreebe-dependencies/archives/dependencies/munge/munge-#{munge_version}.tar.gz #{munge_tarball} + MUNGE end + # remote_file munge_tarball do + # source munge_url + # mode '0644' + # retries 3 + # retry_delay 5 + # checksum munge_sha256 + # action :create_if_missing + # end end action :compile_and_install do