Skip to content

Commit

Permalink
[BuildImage] Download build image dependencies from s3 bucket (#2761)
Browse files Browse the repository at this point in the history
* Retrieve dependencies from an S3 bucket rather than from the open internet

* Modify spec test to expect dependencies to be retrieved from an s3 bucket

* Get python dependency using https rather than aws cli

* Point s3 url for dependency download to production bucket

* Fix fabric manager install to install from a local package

* Update changelog to include changes to build-image

---------

Co-authored-by: Hanwen <[email protected]>
  • Loading branch information
hgreebe and hanwen-pcluste authored Jun 17, 2024
1 parent 0079b13 commit a77217b
Show file tree
Hide file tree
Showing 51 changed files with 410 additions and 211 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste

**ENHANCEMENTS**
- Add support for external Slurmdbd.
- Allow build-image to be run in an isolated network.

**CHANGES**
- Upgrade Cinc Client to version 18.4.12 from 18.2.7.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,25 @@
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

# Download the pre-packaged Python dependencies for the awsbatch virtualenv from the
# cluster artifacts location (one archive per CPU architecture).
awsbatch_deps_archive = 'awsbatch-dependencies.tgz'

remote_file "#{node['cluster']['base_dir']}/#{awsbatch_deps_archive}" do
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{awsbatch_deps_archive}"
  mode '0644'
  # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
  retries 3
  retry_delay 5
  # An archive that is already on disk is kept as-is rather than downloaded again.
  action :create_if_missing
end

# Unpack the archive and install every package it contains into the virtualenv.
# `--no-index` together with `-f ./` makes pip resolve packages only from the unpacked directory.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf #{awsbatch_deps_archive}
    cd awsbatch
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end

# Record the virtualenv location as a node attribute and dump the node attributes.
node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path
node_attributes "dump node attributes"
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
mkdir aws-parallelcluster-awsbatch-cli
tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli
cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-*
#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/
CLI
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@
# TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the
# bash block to a recipe that uses the pyenv resource.

# Download the pre-packaged Python dependencies used by the custom aws-parallelcluster-node
# installation into Chef's file cache directory.
node_deps_archive = 'node-dependencies.tgz'

remote_file "#{Chef::Config[:file_cache_path]}/#{node_deps_archive}" do
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{node_deps_archive}"
  mode '0644'
  # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
  retries 3
  retry_delay 5
  # An archive that is already on disk is kept as-is rather than downloaded again.
  action :create_if_missing
end

bash "install custom aws-parallelcluster-node" do
cwd Chef::Config[:file_cache_path]
code <<-NODE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@
# Install the aws-parallelcluster-node package into the virtualenv.
# A custom node package is handled by its own recipe; otherwise the pinned official
# release is installed with the virtualenv's pip.
# (The superseded `pyenv_pip` resource header was left behind without its `end`,
# which made this conditional unparsable; only the `execute` resource remains.)
if is_custom_node?
  include_recipe 'aws-parallelcluster-computefleet::custom_parallelcluster_node'
else
  execute "install official aws-parallelcluster-node" do
    command "#{virtualenv_path}/bin/pip install aws-parallelcluster-node==#{node['cluster']['parallelcluster-node-version']}"
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@
end

# The recipe installs the official node package through an `execute` resource,
# so the former `pyenv_pip` expectation no longer applies and has been dropped.
it 'installs official node package' do
  is_expected.to run_execute('install official aws-parallelcluster-node')
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,26 @@
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

# Download the pre-packaged Python dependencies for the cfn-bootstrap virtualenv from the
# cluster artifacts location (one archive per CPU architecture).
cfn_deps_archive = 'cfn-dependencies.tgz'

remote_file "#{node['cluster']['base_dir']}/#{cfn_deps_archive}" do
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{cfn_deps_archive}"
  mode '0644'
  # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
  retries 3
  retry_delay 5
  # An archive that is already on disk is kept as-is rather than downloaded again.
  action :create_if_missing
end

# Unpack the archive and install every package it contains into the virtualenv.
# `--no-index` together with `-f ./` makes pip resolve packages only from the unpacked directory.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf #{cfn_deps_archive}
    cd cfn
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end

# Release of aws-cfn-bootstrap to install and the matching tarball file name.
cfnbootstrap_version = '2.0-28'
cfnbootstrap_package = "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
)
end

# The dependency bundle must be installed by the 'pip install' bash resource,
# running as root from the cluster base directory.
it 'installs python packages' do
  expected_properties = {
    cwd: "#{node['cluster']['base_dir']}",
    group: 'root',
    user: 'root',
  }

  is_expected.to run_bash("pip install").with(expected_properties)
end

it 'sets virtualenv path' do
expect(node.default['cluster']['cfn_bootstrap_virtualenv_path']).to eq(virtualenv_path)
is_expected.to write_node_attributes('dump node attributes')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,25 @@
# Create the cookbook virtualenv unless its activate script is already present.
activate_virtual_env cookbook_virtualenv_name do
  python_version cookbook_python_version
  pyenv_path cookbook_virtualenv_path
  # NOTE(review): the hunk header for this change (6 old lines, 25 new lines) suggests this
  # property is the line the commit removed - confirm whether it should still be here.
  requirements_path "cookbook_virtualenv/requirements.txt"
  not_if do
    ::File.exist?("#{cookbook_virtualenv_path}/bin/activate")
  end
end

# Download the pre-packaged Python dependencies for the cookbook virtualenv from the
# cluster artifacts location (one archive per CPU architecture).
cookbook_deps_archive = 'cookbook-dependencies.tgz'

remote_file "#{node['cluster']['base_dir']}/#{cookbook_deps_archive}" do
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{cookbook_deps_archive}"
  mode '0644'
  # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
  retries 3
  retry_delay 5
  # An archive that is already on disk is kept as-is rather than downloaded again.
  action :create_if_missing
end

# Unpack the archive and install every package it contains into the virtualenv.
# `--no-index` together with `-f ./` makes pip resolve packages only from the unpacked directory.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf #{cookbook_deps_archive}
    cd dependencies
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
# Full CUDA release string, e.g. "<version>.<patch>".
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
# Suffix that is part of the runfile installer's file name.
cuda_version_suffix = '535.104.05'
# The installer file name differs between ARM and non-ARM instances.
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
# The installer and the samples are fetched from the cluster artifacts location.
# The earlier assignments pointing at the public NVIDIA / GitHub URLs were immediately
# overwritten and have been removed.
cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
cuda_samples_version = '12.2'
cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz"
# Local download targets.
tmp_cuda_run = '/tmp/cuda.run'
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# Where Intel MPI ends up on disk, and the offline installer used to put it there.
intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}"
intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh"
intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}"
# The installer is fetched from the cluster artifacts location. The earlier assignment
# built from the region and AWS domain was immediately overwritten and has been removed.
intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}"
intelmpi_qt_version = '6.4.2'

# Prerequisite for module install
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@
armpl_tarball_name = "arm-performance-libraries_#{armpl_version}_#{armpl_platform}_gcc-#{gcc_major_minor_version}.tar"

# URL of the ArmPL tarball under the cluster artifacts location.
# The list previously also carried the old regional bucket host and `archives/armpl/...`
# path, which produced a malformed URL once joined; only the current parts are kept.
armpl_url = %W(
  #{node['cluster']['artifacts_s3_url']}
  armpl/#{armpl_platform}
  #{armpl_tarball_name}
).join('/')

Expand Down Expand Up @@ -111,7 +111,7 @@
end

# GCC release to build, where to download its source tarball from, and where to store it.
gcc_version = "#{gcc_major_minor_version}.#{new_resource.gcc_patch_version}"
# The tarball is fetched from the cluster artifacts location. The earlier assignment
# pointing at ftp.gnu.org was immediately overwritten and has been removed.
gcc_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/gcc/gcc-#{gcc_version}.tar.gz"
gcc_tarball = "#{new_resource.sources_dir}/gcc-#{gcc_version}.tar.gz"

# Get gcc tarball
Expand All @@ -137,7 +137,7 @@
tar -xf #{gcc_tarball}
cd gcc-#{gcc_version}
# Patch the download_prerequisites script to download over https and not ftp. This works better in China regions.
sed -i "s#ftp://gcc\.gnu\.org#https://gcc.gnu.org#g" ./contrib/download_prerequisites
sed -i "s#ftp://gcc\.gnu\.org/pub/gcc/infrastructure##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites
./contrib/download_prerequisites
mkdir build && cd build
../configure --prefix=/opt/arm/armpl/gcc/#{gcc_version} --disable-bootstrap --enable-checking=release --enable-languages=c,c++,fortran --disable-multilib
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def dcv_gpu_accel_supported?
end

# URL of the DCV server tarball under the cluster artifacts location.
# The CloudFront URL that used to precede the return value was an unused expression
# and has been removed.
#
# @return [String]
def dcv_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/dcv/#{dcv_package}.tgz"
end

def dcv_tarball
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Fabric Manager version; simply the value of `_nvidia_driver_version`.
def fabric_manager_version = _nvidia_driver_version

# Platform label for this OS. Presumably the directory segment used when building the
# Fabric Manager artifact URL - confirm against the shared partial.
def platform = 'rhel9'
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,7 @@ def fabric_manager_package
# Fabric Manager version; simply the value of `_nvidia_driver_version`.
def fabric_manager_version = _nvidia_driver_version

# Platform label for this OS. Presumably the directory segment used when building the
# Fabric Manager artifact URL - confirm against the shared partial.
def platform = 'rhel7'
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Fabric Manager version; simply the value of `_nvidia_driver_version`.
def fabric_manager_version = _nvidia_driver_version

# Platform label for this OS. Presumably the directory segment used when building the
# Fabric Manager artifact URL - confirm against the shared partial.
def platform = 'rhel7'
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Fabric Manager version; simply the value of `_nvidia_driver_version`.
def fabric_manager_version
  _nvidia_driver_version
end

# Platform label built from the integer part of the node's platform version,
# e.g. platform_version "8.9" gives "rhel8".
def platform
  major_release = node['platform_version'].to_i
  "rhel#{major_release}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Fabric Manager version; simply the value of `_nvidia_driver_version`.
def fabric_manager_version
  _nvidia_driver_version
end

# Platform label built from the integer part of the node's platform version,
# e.g. platform_version "8.9" gives "rhel8".
def platform
  major_release = node['platform_version'].to_i
  "rhel#{major_release}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,9 @@ def fabric_manager_package
end

# Fabric Manager version as a plain string, without a trailing wildcard.
# The wildcard form that used to precede the return value was an unused expression
# and has been removed.
#
# @return [String]
def fabric_manager_version
  _nvidia_driver_version.to_s
end

# Platform label built from the node's platform version with the dots removed,
# e.g. platform_version "22.04" gives "ubuntu2204".
#
# @return [String]
def platform
  "ubuntu#{node['platform_version'].delete('.')}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,7 @@
node.default['cluster']['nvidia']['fabricmanager']['version'] = fabric_manager_version
node_attributes "dump node attributes"

# Add NVIDIA repo for fabric manager and datacenter-gpu-manager
nvidia_repo 'add nvidia repository' do
action :add
end

action_install_package

nvidia_repo 'remove nvidia repository' do
action :remove
end
end

action :configure do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,30 @@

# Install NVIDIA Fabric Manager on Ubuntu from a .deb downloaded from the cluster
# artifacts location, then put the package on hold.
# The leftover header of the former repository-based `execute` resource had no `end`
# and made this action unparsable; it has been removed.
action :install_package do
  deb_file = "#{fabric_manager_package}-#{fabric_manager_version}.deb"

  remote_file "#{node['cluster']['sources_dir']}/#{deb_file}" do
    source fabric_manager_url
    mode '0644'
    # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
    retries 3
    retry_delay 5
    # A package file that is already on disk is kept as-is rather than downloaded again.
    action :create_if_missing
  end

  bash "install_fabricmanager_for_ubuntu" do
    user 'root'
    cwd node['cluster']['sources_dir']
    code <<~FABRIC_MANAGER
      set -e
      dpkg -i #{deb_file} && apt-mark hold #{fabric_manager_package}
    FABRIC_MANAGER
    retries 3
    retry_delay 5
  end
end

# Architecture suffix used in the .deb file name: 'arm64' on ARM instances, 'amd64' otherwise.
def arch_suffix
  return 'arm64' if arm_instance?

  'amd64'
end

# URL of the Fabric Manager .deb under the cluster artifacts location:
# <artifacts_s3_url>/dependencies/nvidia_fabric/<platform>/<package>_<version>-1_<arch>.deb
def fabric_manager_url
  deb_name = "#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
  [node['cluster']['artifacts_s3_url'], 'dependencies', 'nvidia_fabric', platform, deb_name].join('/')
end
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,32 @@
# See the License for the specific language governing permissions and limitations under the License.

# Install NVIDIA Fabric Manager from an .rpm downloaded from the cluster artifacts
# location, then lock the installed version.
# The former repository-based `yum install <package>-<version>` line was still executed
# ahead of the local install; only the install from the downloaded .rpm is kept.
action :install_package do
  rpm_file = "#{fabric_manager_package}-#{fabric_manager_version}.rpm"

  remote_file "#{node['cluster']['sources_dir']}/#{rpm_file}" do
    source fabric_manager_url
    mode '0644'
    # Retry a failed download up to 3 times, waiting 5 seconds between attempts.
    retries 3
    retry_delay 5
    # A package file that is already on disk is kept as-is rather than downloaded again.
    action :create_if_missing
  end

  # Provides the `yum versionlock` sub-command used below.
  package 'yum-plugin-versionlock'

  bash "Install #{fabric_manager_package}" do
    user 'root'
    cwd node['cluster']['sources_dir']
    code <<~FABRIC_MANAGER_INSTALL
      set -e
      yum install -y #{rpm_file}
      yum versionlock #{fabric_manager_package}
    FABRIC_MANAGER_INSTALL
    retries 3
    retry_delay 5
  end
end

# Architecture suffix used in the .rpm file name: 'aarch64' on ARM instances, 'x86_64' otherwise.
def arch_suffix
  return 'aarch64' if arm_instance?

  'x86_64'
end

# URL of the Fabric Manager .rpm under the cluster artifacts location:
# <artifacts_s3_url>/dependencies/nvidia_fabric/<platform>/<package>-<version>-1.<arch>.rpm
def fabric_manager_url
  rpm_name = "#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
  [node['cluster']['artifacts_s3_url'], 'dependencies', 'nvidia_fabric', platform, rpm_name].join('/')
end
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,5 @@ def gdrcopy_version_extended
end

# URL of the GDRCopy source tarball under the cluster artifacts location.
# The GitHub URL that used to precede the return value was an unused expression
# and has been removed.
#
# @return [String]
def gdrcopy_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz"
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# DCGM is enabled exactly when NVIDIA support is enabled.
def _nvidia_dcgm_enabled = _nvidia_enabled

# Platform label for this OS. Presumably consumed by the shared rhel DCGM partial -
# confirm against that partial.
def platform = 'rhel9'
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
provides :nvidia_dcgm, platform: 'amazon', platform_version: '2'

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# DCGM is never enabled on ARM instances; elsewhere it follows `_nvidia_enabled`.
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Platform label for this OS. Presumably consumed by the shared rhel DCGM partial -
# confirm against that partial.
def platform
  'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# DCGM is never enabled on ARM instances; elsewhere it follows `_nvidia_enabled`.
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Platform label for this OS. Presumably consumed by the shared rhel DCGM partial -
# confirm against that partial.
def platform
  'rhel7'
end
Loading

0 comments on commit a77217b

Please sign in to comment.