Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BuildImage] Download build image dependencies from s3 bucket #2761

Merged
merged 7 commits into from
Jun 17, 2024
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste

**ENHANCEMENTS**
- Add support for external Slurmdbd.
- Allow build-image to be run in an isolated network.

**CHANGES**
- Upgrade Cinc Client to version 18.4.12 from 18.2.7.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,25 @@
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
hgreebe marked this conversation as resolved.
Show resolved Hide resolved
end

# Download the awsbatch Python dependency bundle from the configured artifacts URL
# into the cluster base directory. The bundle is selected by the node's kernel machine
# value, and the download is skipped when the tarball is already present.
awsbatch_bundle = 'awsbatch-dependencies.tgz'
remote_file "#{node['cluster']['base_dir']}/#{awsbatch_bundle}" do
  action :create_if_missing
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{awsbatch_bundle}"
  mode '0644'
  # Retry failed downloads up to 3 times, waiting 5 seconds between attempts.
  retry_delay 5
  retries 3
end

# Unpack the awsbatch dependency bundle and install every file it contains into the
# virtualenv. pip resolves only against the unpacked directory (-f ./ --no-index),
# so no package index is contacted.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf awsbatch-dependencies.tgz
    cd awsbatch
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end

node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path
node_attributes "dump node attributes"
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
mkdir aws-parallelcluster-awsbatch-cli
tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli
cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-*

#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/
CLI
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@
# TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the
# bash block to a recipe that uses the pyenv resource.

# Download the node-package dependency bundle from the configured artifacts URL into
# Chef's file cache. The bundle is selected by the node's kernel machine value, and
# the download is skipped when the tarball is already present.
node_bundle = 'node-dependencies.tgz'
remote_file "#{Chef::Config[:file_cache_path]}/#{node_bundle}" do
  action :create_if_missing
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{node_bundle}"
  mode '0644'
  # Retry failed downloads up to 3 times, waiting 5 seconds between attempts.
  retry_delay 5
  retries 3
end

bash "install custom aws-parallelcluster-node" do
cwd Chef::Config[:file_cache_path]
code <<-NODE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@
if is_custom_node?
include_recipe 'aws-parallelcluster-computefleet::custom_parallelcluster_node'
else
pyenv_pip 'aws-parallelcluster-node' do
version node['cluster']['parallelcluster-node-version']
virtualenv virtualenv_path
execute "install official aws-parallelcluster-node" do
command "#{virtualenv_path}/bin/pip install aws-parallelcluster-node==#{node['cluster']['parallelcluster-node-version']}"
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@
end

it 'installs official node package' do
is_expected.to install_pyenv_pip('aws-parallelcluster-node').with(
version: node_version,
virtualenv: virtualenv_path
)
is_expected.to run_execute('install official aws-parallelcluster-node')
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,26 @@
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

# Download the cfn dependency bundle from the configured artifacts URL into the
# cluster base directory. The bundle is selected by the node's kernel machine value,
# and the download is skipped when the tarball is already present.
cfn_bundle = 'cfn-dependencies.tgz'
remote_file "#{node['cluster']['base_dir']}/#{cfn_bundle}" do
  action :create_if_missing
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{cfn_bundle}"
  mode '0644'
  # Retry failed downloads up to 3 times, waiting 5 seconds between attempts.
  retry_delay 5
  retries 3
end

# Unpack the cfn dependency bundle and install every file it contains into the
# virtualenv. pip resolves only against the unpacked directory (-f ./ --no-index),
# so no package index is contacted.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf cfn-dependencies.tgz
    cd cfn
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end

cfnbootstrap_version = '2.0-28'
cfnbootstrap_package = "aws-cfn-bootstrap-py3-#{cfnbootstrap_version}.tar.gz"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
)
end

# The recipe must run the 'pip install' bash resource as root from the cluster base directory.
it 'installs python packages' do
  base_dir = node['cluster']['base_dir'].to_s
  is_expected.to run_bash("pip install").with(
    cwd: base_dir,
    user: 'root',
    group: 'root'
  )
end

it 'sets virtualenv path' do
expect(node.default['cluster']['cfn_bootstrap_virtualenv_path']).to eq(virtualenv_path)
is_expected.to write_node_attributes('dump node attributes')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,25 @@
activate_virtual_env cookbook_virtualenv_name do
pyenv_path cookbook_virtualenv_path
python_version cookbook_python_version
requirements_path "cookbook_virtualenv/requirements.txt"
not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") }
hgreebe marked this conversation as resolved.
Show resolved Hide resolved
end

# Download the cookbook virtualenv dependency bundle from the configured artifacts URL
# into the cluster base directory. The bundle is selected by the node's kernel machine
# value, and the download is skipped when the tarball is already present.
cookbook_bundle = 'cookbook-dependencies.tgz'
remote_file "#{node['cluster']['base_dir']}/#{cookbook_bundle}" do
  action :create_if_missing
  source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/#{cookbook_bundle}"
  mode '0644'
  # Retry failed downloads up to 3 times, waiting 5 seconds between attempts.
  retry_delay 5
  retries 3
end

# Unpack the cookbook dependency bundle and install every file it contains into the
# virtualenv. pip resolves only against the unpacked directory (-f ./ --no-index),
# so no package index is contacted.
# NOTE(review): the neighbouring resource uses `cookbook_virtualenv_path`; confirm that
# `virtualenv_path` is actually defined in this recipe's scope.
bash 'pip install' do
  cwd "#{node['cluster']['base_dir']}"
  user 'root'
  group 'root'
  code <<~REQ
    set -e
    tar xzf cookbook-dependencies.tgz
    cd dependencies
    #{virtualenv_path}/bin/pip install * -f ./ --no-index
  REQ
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
cuda_version_suffix = '535.104.05'
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
cuda_url = "https://developer.download.nvidia.com/compute/cuda/#{cuda_complete_version}/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
cuda_samples_version = '12.2'
cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz"
cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz"
tmp_cuda_run = '/tmp/cuda.run'
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}"
intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh"
intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}"
intelmpi_installer_url = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{aws_domain}/archives/impi/#{intelmpi_installer}"
intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}"
intelmpi_qt_version = '6.4.2'

# Prerequisite for module install
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@
armpl_tarball_name = "arm-performance-libraries_#{armpl_version}_#{armpl_platform}_gcc-#{gcc_major_minor_version}.tar"

armpl_url = %W(
https://#{new_resource.region}-aws-parallelcluster.s3.#{new_resource.region}.#{new_resource.aws_domain}
archives/armpl/#{armpl_platform}
#{node['cluster']['artifacts_s3_url']}
armpl/#{armpl_platform}
#{armpl_tarball_name}
).join('/')

Expand Down Expand Up @@ -111,7 +111,7 @@
end

gcc_version = "#{gcc_major_minor_version}.#{new_resource.gcc_patch_version}"
gcc_url = "https://ftp.gnu.org/gnu/gcc/gcc-#{gcc_version}/gcc-#{gcc_version}.tar.gz"
gcc_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/gcc/gcc-#{gcc_version}.tar.gz"
gcc_tarball = "#{new_resource.sources_dir}/gcc-#{gcc_version}.tar.gz"

# Get gcc tarball
Expand All @@ -137,7 +137,7 @@
tar -xf #{gcc_tarball}
cd gcc-#{gcc_version}
# Patch the download_prerequisites script to download over https and not ftp. This works better in China regions.
sed -i "s#ftp://gcc\.gnu\.org#https://gcc.gnu.org#g" ./contrib/download_prerequisites
sed -i "s#ftp://gcc\.gnu\.org/pub/gcc/infrastructure##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites
./contrib/download_prerequisites
mkdir build && cd build
../configure --prefix=/opt/arm/armpl/gcc/#{gcc_version} --disable-bootstrap --enable-checking=release --enable-languages=c,c++,fortran --disable-multilib
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def dcv_gpu_accel_supported?
end

# URL of the DCV server tarball under the configured artifacts location.
#
# Only the artifacts-based URL is built. The former CloudFront URL expression was
# evaluated and then discarded (a method returns its last expression), and it
# needlessly required node['cluster']['dcv']['version'] to be set.
#
# @return [String] "<artifacts_s3_url>/dependencies/dcv/<dcv_package>.tgz"
def dcv_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/dcv/#{dcv_package}.tgz"
end

def dcv_tarball
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
def fabric_manager_version
_nvidia_driver_version
end

# Fixed platform label for this resource.
# NOTE(review): elsewhere in this change set `fabric_manager_url` interpolates `platform`
# into the download path; presumably this is the value it picks up. Confirm.
#
# @return [String] always 'rhel9'
def platform
  'rhel9'
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,7 @@ def fabric_manager_package
def fabric_manager_version
_nvidia_driver_version
end

# Fixed platform label for this resource.
# NOTE(review): elsewhere in this change set `fabric_manager_url` interpolates `platform`
# into the download path; presumably this is the value it picks up. Confirm.
#
# @return [String] always 'rhel7'
def platform
  'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
def fabric_manager_version
_nvidia_driver_version
end

# Fixed platform label for this resource.
# NOTE(review): elsewhere in this change set `fabric_manager_url` interpolates `platform`
# into the download path; presumably this is the value it picks up. Confirm.
#
# @return [String] always 'rhel7'
def platform
  'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
def fabric_manager_version
_nvidia_driver_version
end

# Platform label derived from the node's platform version: 'rhel' followed by the
# integer part of node['platform_version'] (e.g. '8.7' gives 'rhel8').
#
# @return [String]
def platform
  major_version = node['platform_version'].to_i
  "rhel#{major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
def fabric_manager_version
_nvidia_driver_version
end

# Platform label derived from the node's platform version: 'rhel' followed by the
# integer part of node['platform_version'] (e.g. '8.7' gives 'rhel8').
#
# @return [String]
def platform
  major_version = node['platform_version'].to_i
  "rhel#{major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,9 @@ def fabric_manager_package
end

# Fabric manager version: the NVIDIA driver version as a String, with no trailing
# wildcard.
#
# The former "<version>*" expression was evaluated and then discarded (a method
# returns its last expression), so it is removed. The redundant string interpolation
# is replaced by an explicit to_s.
#
# @return [String]
def fabric_manager_version
  _nvidia_driver_version.to_s
end

# Platform label derived from the node's platform version: 'ubuntu' followed by
# node['platform_version'] with the dots removed (e.g. '22.04' gives 'ubuntu2204').
#
# @return [String]
def platform
  compact_version = node['platform_version'].delete('.')
  "ubuntu#{compact_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,7 @@
node.default['cluster']['nvidia']['fabricmanager']['version'] = fabric_manager_version
node_attributes "dump node attributes"

# Add NVIDIA repo for fabric manager and datacenter-gpu-manager
nvidia_repo 'add nvidia repository' do
action :add
end

action_install_package

nvidia_repo 'remove nvidia repository' do
action :remove
end
end

action :configure do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,30 @@

action :install_package do
# For ubuntu, CINC17 apt-package resources need full versions for `version`
execute "install_fabricmanager_for_ubuntu" do
command "apt -y install #{fabric_manager_package}=#{fabric_manager_version} "\
"&& apt-mark hold #{fabric_manager_package}"
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.deb" do
source "#{fabric_manager_url}"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

bash "install_fabricmanager_for_ubuntu" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER
set -e
dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}
FABRIC_MANAGER
retries 3
retry_delay 5
end
end

# Architecture suffix used in the .deb file name: 'arm64' when arm_instance? is
# truthy, otherwise 'amd64'.
#
# @return [String]
def arch_suffix
  if arm_instance?
    'arm64'
  else
    'amd64'
  end
end

# URL of the fabric manager .deb package under the configured artifacts location.
#
# @return [String] "<artifacts_s3_url>/dependencies/nvidia_fabric/<platform>/<package>_<version>-1_<arch>.deb"
def fabric_manager_url
  package_dir = "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}"
  deb_file = "#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
  "#{package_dir}/#{deb_file}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,32 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.rpm" do
source "#{fabric_manager_url}"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

package 'yum-plugin-versionlock'
bash "Install #{fabric_manager_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER_INSTALL
set -e
yum install -y #{fabric_manager_package}-#{fabric_manager_version}
yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum versionlock #{fabric_manager_package}
FABRIC_MANAGER_INSTALL
retries 3
retry_delay 5
end
end

# Architecture suffix used in the .rpm file name: 'aarch64' when arm_instance? is
# truthy, otherwise 'x86_64'.
#
# @return [String]
def arch_suffix
  if arm_instance?
    'aarch64'
  else
    'x86_64'
  end
end

# URL of the fabric manager .rpm package under the configured artifacts location.
#
# @return [String] "<artifacts_s3_url>/dependencies/nvidia_fabric/<platform>/<package>-<version>-1.<arch>.rpm"
def fabric_manager_url
  package_dir = "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}"
  rpm_file = "#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
  "#{package_dir}/#{rpm_file}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,5 @@ def gdrcopy_version_extended
end

# URL of the gdrcopy source tarball under the configured artifacts location.
#
# Only the artifacts-based URL is built. The former GitHub URL expression was
# evaluated and then discarded (a method returns its last expression), so it is removed.
#
# @return [String] "<artifacts_s3_url>/dependencies/gdr_copy/v<gdrcopy_version>.tar.gz"
def gdrcopy_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz"
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM is enabled for this resource: follows _nvidia_enabled with no extra
# conditions.
#
# @return whatever _nvidia_enabled returns
def _nvidia_dcgm_enabled
  _nvidia_enabled
end

# Fixed platform label for this resource.
# NOTE(review): the consumer of this value is not visible in this hunk; presumably
# the shared partial uses it to build artifact paths. Confirm.
#
# @return [String] always 'rhel9'
def platform
  'rhel9'
end
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
provides :nvidia_dcgm, platform: 'amazon', platform_version: '2'

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM is enabled for this resource: always false when arm_instance? is
# truthy, otherwise follows _nvidia_enabled.
#
# @return false on ARM instances, else whatever _nvidia_enabled returns
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Fixed platform label for this resource.
# NOTE(review): the consumer of this value is not visible in this hunk; presumably
# the shared partial uses it to build artifact paths. Confirm.
#
# @return [String] always 'rhel7'
def platform
  'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM is enabled for this resource: always false when arm_instance? is
# truthy, otherwise follows _nvidia_enabled.
#
# @return false on ARM instances, else whatever _nvidia_enabled returns
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Fixed platform label for this resource.
# NOTE(review): the consumer of this value is not visible in this hunk; presumably
# the shared partial uses it to build artifact paths. Confirm.
#
# @return [String] always 'rhel7'
def platform
  'rhel7'
end
Loading
Loading