Skip to content

Commit

Permalink
Retrieve dependencies from S3 bucket and use built-in venv
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe committed Apr 23, 2024
1 parent 7d414fe commit f1ed06c
Show file tree
Hide file tree
Showing 32 changed files with 412 additions and 150 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
activate_virtual_env virtualenv_name do
pyenv_path virtualenv_path
python_version python_version
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
activate_virtual_env node_virtualenv_name do
pyenv_path node_virtualenv_path
python_version node_python_version
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

if is_custom_node?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
overwrite true
end

# Declares an alinux_extras_topic resource named 'python 3.8' with the topic 'python3.8'.
# NOTE(review): presumably this enables the Amazon Linux Extras python3.8 topic so the
# versioned python binary used by the awscli install below exists - confirm against the
# alinux_extras_topic resource definition.
alinux_extras_topic 'python 3.8' do
topic 'python3.8'
end

bash 'install awscli' do
code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws"
code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws"
end
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,36 @@

install_pyenv 'pyenv for default python version'

# alinux_extras_topic 'python 3.8' do
# topic 'python3.8'
# end

activate_virtual_env cookbook_virtualenv_name do
pyenv_path cookbook_virtualenv_path
python_version cookbook_python_version
requirements_path "cookbook_virtualenv/requirements.txt"
not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") }
end

# Copy the cookbook's requirements.txt into the virtualenv directory.
# The file is plain text, so it is created read-only for group/others without any
# execute bit (0644), matching the mode used for other non-executable artifacts.
cookbook_file "#{virtualenv_path}/requirements.txt" do
  source "cookbook_virtualenv/requirements.txt"
  mode '0644'
end

# Installs the Python dependencies into the virtualenv from an archive stored in S3.
# Runs as root:root with node['cluster']['base_dir'] as the working directory.
# The script stops at the first failing command (set -e), downloads and unpacks
# dependencies.tar.gz, then runs the virtualenv's pip on every file in the extracted
# `dependencies` directory. `--no-index` together with `-f ./` makes pip resolve
# packages only from that local directory instead of a package index.
# NOTE(review): the bucket name is hard-coded and there is no guard, so the archive is
# downloaded and installed on every run - confirm this is intended.
bash 'pip install' do
user 'root'
group 'root'
cwd "#{node['cluster']['base_dir']}"
code <<-REQ
set -e
aws s3 cp s3://hgreebe-dependencies/archives/dependencies/PyPi/dependencies.tar.gz dependencies.tar.gz
tar xzf dependencies.tar.gz
cd dependencies
#{virtualenv_path}/bin/pip install * -f ./ --no-index
REQ
end

# activate_virtual_env cookbook_virtualenv_name do
# pyenv_path cookbook_virtualenv_path
# python_version cookbook_python_version
# requirements_path "cookbook_virtualenv/requirements.txt"
# # not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") }
# end
39 changes: 26 additions & 13 deletions cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,25 @@
node_attributes 'Save cuda and cuda samples versions for InSpec tests'

# Get CUDA run file
remote_file tmp_cuda_run do
source cuda_url
mode '0755'
retries 3
retry_delay 5
not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}") }
# remote_file tmp_cuda_run do
# source cuda_url
# mode '0755'
# retries 3
# retry_delay 5
# not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}") }
# end

# Download the CUDA runfile and the CUDA samples archive from S3 into their temporary
# locations, as root, from the cluster sources directory.
# The runfile is made executable (755); the samples archive is a plain data file (644).
# NOTE(review): the bucket name is hard-coded - confirm it should not be configurable.
bash 'get cuda and cuda samples from s3' do
  user 'root'
  group 'root'
  cwd "#{node['cluster']['sources_dir']}"
  code <<-CUDA
    set -e
    aws s3 cp s3://hgreebe-dependencies/archives/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run #{tmp_cuda_run}
    chmod 755 #{tmp_cuda_run}
    aws s3 cp s3://hgreebe-dependencies/archives/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz #{tmp_cuda_sample_archive}
    chmod 644 #{tmp_cuda_sample_archive}
  CUDA
  # Same retry policy the remote_file downloads used, so a transient transfer failure
  # does not fail the whole run.
  retries 3
  retry_delay 5
  # Keep the step idempotent: once the samples are installed (which implies CUDA itself
  # is installed under the same prefix) there is nothing left to download.
  not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}/samples") }
end

# Install CUDA driver
Expand All @@ -57,13 +70,13 @@
end

# Get CUDA Sample Files
remote_file tmp_cuda_sample_archive do
source cuda_samples_url
mode '0644'
retries 3
retry_delay 5
not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}/samples") }
end
# remote_file tmp_cuda_sample_archive do
# source cuda_samples_url
# mode '0644'
# retries 3
# retry_delay 5
# not_if { ::File.exist?("/usr/local/cuda-#{cuda_version}/samples") }
# end

# Unpack CUDA Samples
bash 'cuda.sample install' do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,23 @@
gcc_tarball = "#{new_resource.sources_dir}/gcc-#{gcc_version}.tar.gz"

# Get gcc tarball
remote_file gcc_tarball do
source gcc_url
mode '0644'
retries 5
retry_delay 10
ssl_verify_mode :verify_none
action :create_if_missing
# remote_file gcc_tarball do
# source gcc_url
# mode '0644'
# retries 5
# retry_delay 10
# ssl_verify_mode :verify_none
# action :create_if_missing
# end

# Download the gcc source tarball from S3 to gcc_tarball, as root, from the cluster
# sources directory.
# NOTE(review): the bucket name is hard-coded - confirm it should not be configurable.
bash 'get gcc from s3' do
  user 'root'
  group 'root'
  cwd "#{node['cluster']['sources_dir']}"
  code <<-GCC
    set -e
    aws s3 cp s3://hgreebe-dependencies/archives/dependencies/gcc/gcc-#{gcc_version}.tar.gz #{gcc_tarball}
  GCC
  # Same retry policy as the remote_file download this step replaces.
  retries 5
  retry_delay 10
  # Equivalent of `action :create_if_missing`: skip the download when the tarball is
  # already present so repeated runs stay idempotent.
  not_if { ::File.exist?(gcc_tarball) }
end

# Install gcc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,7 @@ def fabric_manager_package
# Version of the fabric manager package to install; it is the same value as
# _nvidia_driver_version.
def fabric_manager_version
_nvidia_driver_version
end

# Platform identifier for this resource; always the fixed string 'rhel7'.
# NOTE(review): presumably used as the per-platform directory name when building the
# S3 download URL of the package - confirm against the shared partial.
def platform
'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Version of the fabric manager package to install; it is the same value as
# _nvidia_driver_version.
def fabric_manager_version
_nvidia_driver_version
end

# Platform identifier for this resource; always the fixed string 'rhel7'.
# NOTE(review): presumably used as the per-platform directory name when building the
# S3 download URL of the package - confirm against the shared partial.
def platform
'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Version of the fabric manager package to install; it is the same value as
# _nvidia_driver_version.
def fabric_manager_version
_nvidia_driver_version
end

# Platform identifier built from the node's OS version: "rhel" followed by the integer
# part of node['platform_version'] (String#to_i stops at the first non-digit, so only
# the major version is kept).
def platform
  major_version = node['platform_version'].to_i
  "rhel#{major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Version of the fabric manager package to install; it is the same value as
# _nvidia_driver_version.
def fabric_manager_version
_nvidia_driver_version
end

# Platform identifier: the literal prefix "rhel" plus the major OS version, taken as
# the leading integer of node['platform_version'].
def platform
  os_major = node['platform_version'].to_i
  "rhel#{os_major}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,7 @@ def fabric_manager_package
# Version string for the fabric manager package: _nvidia_driver_version with a literal
# '*' appended.
# NOTE(review): the trailing '*' looks like a package-manager version wildcard. If this
# value is also interpolated into an S3 object key or a local file name, the '*' will
# not be expanded there - confirm against the callers.
def fabric_manager_version
"#{_nvidia_driver_version}*"
end

# Platform identifier: "ubuntu" followed by node['platform_version'] with every dot
# removed (e.g. a version of "20.04" yields "ubuntu2004").
def platform
  version_digits = node['platform_version'].delete('.')
  "ubuntu#{version_digits}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@
node_attributes "dump node attributes"

# Add NVIDIA repo for fabric manager and datacenter-gpu-manager
nvidia_repo 'add nvidia repository' do
action :add
end
# nvidia_repo 'add nvidia repository' do
# action :add
# end

action_install_package

nvidia_repo 'remove nvidia repository' do
action :remove
end
# nvidia_repo 'remove nvidia repository' do
# action :remove
# end
end

action :configure do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,27 @@
# Installs the NVIDIA fabric manager package on Ubuntu from a .deb stored in S3 and
# pins it so later upgrades do not replace it.
action :install_package do
  # Local file name used for the downloaded package.
  # NOTE(review): if fabric_manager_version carries a trailing '*' wildcard, neither
  # `aws s3 cp` nor this file name will expand it - confirm the version is exact here.
  deb_file = "#{fabric_manager_package}-#{fabric_manager_version}.deb"

  # Step 1: fetch the package. Declared as its own top-level resource (not nested inside
  # the execute block) so the ordering download -> install is explicit.
  bash "Install #{fabric_manager_package}" do
    user 'root'
    code <<-FABRIC_MANAGER
      set -e
      aws s3 cp #{fabric_manager_url} #{deb_file}
    FABRIC_MANAGER
    retries 3
    retry_delay 5
  end

  # Step 2: install the downloaded file and hold the package.
  # For ubuntu, CINC17 apt-package resources need full versions for `version`,
  # hence a plain execute. apt only treats an argument as a local file when it contains
  # a path separator, so the file is referenced as ./<file>; both steps run from the
  # same (default) working directory.
  execute "install_fabricmanager_for_ubuntu" do
    command "apt -y install ./#{deb_file} "\
            "&& apt-mark hold #{fabric_manager_package}"
    retries 3
    retry_delay 5
  end
end

# Architecture tag used in Debian package file names: 'arm64' when arm_instance? is
# truthy, 'amd64' otherwise.
def arch_suffix
  return 'arm64' if arm_instance?

  'amd64'
end

# S3 location of the fabric manager .deb for the current platform and architecture.
# The object key is <platform>/<package>_<version>-1_<arch_suffix>.deb under the
# nvidia_fabric prefix.
# NOTE(review): the bucket name is hard-coded - confirm it should not be configurable.
def fabric_manager_url
  s3_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}"
  deb_name = "#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
  "#{s3_prefix}/#{deb_name}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,19 @@
user 'root'
code <<-FABRIC_MANAGER_INSTALL
set -e
yum install -y #{fabric_manager_package}-#{fabric_manager_version}
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum versionlock #{fabric_manager_package}
FABRIC_MANAGER_INSTALL
retries 3
retry_delay 5
end
end

# Architecture tag used in RPM file names: 'aarch64' when arm_instance? is truthy,
# 'x86_64' otherwise.
def arch_suffix
  return 'aarch64' if arm_instance?

  'x86_64'
end

# S3 location of the fabric manager .rpm for the current platform and architecture.
# The object key is <platform>/<package>-<version>-1.<arch_suffix>.rpm under the
# nvidia_fabric prefix.
# NOTE(review): the bucket name is hard-coded - confirm it should not be configurable.
def fabric_manager_url
  s3_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}"
  rpm_name = "#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
  "#{s3_prefix}/#{rpm_name}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,23 @@ def gdrcopy_checksum
recursive true
end

remote_file gdrcopy_tarball do
source gdrcopy_url
mode '0644'
retries 3
retry_delay 5
checksum gdrcopy_checksum
action :create_if_missing
# remote_file gdrcopy_tarball do
# source gdrcopy_url
# mode '0644'
# retries 3
# retry_delay 5
# checksum gdrcopy_checksum
# action :create_if_missing
# end

# Download the gdrcopy source tarball from S3 to gdrcopy_tarball, as root, from the
# cluster sources directory.
# NOTE(review): the bucket name is hard-coded, and unlike the remote_file download this
# step replaces, the tarball is no longer verified against gdrcopy_checksum - confirm
# whether checksum verification should be restored.
bash 'get gdrcopy from s3' do
  user 'root'
  group 'root'
  cwd "#{node['cluster']['sources_dir']}"
  code <<-GDR
    set -e
    aws s3 cp s3://hgreebe-dependencies/archives/dependencies/gdr_copy/v#{gdrcopy_version}.tar.gz #{gdrcopy_tarball}
  GDR
  # Same retry policy as the remote_file download this step replaces.
  retries 3
  retry_delay 5
  # Equivalent of `action :create_if_missing`: skip the download when the tarball is
  # already present so repeated runs stay idempotent.
  not_if { ::File.exist?(gdrcopy_tarball) }
end

package_repos 'update package repos' do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
provides :nvidia_dcgm, platform: 'amazon', platform_version: '2'

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM should be handled on this node: never on ARM instances, otherwise
# whatever _nvidia_enabled reports.
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Platform identifier for this resource (declared above for platform 'amazon',
# version '2'); always the fixed string 'rhel7'.
# NOTE(review): presumably consumed by the shared DCGM partial to select the
# per-platform package location - confirm.
def platform
'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM should be handled on this node: false on ARM instances, otherwise the
# value of _nvidia_enabled.
def _nvidia_dcgm_enabled
  return false if arm_instance?

  _nvidia_enabled
end

# Platform identifier for this resource; always the fixed string 'rhel7'.
# NOTE(review): presumably consumed by the shared DCGM partial to select the
# per-platform package location - confirm.
def platform
'rhel7'
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM should be handled on this node; simply the value of _nvidia_enabled
# (no architecture restriction on this platform).
def _nvidia_dcgm_enabled
_nvidia_enabled
end

# Platform identifier: "rhel" followed by the major OS version, i.e. the leading
# integer of node['platform_version'].
def platform
  major_version = node['platform_version'].to_i
  "rhel#{major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

# Whether DCGM should be handled on this node; simply the value of _nvidia_enabled
# (no architecture restriction on this platform).
def _nvidia_dcgm_enabled
_nvidia_enabled
end

# Platform identifier: the literal prefix "rhel" plus the integer part of
# node['platform_version'] (minor version components are discarded by to_i).
def platform
  os_major = node['platform_version'].to_i
  "rhel#{os_major}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_debian.rb'

# Whether DCGM should be handled on this node; simply the value of _nvidia_enabled
# (no architecture restriction on this platform).
def _nvidia_dcgm_enabled
_nvidia_enabled
end

def package_version
"1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver
end
# def package_version
# "1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver
# end

# Platform identifier: "ubuntu" followed by node['platform_version'] with its dots
# stripped (e.g. "22.04" becomes "ubuntu2204").
def platform
  compact_version = node['platform_version'].delete('.')
  "ubuntu#{compact_version}"
end

This file was deleted.

Loading

0 comments on commit f1ed06c

Please sign in to comment.