Skip to content

Commit

Permalink
Fix fabric manager install to install from a local package
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe committed Jun 17, 2024
1 parent fbc9d4b commit 4259bef
Show file tree
Hide file tree
Showing 22 changed files with 98 additions and 101 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,27 @@
# Invoke the activate_virtual_env resource for virtualenv_name, passing the
# virtualenv path and Python version. The not_if guard skips the resource when
# #{virtualenv_path}/bin/activate already exists on disk.
# NOTE(review): activate_virtual_env is defined outside this view; presumably
# it creates the virtualenv - confirm against the resource definition.
activate_virtual_env virtualenv_name do
pyenv_path virtualenv_path
python_version python_version
not_if { ::File.exist?("#{virtualenv_path}/bin/activate") }
end

# Download awsbatch-dependencies.tgz for this machine architecture
# (node['kernel']['machine']) from node['cluster']['artifacts_s3_url'] into
# node['cluster']['base_dir'].
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{node['cluster']['base_dir']}/awsbatch-dependencies.tgz" do
source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

# As root, unpack awsbatch-dependencies.tgz in node['cluster']['base_dir'] and
# install every file found in its awsbatch/ directory with the virtualenv's pip.
bash 'pip install' do
user 'root'
group 'root'
cwd "#{node['cluster']['base_dir']}"
# `--no-index` together with `-f ./` makes pip resolve packages only from the
# files in the current directory instead of contacting a package index.
# `set -e` aborts the script on the first failing command.
code <<-REQ
set -e
tar xzf awsbatch-dependencies.tgz
cd awsbatch
#{virtualenv_path}/bin/pip install * -f ./ --no-index
REQ
end

node.default['cluster']['awsbatch_virtualenv_path'] = virtualenv_path
Expand Down
24 changes: 2 additions & 22 deletions cookbooks/aws-parallelcluster-awsbatch/recipes/install.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,32 +44,12 @@
tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli
cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-*
aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']}
tar xzf awsbatch-dependencies.tgz
cd awsbatch
#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index
cd ..
#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/
CLI
end
else
# Install aws-parallelcluster-awsbatch-cli package
bash "install aws-parallelcluster-awsbatch-cli" do
cwd Chef::Config[:file_cache_path]
code <<-CLI
set -e
package_url=#{node['cluster']['artifacts_build_url']}/awsbatch/aws-parallelcluster.tgz
aws s3 cp ${package_url} aws-parallelcluster.tgz --region #{node['cluster']['region']}
mkdir aws-parallelcluster-awsbatch-cli
tar -xzf aws-parallelcluster.tgz --directory aws-parallelcluster-awsbatch-cli
aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/awsbatch-dependencies.tgz awsbatch-dependencies.tgz --region #{node['cluster']['region']}
tar xzf awsbatch-dependencies.tgz
cd awsbatch
#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install * -f ./ --no-index
cd ..
cd aws-parallelcluster-awsbatch-cli/*aws-parallelcluster-*
#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install awsbatch-cli/
CLI
execute "pip_install_parallelcluster_awsbatch_cli" do
command "#{node['cluster']['awsbatch_virtualenv_path']}/bin/pip install aws-parallelcluster-awsbatch-cli==#{node['cluster']['parallelcluster-awsbatch-cli-version']}"
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@
# TODO: once the pyenv Chef resource supports installing packages from a path (e.g. `pip install .`), convert the
# bash block to a recipe that uses the pyenv resource.

# Download node-dependencies.tgz for this machine architecture
# (node['kernel']['machine']) from node['cluster']['artifacts_s3_url'] into
# Chef's file cache directory.
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{Chef::Config[:file_cache_path]}/node-dependencies.tgz" do
source "#{node['cluster']['artifacts_s3_url']}/dependencies/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz"
mode '0644'
retries 3
retry_delay 5
action :create_if_missing
end

bash "install custom aws-parallelcluster-node" do
cwd Chef::Config[:file_cache_path]
code <<-NODE
Expand All @@ -38,12 +46,6 @@
mkdir aws-parallelcluster-custom-node
tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-custom-node
cd aws-parallelcluster-custom-node/*aws-parallelcluster-node-*
aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']}
tar xzf node-dependencies.tgz
cd node
#{node_virtualenv_path}/bin/pip install * -f ./ --no-index
cd ..
pip install .
deactivate
NODE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,27 +36,7 @@
if is_custom_node?
include_recipe 'aws-parallelcluster-computefleet::custom_parallelcluster_node'
else
bash "install official aws-parallelcluster-node" do
cwd Chef::Config[:file_cache_path]
code <<-NODE
set -e
[[ ":$PATH:" != *":/usr/local/bin:"* ]] && PATH="/usr/local/bin:${PATH}"
echo "PATH is $PATH"
source #{node_virtualenv_path}/bin/activate
pip uninstall --yes aws-parallelcluster-node
node_url=#{node['cluster']['artifacts_build_url']}/node/aws-parallelcluster-node.tgz
aws s3 cp ${node_url} aws-parallelcluster-node.tgz --region #{node['cluster']['region']}
rm -fr aws-parallelcluster-node
mkdir aws-parallelcluster-node
tar -xzf aws-parallelcluster-node.tgz --directory aws-parallelcluster-node
aws s3 cp #{node['cluster']['artifacts_build_url']}/PyPi/#{node['kernel']['machine']}/node-dependencies.tgz node-dependencies.tgz --region #{node['cluster']['region']}
tar xzf node-dependencies.tgz
cd node
#{node_virtualenv_path}/bin/pip install * -f ./ --no-index
cd ..
cd aws-parallelcluster-node/*aws-parallelcluster-node-*
pip install .
deactivate
NODE
execute "install official aws-parallelcluster-node" do
command "#{virtualenv_path}/bin/pip install aws-parallelcluster-node==#{node['cluster']['parallelcluster-node-version']}"
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
end

it 'installs official node package' do
is_expected.to run_bash('install official aws-parallelcluster-node')
is_expected.to run_execute('install official aws-parallelcluster-node')
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@
end

# Run the awscli bundle installer with the cookbook virtualenv's `python`
# executable. `-i` is the install directory (/usr/local/aws) and `-b` is the
# location of the `aws` launcher (/usr/local/bin/aws).
# The `code` property is assigned exactly once: an earlier assignment would be
# overwritten by a later one and never executed.
bash 'install awscli' do
  code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws"
end
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@
activate_virtual_env cookbook_virtualenv_name do
pyenv_path cookbook_virtualenv_path
python_version cookbook_python_version
end

cookbook_file "#{virtualenv_path}/requirements.txt" do
source "cookbook_virtualenv/requirements.txt"
mode '0755'
not_if { ::File.exist?("#{cookbook_virtualenv_path}/bin/activate") }
end

remote_file "#{node['cluster']['base_dir']}/cookbook-dependencies.tgz" do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
tar -xf #{gcc_tarball}
cd gcc-#{gcc_version}
# Patch the download_prerequisites script to download over https and not ftp. This works better in China regions.
sed -i "s#ftp://gcc\.gnu\.org##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites
sed -i "s#ftp://gcc\.gnu\.org/pub/gcc/infrastructure##{node['cluster']['artifacts_s3_url']}/dependencies/gcc/prerequisites#g" ./contrib/download_prerequisites
./contrib/download_prerequisites
mkdir build && cd build
../configure --prefix=/opt/arm/armpl/gcc/#{gcc_version} --disable-bootstrap --enable-checking=release --enable-languages=c,c++,fortran --disable-multilib
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def fabric_manager_package
end

# Version string used to build the fabric manager package file name and URL.
#
# @return [String] the value of _nvidia_driver_version converted to a string,
#   with no trailing wildcard, so it can name one concrete package file.
def fabric_manager_version
  _nvidia_driver_version.to_s
end

def platform
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@

action :install_package do
# For ubuntu, CINC17 apt-package resources need full versions for `version`
execute "install_fabricmanager_for_ubuntu" do
bash "Install #{fabric_manager_package}" do
user 'root'
code <<-FABRIC_MANAGER
set -e
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb
FABRIC_MANAGER
retries 3
retry_delay 5
end
# Download the fabric manager .deb from fabric_manager_url into
# node['cluster']['sources_dir'], naming it <package>-<version>.deb.
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.deb" do
  # fabric_manager_url already returns a String, so it is passed directly
  # instead of being wrapped in a redundant interpolation.
  source fabric_manager_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

command "apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb "\
"&& apt-mark hold #{fabric_manager_package}"
bash "install_fabricmanager_for_ubuntu" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER
set -e
dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}
FABRIC_MANAGER
retries 3
retry_delay 5
end
Expand All @@ -37,5 +39,5 @@ def arch_suffix
end

# Download URL of the fabric manager .deb package.
#
# The package lives under node['cluster']['artifacts_s3_url'] in
# dependencies/nvidia_fabric/<platform>/ and is named
# <package>_<version>-1_<arch_suffix>.deb.
#
# @return [String] the full URL of the package file
def fabric_manager_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
end
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Download the fabric manager .rpm from fabric_manager_url into
# node['cluster']['sources_dir'], naming it <package>-<version>.rpm.
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.rpm" do
  # fabric_manager_url already returns a String, so it is passed directly
  # instead of being wrapped in a redundant interpolation.
  source fabric_manager_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

package 'yum-plugin-versionlock'
bash "Install #{fabric_manager_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER_INSTALL
set -e
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm --region #{node['cluster']['region']}
yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum versionlock #{fabric_manager_package}
FABRIC_MANAGER_INSTALL
Expand All @@ -32,5 +40,5 @@ def arch_suffix
end

# Download URL of the fabric manager .rpm package.
#
# The package lives under node['cluster']['artifacts_s3_url'] in
# dependencies/nvidia_fabric/<platform>/ and is named
# <package>-<version>-1.<arch_suffix>.rpm.
#
# @return [String] the full URL of the package file
def fabric_manager_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
end
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

def _nvidia_dcgm_enabled
_nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,28 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Download the DCGM .deb from dcgm_url into node['cluster']['sources_dir'],
# naming it <package>-<version>.deb.
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
  # dcgm_url already returns a String, so it is passed directly instead of
  # being wrapped in a redundant interpolation.
  source dcgm_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb --region #{node['cluster']['region']}
apt -y install #{dcgm_package}-#{package_version}.deb
dpkg -i #{dcgm_package}-#{package_version}.deb
DCGM_INSTALL
retries 3
retry_delay 5
end
end

# Download URL of the DCGM .deb package.
#
# The package lives under node['cluster']['artifacts_s3_url'] in
# dependencies/nvidia_dcgm/<platform>/ and is named
# <package>_<version>_<arch_suffix>.deb.
#
# @return [String] the full URL of the package file
def dcgm_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
end

def dcgm_package
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,19 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Download the DCGM .rpm from dcgm_url into node['cluster']['sources_dir'],
# naming it <package>-<version>.rpm.
# :create_if_missing leaves an already-present file untouched; a failing
# attempt is retried up to 3 times with a 5 second delay between attempts.
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do
  # dcgm_url already returns a String, so it is passed directly instead of
  # being wrapped in a redundant interpolation.
  source dcgm_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.rpm --region #{node['cluster']['region']}
yum install -y #{dcgm_package}-#{package_version}.rpm
DCGM_INSTALL
retries 3
Expand All @@ -26,7 +34,7 @@
end

# Download URL of the DCGM .rpm package.
#
# The package lives under node['cluster']['artifacts_s3_url'] in
# dependencies/nvidia_dcgm/<platform>/ and is named
# <package>-<version>-1-<arch_suffix>.rpm.
#
# @return [String] the full URL of the package file
def dcgm_url
  "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm"
end

def dcgm_package
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,4 @@ def nvidia_kernel_module
else
"kernel-open"
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

it 'installs awscli into cookbook virtualev path' do
is_expected.to run_bash('install awscli')
.with_code "#{cookbook_virtualenv_path}/bin/python#{node['cluster']['python-major-minor-version']} #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws"
.with_code "#{cookbook_virtualenv_path}/bin/python #{file_cache_path}/awscli/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws"
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,6 @@
is_expected.to write_node_attributes('dump node attributes')
end

it 'copies requirements file' do
is_expected.to create_cookbook_file("#{virtualenv_path}/requirements.txt").with(
source: "cookbook_virtualenv/requirements.txt",
mode: '0755'
)
end

it 'installs python packages' do
is_expected.to run_bash("pip install").with(
user: 'root',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def self.configure(chef_run)
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

context 'when fabric manager is to install' do
cached(:chef_run) do
Expand All @@ -193,10 +193,10 @@ def self.configure(chef_run)

if platform == 'ubuntu'
it 'installs fabric manager for ubuntu' do
is_expected.to run_execute('install_fabricmanager_for_ubuntu')
is_expected.to run_bash('install_fabricmanager_for_ubuntu')
.with_retries(3)
.with_retry_delay(5)
.with_command("apt -y install #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}")
.with_code(/dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}/)
end
else
it 'installs yum-plugin-versionlock' do
Expand All @@ -222,7 +222,7 @@ def self.configure(chef_run)
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-535' : 'nvidia-fabric-manager' }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}*" : nvidia_driver_version }
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }

context('when nvswithes are > 1') do
cached(:chef_run) do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@

# URL for ParallelCluster Artifacts stored in public S3 buckets
# ['cluster']['region'] will need to be defined by image_dna.json during AMI build.
default['cluster']['base_build_url'] = "s3://#{node['cluster']['region']}-aws-parallelcluster"
default['cluster']['artifacts_build_url'] = "#{node['cluster']['base_build_url']}/archives/dependencies"
default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives"
default['cluster']['artifacts_build_url'] = "s3://#{node['cluster']['region']}-aws-parallelcluster/archives/dependencies"
default['cluster']['artifacts_s3_url'] = "https://#{node['cluster']['region']}-aws-parallelcluster.s3.#{node['cluster']['region']}.#{node['cluster']['aws_domain']}/archives"
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
end

remote_file "#{prefix}/Python-#{python_version}.tgz" do
source "#{python_url}"
source python_url
mode '0644'
retries 3
retry_delay 5
Expand All @@ -52,5 +52,4 @@
make install
VENV
end

end
end
Loading

0 comments on commit 4259bef

Please sign in to comment.