Skip to content

Commit

Permalink
nvidia repo downloads
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe committed Apr 23, 2024
1 parent f3a389b commit e067bf3
Show file tree
Hide file tree
Showing 12 changed files with 150 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@
node_attributes "dump node attributes"

# Add NVIDIA repo for fabric manager and datacenter-gpu-manager
nvidia_repo 'add nvidia repository' do
action :add
end
# nvidia_repo 'add nvidia repository' do
# action :add
# end

action_install_package

nvidia_repo 'remove nvidia repository' do
action :remove
end
# nvidia_repo 'remove nvidia repository' do
# action :remove
# end
end

action :configure do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,27 @@
action :install_package do
  # Installs NVIDIA Fabric Manager on Ubuntu from a .deb archive stored in S3.
  # For ubuntu, CINC17 apt-package resources need full versions for `version`

  # Step 1: download the package archive. This is its own top-level resource;
  # a resource must not be declared inside another resource's property block.
  bash "Install #{fabric_manager_package}" do
    user 'root'
    code <<-FABRIC_MANAGER
    set -e
    aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb
    FABRIC_MANAGER
    retries 3
    retry_delay 5
  end

  # Step 2: install the downloaded archive and pin the package.
  # The leading "./" makes apt treat the argument as a local file rather than
  # looking it up as a package name in the configured repositories.
  execute "install_fabricmanager_for_ubuntu" do
    command "apt -y install ./#{fabric_manager_package}-#{fabric_manager_version}.deb "\
            "&& apt-mark hold #{fabric_manager_package}"
    retries 3
    retry_delay 5
  end
end

# Architecture label used in the .deb file name:
# 'arm64' when arm_instance? is truthy, 'amd64' otherwise.
def arch_suffix
  return 'arm64' if arm_instance?

  'amd64'
end

# S3 URI of the Fabric Manager .deb archive, laid out as
# <bucket prefix>/<platform>/<package>_<version>-1_<arch>.deb
def fabric_manager_url
  platform_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}"
  deb_name = "#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
  "#{platform_prefix}/#{deb_name}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,19 @@
user 'root'
code <<-FABRIC_MANAGER_INSTALL
set -e
yum install -y #{fabric_manager_package}-#{fabric_manager_version}
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum versionlock #{fabric_manager_package}
FABRIC_MANAGER_INSTALL
retries 3
retry_delay 5
end
end

# Architecture label used in the .rpm file name:
# 'aarch64' when arm_instance? is truthy, 'x86_64' otherwise.
def arch_suffix
  if arm_instance?
    'aarch64'
  else
    'x86_64'
  end
end

# S3 URI of the Fabric Manager .rpm archive, laid out as
# <bucket prefix>/<platform>/<package>-<version>-1.<arch>.rpm
def fabric_manager_url
  platform_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_fabric/#{platform}"
  rpm_name = "#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
  "#{platform_prefix}/#{rpm_name}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
provides :nvidia_dcgm, platform: 'amazon', platform_version: '2'

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

def _nvidia_dcgm_enabled
!arm_instance? && _nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

def _nvidia_dcgm_enabled
!arm_instance? && _nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

def _nvidia_dcgm_enabled
_nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_rhel.rb'

def _nvidia_dcgm_enabled
_nvidia_enabled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
end

use 'partial/_nvidia_dcgm_common.rb'
use 'partial/_nvidia_dcgm_debian.rb'

# Whether DCGM should be installed for this platform variant.
# Delegates straight to `_nvidia_enabled` with no additional condition.
def _nvidia_dcgm_enabled
_nvidia_enabled
end

# DCGM package version prefixed with the epoch.
# The single digit "1" is epoch version. Without the "1", package install fails
# because version does not exist. See details here:
# https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver
def package_version
  dcgm_version = node['cluster']['nvidia']['dcgm_version']
  "1:#{dcgm_version}"
end
# def package_version
# "1:#{node['cluster']['nvidia']['dcgm_version']}" # The single digit "1" is epoch version. Without the "1", package install fails because version does not exist. See details here: https://askubuntu.com/questions/441879/why-do-some-packages-have-extra-numbers-before-a-colon-on-the-front-of-their-ver
# end

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,23 @@
return unless _nvidia_dcgm_enabled

# Add NVIDIA repo for fabric manager and datacenter-gpu-manager
nvidia_repo 'add nvidia repository' do
action :add
end

package 'datacenter-gpu-manager' do
retries 3
retry_delay 5
version package_version
end

nvidia_repo 'remove nvidia repository' do
action :remove
end
# nvidia_repo 'add nvidia repository' do
# action :add
# end

# package 'datacenter-gpu-manager' do
# retries 3
# retry_delay 5
# version package_version
# end

action_install_package

# nvidia_repo 'remove nvidia repository' do
# action :remove
# end
end

# Resolves whether NVIDIA support is enabled.
# An explicit (non-nil) `nvidia_enabled` value wins and is returned as-is;
# otherwise the node attribute is accepted when it is 'yes' or true.
def _nvidia_enabled
  return nvidia_enabled unless nvidia_enabled.nil?

  ['yes', true].include?(node['cluster']['nvidia']['enabled'])
end

# DCGM version string read from node['cluster']['nvidia']['dcgm_version'].
def package_version
  nvidia_attributes = node['cluster']['nvidia']
  nvidia_attributes['dcgm_version']
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
  # Downloads the DCGM .deb archive from S3 and installs it with apt.
  # Both steps live in one bash resource so the install is an actual resource
  # (with its retries) instead of bare `command`/`retries` calls in the action body.
  bash "Install #{dcgm_package}" do
    user 'root'
    code <<-DCGM_INSTALL
    set -e
    aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb
    # The leading "./" makes apt treat the argument as a local file, not a package name.
    apt -y install ./#{dcgm_package}-#{package_version}.deb
    DCGM_INSTALL
    retries 3
    retry_delay 5
  end
end

# S3 URI of the DCGM .deb archive, laid out as
# <bucket prefix>/<platform>/<package>_<version>_<arch>.deb
def dcgm_url
  platform_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_dcgm/#{platform}"
  deb_name = "#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
  "#{platform_prefix}/#{deb_name}"
end

# Name of the DCGM package; interpolated into the install resource name,
# the downloaded file name, and the S3 URL built by dcgm_url.
def dcgm_package
'datacenter-gpu-manager'
end

# Architecture label used in the .deb file name:
# 'arm64' when arm_instance? is truthy, 'amd64' otherwise.
def arch_suffix
  if arm_instance?
    'arm64'
  else
    'amd64'
  end
end

# DCGM version string read from node['cluster']['nvidia']['dcgm_version'].
def package_version
  nvidia_attributes = node['cluster']['nvidia']
  nvidia_attributes['dcgm_version']
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
  # Downloads the DCGM .rpm archive from S3 and installs it with yum.
  # File name the archive is saved under (relative to the working directory).
  rpm_file = "#{dcgm_package}-#{package_version}.rpm"

  bash "Install #{dcgm_package}" do
    user 'root'
    retries 3
    retry_delay 5
    code <<~DCGM_INSTALL
      set -e
      aws s3 cp #{dcgm_url} #{rpm_file}
      yum install -y #{rpm_file}
    DCGM_INSTALL
  end
end

# S3 URI of the DCGM .rpm archive, laid out as
# <bucket prefix>/<platform>/<package>-<version>-1-<arch>.rpm
def dcgm_url
  platform_prefix = "s3://hgreebe-dependencies/archives/dependencies/nvidia_dcgm/#{platform}"
  rpm_name = "#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm"
  "#{platform_prefix}/#{rpm_name}"
end

# Name of the DCGM package; interpolated into the install resource name,
# the downloaded file name, and the S3 URL built by dcgm_url.
def dcgm_package
'datacenter-gpu-manager'
end

# Architecture label used in the .rpm file name:
# 'aarch64' when arm_instance? is truthy, 'x86_64' otherwise.
def arch_suffix
  return 'aarch64' if arm_instance?

  'x86_64'
end

# DCGM version string read from node['cluster']['nvidia']['dcgm_version'].
def package_version
  nvidia_attributes = node['cluster']['nvidia']
  nvidia_attributes['dcgm_version']
end

0 comments on commit e067bf3

Please sign in to comment.