Skip to content

Commit

Permalink
Get nvidia fabric manager and dcgm using url
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe committed Jun 17, 2024
1 parent 6429473 commit d782cc1
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,19 @@

action :install_package do
# For ubuntu, CINC17 apt-package resources need full versions for `version`
# Fetch the Fabric Manager .deb from fabric_manager_url into the cluster
# sources directory. :create_if_missing leaves an already-present file alone,
# and the download is retried up to 3 times, 5 seconds apart.
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.deb" do
  # Pass the URL directly; wrapping it in "#{...}" only re-stringifies it.
  source fabric_manager_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

bash "install_fabricmanager_for_ubuntu" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER
set -e
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.deb
dpkg -i #{fabric_manager_package}-#{fabric_manager_version}.deb && apt-mark hold #{fabric_manager_package}
FABRIC_MANAGER
retries 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Fetch the Fabric Manager .rpm from fabric_manager_url into the cluster
# sources directory. :create_if_missing leaves an already-present file alone,
# and the download is retried up to 3 times, 5 seconds apart.
remote_file "#{node['cluster']['sources_dir']}/#{fabric_manager_package}-#{fabric_manager_version}.rpm" do
  # Pass the URL directly; wrapping it in "#{...}" only re-stringifies it.
  source fabric_manager_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

package 'yum-plugin-versionlock'
bash "Install #{fabric_manager_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-FABRIC_MANAGER_INSTALL
set -e
aws s3 cp #{fabric_manager_url} #{fabric_manager_package}-#{fabric_manager_version}.rpm --region #{node['cluster']['region']}
yum install -y #{fabric_manager_package}-#{fabric_manager_version}.rpm
yum versionlock #{fabric_manager_package}
FABRIC_MANAGER_INSTALL
Expand All @@ -32,5 +40,5 @@ def arch_suffix
end

# Returns the download URL of the Fabric Manager RPM.
#
# The URL is rooted at node['cluster']['artifacts_s3_url'] and is built from
# the platform, package name, package version and architecture suffix.
# The earlier artifacts_build_url variant was a discarded expression (only the
# last expression of a method is returned), so it has been removed.
def fabric_manager_url
  "#{node['cluster']['artifacts_s3_url']}/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
end
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,19 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Fetch the DCGM .deb from dcgm_url into the cluster sources directory.
# :create_if_missing leaves an already-present file alone, and the download
# is retried up to 3 times, 5 seconds apart.
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
  # Pass the URL directly; wrapping it in "#{...}" only re-stringifies it.
  # NOTE(review): assumes dcgm_url (defined outside this hunk) returns a String — confirm.
  source dcgm_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.deb --region #{node['cluster']['region']}
dpkg -i #{dcgm_package}-#{package_version}.deb
DCGM_INSTALL
retries 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,19 @@
# See the License for the specific language governing permissions and limitations under the License.

action :install_package do
# Fetch the DCGM .rpm from dcgm_url into the cluster sources directory.
# :create_if_missing leaves an already-present file alone, and the download
# is retried up to 3 times, 5 seconds apart.
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do
  # Pass the URL directly; wrapping it in "#{...}" only re-stringifies it.
  # NOTE(review): assumes dcgm_url (defined outside this hunk) returns a String — confirm.
  source dcgm_url
  mode '0644'
  retries 3
  retry_delay 5
  action :create_if_missing
end

bash "Install #{dcgm_package}" do
user 'root'
cwd node['cluster']['sources_dir']
code <<-DCGM_INSTALL
set -e
aws s3 cp #{dcgm_url} #{dcgm_package}-#{package_version}.rpm --region #{node['cluster']['region']}
yum install -y #{dcgm_package}-#{package_version}.rpm
DCGM_INSTALL
retries 3
Expand Down

0 comments on commit d782cc1

Please sign in to comment.