Skip to content

Commit

Permalink
Merge branch 'develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe authored Nov 29, 2023
2 parents cc27d86 + 387a868 commit 37c199a
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 15 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Libfabric-aws: `libfabric-aws-1.19.0-1`
- Rdma-core: `rdma-core-46.0-1`
- Open MPI: `openmpi40-aws-4.1.6-1`
- Upgrade GDRCopy to version 2.4.
- Upgrade GDRCopy to version 2.4 on all supported OSes, except CentOS 7, where version 2.3.1 is used.

**BUG FIXES**
- Fix inconsistent scaling configuration after cluster update rollback when modifying the list of instance types declared in the Compute Resources.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,25 @@
use 'partial/_gdrcopy_common.rb'
use 'partial/_gdrcopy_common_rhel.rb'

# GDRCopy release used by this resource; pinned to 2.3.1.
#
# @return [String] the version string
def gdrcopy_version
  '2.3.1'
end

# Checksum paired with the pinned GDRCopy version.
# NOTE(review): 64 hex digits, so presumably a SHA-256 of the source tarball — confirm.
#
# @return [String] the hex digest
def gdrcopy_checksum
  '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
end

# CentOS 7 needs its own installation code because GDRCopy is pinned to v2.3.1 there.
#
# Produces the shell snippet that builds the RPM packages and then, for each of the
# kmod, runtime and devel packages, installs the RPM only when `rpm -q` does not
# already find it.
#
# @return [String] newline-terminated shell commands, one per line
def installation_code
  release = gdrcopy_version_extended
  distro = gdrcopy_platform

  # Each pair is [name queried with `rpm -q`, RPM file passed to `rpm -Uvh`].
  rpm_pairs = [
    ["gdrcopy-kmod-#{release}dkms", "gdrcopy-kmod-#{release}dkms.noarch.#{distro}.rpm"],
    ["gdrcopy-#{release}.#{gdrcopy_arch}", "gdrcopy-#{release}.#{gdrcopy_arch}.#{distro}.rpm"],
    ["gdrcopy-devel-#{release}.noarch", "gdrcopy-devel-#{release}.noarch.#{distro}.rpm"],
  ]

  commands = ['CUDA=/usr/local/cuda ./build-rpm-packages.sh']
  commands.concat(rpm_pairs.map { |queried, rpm_file| "rpm -q #{queried} || rpm -Uvh #{rpm_file}" })
  commands.map { |line| "#{line}\n" }.join
end

# Whether GDRCopy applies to this node: never on ARM instances, otherwise
# whatever `nvidia_enabled?` reports.
#
# @return [Boolean, Object] false on ARM, else the value of `nvidia_enabled?`
def gdrcopy_enabled?
  return false if arm_instance?

  nvidia_enabled?
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

property :gdrcopy_version, String, default: '2.4'
property :gdrcopy_checksum, String, default: '39e74d505ca16160567f109cc23478580d157da897f134989df1d563e55f7a5b'
# GDRCopy release used by this resource (2.4).
#
# @return [String] the version string
def gdrcopy_version
  '2.4'
end

# Checksum paired with the GDRCopy version above.
# NOTE(review): 64 hex digits, so presumably a SHA-256 of the source tarball — confirm.
#
# @return [String] the hex digest
def gdrcopy_checksum
  '39e74d505ca16160567f109cc23478580d157da897f134989df1d563e55f7a5b'
end

unified_mode true
default_action :setup
Expand All @@ -23,11 +28,11 @@
return if on_docker?

# Save gdrcopy version for InSpec tests
node.default['cluster']['nvidia']['gdrcopy']['version'] = new_resource.gdrcopy_version
node.default['cluster']['nvidia']['gdrcopy']['version'] = gdrcopy_version
node.default['cluster']['nvidia']['gdrcopy']['service'] = gdrcopy_service
node_attributes 'dump node attributes'

gdrcopy_tarball = "#{node['cluster']['sources_dir']}/gdrcopy-#{new_resource.gdrcopy_version}.tar.gz"
gdrcopy_tarball = "#{node['cluster']['sources_dir']}/gdrcopy-#{gdrcopy_version}.tar.gz"

directory node['cluster']['sources_dir'] do
recursive true
Expand All @@ -38,7 +43,7 @@
mode '0644'
retries 3
retry_delay 5
checksum new_resource.gdrcopy_checksum
checksum gdrcopy_checksum
action :create_if_missing
end

Expand All @@ -58,7 +63,7 @@
code <<-GDRCOPY_INSTALL
set -e
tar -xf #{gdrcopy_tarball}
cd gdrcopy-#{new_resource.gdrcopy_version}/packages
cd gdrcopy-#{gdrcopy_version}/packages
#{installation_code}
GDRCOPY_INSTALL
end
Expand All @@ -85,7 +90,7 @@
action :configure do
return if on_docker?
# Save gdrcopy version for InSpec tests
node.default['cluster']['nvidia']['gdrcopy']['version'] = new_resource.gdrcopy_version
node.default['cluster']['nvidia']['gdrcopy']['version'] = gdrcopy_version
node.default['cluster']['nvidia']['gdrcopy']['service'] = gdrcopy_service
node_attributes 'dump node attributes'

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
require 'spec_helper'

class ConvergeGdrcopy
def self.setup(chef_run, gdrcopy_version: nil, gdrcopy_checksum: nil)
def self.setup(chef_run)
chef_run.converge_dsl('aws-parallelcluster-platform') do
gdrcopy 'setup' do
gdrcopy_version gdrcopy_version
gdrcopy_checksum gdrcopy_checksum
action :setup
end
end
Expand Down Expand Up @@ -40,7 +38,7 @@ def self.configure(chef_run)
end
end
cached(:resource) do
ConvergeGdrcopy.setup(chef_run, gdrcopy_version: gdrcopy_version)
ConvergeGdrcopy.setup(chef_run)
chef_run.find_resource('gdrcopy', 'setup')
end

Expand Down Expand Up @@ -122,6 +120,54 @@ def self.configure(chef_run)
end
end

# Checks the version exposed by the gdrcopy resource on every OS:
# 2.3.1 on centos, 2.4 everywhere else.
describe 'gdrcopy:gdrcopy_version' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        # nvidia_enabled? is stubbed to false for this converge.
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end
      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy version' do
        pinned_version = platform == 'centos' ? '2.3.1' : '2.4'
        expect(resource.gdrcopy_version).to eq(pinned_version)
      end
    end
  end
end

# Checks the checksum exposed by the gdrcopy resource on every OS:
# centos has its own value, all other OSes share one.
describe 'gdrcopy:gdrcopy_checksum' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        # nvidia_enabled? is stubbed to false for this converge.
        allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
        ConvergeGdrcopy.setup(runner(platform: platform, version: version, step_into: ['gdrcopy']))
      end
      cached(:resource) { chef_run.find_resource('gdrcopy', 'setup') }

      it 'returns the expected gdrcopy checksum' do
        pinned_checksum =
          case platform
          when 'centos' then '59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
          else '39e74d505ca16160567f109cc23478580d157da897f134989df1d563e55f7a5b'
          end
        expect(resource.gdrcopy_checksum).to eq(pinned_checksum)
      end
    end
  end
end

describe 'gdrcopy:setup' do
for_all_oses do |platform, version|
context "on #{platform}#{version} when gdrcopy not enabled" do
Expand All @@ -140,8 +186,14 @@ def self.configure(chef_run)

context "on #{platform}#{version} when gdrcopy enabled" do
cached(:sources_dir) { 'sources_dir' }
cached(:gdrcopy_version) { 'gdrcopy_version' }
cached(:gdrcopy_checksum) { 'gdrcopy_checksum' }
cached(:gdrcopy_version) { platform == 'centos' ? '2.3.1' : '2.4' }
cached(:gdrcopy_checksum) do
if platform == 'centos'
'59b3cc97a4fc6008a5407506d9e67ecc4144cfad61c261217fabcb671cd30ca8'
else
'39e74d505ca16160567f109cc23478580d157da897f134989df1d563e55f7a5b'
end
end
cached(:gdrcopy_service) { platform == 'ubuntu' ? 'gdrdrv' : 'gdrcopy' }
cached(:gdrcopy_tarball) { "#{sources_dir}/gdrcopy-#{gdrcopy_version}.tar.gz" }
cached(:gdrcopy_url) { "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v#{gdrcopy_version}.tar.gz" }
Expand Down Expand Up @@ -173,7 +225,7 @@ def self.configure(chef_run)
runner = runner(platform: platform, version: version, step_into: ['gdrcopy']) do |node|
node.override['cluster']['sources_dir'] = sources_dir
end
ConvergeGdrcopy.setup(runner, gdrcopy_version: gdrcopy_version, gdrcopy_checksum: gdrcopy_checksum)
ConvergeGdrcopy.setup(runner)
end
cached(:node) { chef_run.node }

Expand Down Expand Up @@ -217,6 +269,11 @@ def self.configure(chef_run)
expect(installation_code).to match(/dpkg -i libgdrapi_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
expect(installation_code).to match(/dpkg -i gdrcopy-tests_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}\+cuda\*.deb/)
expect(installation_code).to match(/dpkg -i gdrcopy_#{gdrcopy_version}-1_#{gdrcopy_arch}.#{gdrcopy_platform}.deb/)
elsif platform == 'centos'
expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms || rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.noarch.#{gdrcopy_platform}.rpm/)
expect(installation_code).to match(/rpm -q gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch} || rpm -Uvh gdrcopy-#{gdrcopy_version}-1.#{gdrcopy_arch}.#{gdrcopy_platform}.rpm/)
expect(installation_code).to match(/rpm -q gdrcopy-devel-#{gdrcopy_version}-1.noarch || rpm -Uvh gdrcopy-devel-#{gdrcopy_version}-1.noarch.#{gdrcopy_platform}.rpm/)
else
expect(installation_code).to match(%r{CUDA=/usr/local/cuda ./build-rpm-packages.sh})
expect(installation_code).to match(/rpm -q gdrcopy-kmod-#{gdrcopy_version}-1dkms || rpm -Uvh gdrcopy-kmod-#{gdrcopy_version}-1dkms.#{gdrcopy_platform}.noarch.rpm/)
Expand Down

0 comments on commit 37c199a

Please sign in to comment.