Skip to content

Commit

Permalink
Install enroot + pyxis in AMIs during build process
Browse files Browse the repository at this point in the history
  • Loading branch information
hgreebe committed Aug 22, 2024
1 parent 47f2c6a commit 1931138
Show file tree
Hide file tree
Showing 13 changed files with 310 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cookbooks/aws-parallelcluster-platform/attributes/platform.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
# ArmPL
# Set from arm_instance? (helper defined outside this snippet).
default['conditions']['arm_pl_supported'] = arm_instance?

# Enroot + Pyxis
# Enroot release; the enroot resource's package_version helper reads this value to build download URLs.
default['cluster']['enroot']['version'] = '3.4.1'
# NOTE(review): the install_pyxis recipe hard-codes its own version ("0.20.0") and does not read
# this attribute -- confirm which value is authoritative.
default['cluster']['pyxis']['version'] = 'v0.19.0'

# NVidia
default['cluster']['nvidia']['enabled'] = 'no'
default['cluster']['nvidia']['driver_version'] = '535.183.01'
Expand Down
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-platform/recipes/install.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@
# Run the intel_mpi recipe from this cookbook.
include_recipe "aws-parallelcluster-platform::intel_mpi"
arm_pl 'Install ARM Performance Library'
intel_hpc 'Setup Intel HPC'
# Declare the enroot custom resource with no explicit action, so its default action applies.
enroot 'Setup Enroot'
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this resource as :enroot on the 'amazon' platform when the integer part of
# platform_version is at least 8 (so "2023" matches and "2" does not).
provides(:enroot, platform: 'amazon') { |target| target['platform_version'].to_i >= 8 }

# Pull in the shared enroot partials.
use 'partial/_enroot_common.rb'
use 'partial/_enroot_rhel.rb'

# Names of the OS packages to install before enroot on this platform.
#
# @return [Array<String>]
def prerequisites
  %w[
    jq squashfs-tools parallel
    pigz squashfuse zstd
  ]
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this resource as :enroot on the 'amazon' platform, platform_version '2'.
provides(:enroot, platform: 'amazon', platform_version: '2')

# Pull in the shared enroot partials.
use 'partial/_enroot_common.rb'
use 'partial/_enroot_rhel.rb'

# Names of the OS packages to install before enroot on this platform.
#
# @return [Array<String>]
def prerequisites
  %w[
    jq squashfs-tools parallel
    pigz squashfuse zstd
  ]
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this resource as :enroot on the 'redhat' platform when the integer part of
# platform_version is at least 8.
provides(:enroot, platform: 'redhat') { |target| target['platform_version'].to_i >= 8 }

# Pull in the shared enroot partials.
use 'partial/_enroot_common.rb'
use 'partial/_enroot_rhel.rb'

# Names of the OS packages to install before enroot on this platform
# (includes fuse-overlayfs).
#
# @return [Array<String>]
def prerequisites
  %w[
    jq fuse-overlayfs squashfs-tools parallel
    pigz squashfuse zstd
  ]
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this resource as :enroot on the 'rocky' platform when the integer part of
# platform_version is at least 8.
provides(:enroot, platform: 'rocky') { |target| target['platform_version'].to_i >= 8 }

# Pull in the shared enroot partials.
use 'partial/_enroot_common.rb'
use 'partial/_enroot_rhel.rb'

# Names of the OS packages to install before enroot on this platform
# (includes fuse-overlayfs).
#
# @return [Array<String>]
def prerequisites
  %w[
    jq fuse-overlayfs squashfs-tools parallel
    pigz squashfuse zstd
  ]
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

# Copyright:: 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Register this resource as :enroot on the 'ubuntu' platform when the integer part of
# platform_version is at least 20 (e.g. "20.04", "22.04").
provides(:enroot, platform: 'ubuntu') { |target| target['platform_version'].to_i >= 20 }

# Pull in the shared enroot partial and the Debian-family partial.
use 'partial/_enroot_common.rb'
use 'partial/_enroot_debian.rb'
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

unified_mode true
default_action :setup

# Runs the :install_package action, then downloads the enroot config template,
# renders it to /etc/enroot/enroot.conf with ENROOT_CACHE_PATH pointing at the
# shared enroot directory, and creates the world-writable (sticky) enroot directories.
action :setup do
  action_install_package

  # Resolve node attributes once; they are interpolated into the script below.
  enroot_shared_dir = "#{node['cluster']['shared_dir']}/enroot"
  enroot_owner = node['cluster']['cluster_user']

  bash "Configure enroot" do
    user 'root'
    retries 3
    retry_delay 5
    # ${ENROOT_CONFIG_RELEASE} is expanded by bash, not by Ruby.
    code <<-ENROOT_CONFIGURE
      set -e
      ENROOT_CONFIG_RELEASE=pyxis
      wget -O /tmp/enroot.template.conf https://raw.githubusercontent.com/aws-samples/aws-parallelcluster-post-install-scripts/${ENROOT_CONFIG_RELEASE}/pyxis/enroot.template.conf
      mkdir -p #{enroot_shared_dir}
      chown #{enroot_owner} #{enroot_shared_dir}
      ENROOT_CACHE_PATH=#{enroot_shared_dir} envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
      mv /tmp/enroot.conf /etc/enroot/enroot.conf
      chmod 0644 /etc/enroot/enroot.conf
      mkdir -p /tmp/enroot
      chmod 1777 /tmp/enroot
      mkdir -p /tmp/enroot/data
      chmod 1777 /tmp/enroot/data
      chmod 1777 #{enroot_shared_dir}
    ENROOT_CONFIGURE
  end
end

# Enroot version to install, read from node['cluster']['enroot']['version'].
#
# @return [String]
def package_version
  enroot_attributes = node['cluster']['enroot']
  enroot_attributes['version']
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Installs enroot on Debian-family systems: installs the prerequisite packages,
# downloads the enroot and enroot+caps .deb files into the sources directory,
# installs them, links the Slurm hooks into /etc/enroot/hooks.d and appends a
# PATH entry for slurmd to /etc/sysconfig/slurmd.
action :install_package do
  # Install exactly the two packages downloaded below. A "./*.deb" glob would also
  # pick up any unrelated .deb that happens to be in the shared sources directory.
  # curl -O saves each file under the last path segment of its URL.
  enroot_deb = ::File.basename(enroot_url)
  enroot_caps_deb = ::File.basename(enroot_caps_url)

  bash "Install enroot" do
    user 'root'
    cwd node['cluster']['sources_dir']
    # ln uses -f so a retry (or a second run) does not fail with "File exists".
    code <<-ENROOT_INSTALL
      set -e
      apt-get install -y jq squashfs-tools parallel fuse-overlayfs pigz squashfuse zstd
      curl -fSsL -O #{enroot_url}
      curl -fSsL -O #{enroot_caps_url}
      apt install -y ./#{enroot_deb} ./#{enroot_caps_deb}
      ln -sf /usr/share/enroot/hooks.d/50-slurm-pmi.sh /etc/enroot/hooks.d/
      ln -sf /usr/share/enroot/hooks.d/50-slurm-pytorch.sh /etc/enroot/hooks.d/
      mkdir -p /etc/sysconfig
      echo "PATH=/opt/slurm/sbin:/opt/slurm/bin:$(bash -c 'source /etc/environment ; echo $PATH')" >> /etc/sysconfig/slurmd
    ENROOT_INSTALL
    retries 3
    retry_delay 5
  end
end

# Download URL of the enroot .deb for the configured version and architecture.
#
# @return [String]
def enroot_url
  release = package_version
  deb_name = "enroot_#{release}-1_#{arch_suffix}.deb"
  "https://github.com/NVIDIA/enroot/releases/download/v#{release}/#{deb_name}"
end

# Download URL of the enroot+caps .deb for the configured version and architecture.
#
# @return [String]
def enroot_caps_url
  release = package_version
  deb_name = "enroot+caps_#{release}-1_#{arch_suffix}.deb"
  "https://github.com/NVIDIA/enroot/releases/download/v#{release}/#{deb_name}"
end

# Architecture token used in the .deb file names.
#
# @return [String] 'arm64' when arm_instance? is truthy, otherwise 'amd64'
def arch_suffix
  return 'arm64' if arm_instance?

  'amd64'
end

Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Installs enroot on RPM-based systems: first the packages returned by
# `prerequisites`, then the enroot and enroot+caps RPMs straight from their URLs.
action :install_package do
  package prerequisites do
    retries 3
    retry_delay 5
  end

  # Resolve both download URLs before building the script.
  enroot_rpm = enroot_url
  enroot_caps_rpm = enroot_caps_url

  bash "Install enroot" do
    user 'root'
    cwd node['cluster']['sources_dir']
    retries 3
    retry_delay 5
    code <<-ENROOT_INSTALL
      set -e
      yum install -y #{enroot_rpm}
      yum install -y #{enroot_caps_rpm}
    ENROOT_INSTALL
  end
end

# Download URL of the enroot el8 RPM for the configured version and architecture.
#
# @return [String]
def enroot_url
  release = package_version
  rpm_name = "enroot-#{release}-1.el8.#{arch_suffix}.rpm"
  "https://github.com/NVIDIA/enroot/releases/download/v#{release}/#{rpm_name}"
end

# Download URL of the enroot+caps el8 RPM for the configured version and architecture.
#
# @return [String]
def enroot_caps_url
  release = package_version
  rpm_name = "enroot+caps-#{release}-1.el8.#{arch_suffix}.rpm"
  "https://github.com/NVIDIA/enroot/releases/download/v#{release}/#{rpm_name}"
end

# Architecture token used in the RPM file names.
#
# @return [String] 'aarch64' when arm_instance? is truthy, otherwise 'x86_64'
def arch_suffix
  return 'aarch64' if arm_instance?

  'x86_64'
end

1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-slurm/recipes/install.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@
action :setup
end
include_recipe 'aws-parallelcluster-slurm::install_slurm'
# install_pyxis compiles against headers under /opt/slurm/include and writes into
# /opt/slurm/etc -- presumably why it is included after install_slurm.
include_recipe 'aws-parallelcluster-slurm::install_pyxis'
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# frozen_string_literal: true

#
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: install_pyxis
#
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Builds and installs the NVIDIA pyxis Slurm plugin from source, wires it into
# /opt/slurm/etc/plugstack.conf.d and points its runtime_path at a directory
# under the cluster shared dir.
#
# NOTE(review): node['cluster']['pyxis']['version'] is also defined (as 'v0.19.0') in the
# platform cookbook attributes but is not read here -- confirm which version is intended.
pyxis_version = '0.20.0'
# NVIDIA/pyxis release tags carry a leading "v" (v0.20.0); the bare number is not a valid ref.
pyxis_tag = "v#{pyxis_version}"
pyxis_build_dir = '/tmp/pyxis'

bash "Install pyxis" do
  user 'root'
  # The build dir is removed first so that a retry after a partial clone or build does not
  # fail with "destination path already exists".
  # ${SHARED_DIR} inside the single-quoted sed expression is written literally into
  # pyxis.conf and then expanded by envsubst on the following line.
  code <<-PYXIS_INSTALL
    set -e
    rm -rf #{pyxis_build_dir}
    git clone --depth 1 --branch #{pyxis_tag} https://github.com/NVIDIA/pyxis.git #{pyxis_build_dir}
    cd #{pyxis_build_dir}
    CPPFLAGS='-I /opt/slurm/include/' make
    CPPFLAGS='-I /opt/slurm/include/' make install
    mkdir -p /opt/slurm/etc/plugstack.conf.d
    echo -e 'include /opt/slurm/etc/plugstack.conf.d/*' | tee /opt/slurm/etc/plugstack.conf
    ln -fs /usr/local/share/pyxis/pyxis.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
    SHARED_DIR=#{node['cluster']['shared_dir']}
    NONROOT_USER=#{node['cluster']['cluster_user']}
    mkdir -p ${SHARED_DIR}/pyxis/
    chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
    sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
    SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
    mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
  PYXIS_INSTALL
  retries 3
  retry_delay 5
end
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
#!/bin/bash
# Wrapper: sources the AWS CLI default-config profile script, then runs
# slurm_fleet_status_manager from the node virtualenv as the cluster admin user,
# forwarding all command-line arguments unchanged.
source /etc/profile.d/aws-cli-default-config.sh
sudo -u <%= node['cluster']['cluster_admin_user'] %> <%= node_virtualenv_path %>/bin/slurm_fleet_status_manager "$@"

0 comments on commit 1931138

Please sign in to comment.