Skip to content

Commit

Permalink
Fix the way Pyxis and Enroot are configured.
Browse files Browse the repository at this point in the history
  1. Pyxis is disabled by default. In particular, the SPANK config file and the Pyxis config file required to enable it are stored in the `/opt/parallelcluster/examples` folder, so that they are ineffective but can be used by the user to enable Pyxis by simply moving them to the expected location.

  2. Move Pyxis and Enroot configuration to build time (there was no reason to configure Pyxis and Enroot at runtime)

  3. Changed Pyxis runtime path to `/run/pyxis`. As per [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#slurm-plugstack-configuration) a tmpfs should be used. As a consequence, we needed to define a tmpfiles config to make sure that the dedicated folder is not deleted at boot time.

  4. Changed Enroot paths, following the suggestion in the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#enroot-configuration-example)
    1. Using tmpfs storage for `ENROOT_RUNTIME_PATH` and `ENROOT_DATA_PATH`
    2. Using a persistent local storage for `ENROOT_CACHE_PATH` and `ENROOT_CONFIG_PATH`.

  5. *Minor*: Moved Pyxis attributes from platform cookbook to slurm cookbook because Pyxis is a SLURM plugin so it would be conceptually wrong to have its attributes defined in platform cookbook.

Signed-off-by: Giacomo Marciani <[email protected]>
  • Loading branch information
gmarciani committed Oct 15, 2024
1 parent 0963973 commit f293655
Show file tree
Hide file tree
Showing 17 changed files with 349 additions and 109 deletions.
5 changes: 3 additions & 2 deletions cookbooks/aws-parallelcluster-platform/attributes/platform.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
# ArmPL
default['conditions']['arm_pl_supported'] = arm_instance?

# Enroot + Pyxis
# Enroot
default['cluster']['enroot']['version'] = '3.4.1'
default['cluster']['pyxis']['version'] = '0.20.0'
default['cluster']['enroot']['temporary_dir'] = '/run/enroot'
default['cluster']['enroot']['persistent_dir'] = '/var/enroot'

# NVidia
default['cluster']['nvidia']['enabled'] = 'no'
Expand Down
3 changes: 0 additions & 3 deletions cookbooks/aws-parallelcluster-platform/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,3 @@
include_recipe 'aws-parallelcluster-platform::supervisord_config'
fetch_config 'Fetch and load cluster configs'
include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'
enroot 'Configure Enroot' do
action :configure
end
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
directory node['cluster']['license_dir']
directory node['cluster']['configs_dir']
directory node['cluster']['shared_dir']
directory node['cluster']['examples_dir']
directory node['cluster']['shared_dir_login_nodes']

# Create ParallelCluster log folder
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
Expand All @@ -18,50 +18,36 @@
action :setup do
return if on_docker?
action_install_package
end

action :configure do
return if on_docker?
return unless enroot_installed

cookbook_file "/tmp/enroot.template.conf" do
source 'enroot/enroot.template.conf'
cookbook 'aws-parallelcluster-platform'
template "/etc/enroot/enroot.conf" do
source 'enroot/enroot.conf.erb'
owner 'root'
group 'root'
mode '0755'
action :create_if_missing
mode '0644'
end

bash "Configure enroot" do
user 'root'
code <<-ENROOT_CONFIGURE
set -e
ENROOT_CONFIG_RELEASE=pyxis
SHARED_DIR=#{node['cluster']['shared_dir']}
NONROOT_USER=#{node['cluster']['cluster_user']}
mkdir -p ${SHARED_DIR}/enroot
chown ${NONROOT_USER} ${SHARED_DIR}/enroot
ENROOT_CACHE_PATH=${SHARED_DIR}/enroot envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
mv /tmp/enroot.conf /etc/enroot/enroot.conf
chmod 0644 /etc/enroot/enroot.conf
mkdir -p /tmp/enroot
chmod 1777 /tmp/enroot
mkdir -p /tmp/enroot/data
chmod 1777 /tmp/enroot/data
chmod 1777 ${SHARED_DIR}/enroot
directory node['cluster']['enroot']['persistent_dir'] do
owner 'root'
group 'root'
mode '1777'
recursive true
end

mkdir -p ${SHARED_DIR}/pyxis/
chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
directory node['cluster']['enroot']['temporary_dir'] do
owner 'root'
group 'root'
mode '1777'
recursive true
end

ENROOT_CONFIGURE
retries 3
retry_delay 5
# We assume the Enroot temporary dir to be a temporary folder in /run.
# Folders in /run must be defined in /usr/lib/tmpfiles.d, otherwise they get
# deleted on node boot.
template "/usr/lib/tmpfiles.d/enroot.conf" do
source 'enroot/tmpfiles/enroot.conf.erb'
owner 'root'
group 'root'
mode '0644'
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
is_expected.to create_directory(node['cluster']['shared_dir'])
end

it 'creates examples directory' do
is_expected.to create_directory(node['cluster']['examples_dir'])
end

it 'creates log directory' do
is_expected.to create_directory(node['cluster']['log_base_dir']).with(
owner: 'root',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,9 @@ def self.setup(chef_run)
end
end
end

def self.configure(chef_run)
chef_run.converge_dsl('aws-parallelcluster-platform') do
enroot 'configure' do
action :configure
end
end
end
end

describe 'enroot:package_version' do
describe 'aws-parallelcluster-platform::enroot:package_version' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:chef_run) do
Expand All @@ -39,7 +31,7 @@ def self.configure(chef_run)
end
end

describe 'enroot:arch_suffix' do
describe 'aws-parallelcluster-platform::enroot:arch_suffix' do
for_all_oses do |platform, version|
context "on #{platform}#{version} - arm" do
cached(:chef_run) do
Expand Down Expand Up @@ -81,15 +73,73 @@ def self.configure(chef_run)
end
end

describe 'enroot:setup' do
describe 'aws-parallelcluster-platform::enroot:setup' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:enroot_persistent_dir) { '/path/to/enroot/persistent/dir' }
cached(:enroot_temporary_dir) { '/path/to/enroot/temporary/dir' }

let(:chef_run) do
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
node.override['cluster']['enroot']['version'] = package_version
node.override['cluster']['enroot']['persistent_dir'] = enroot_persistent_dir
node.override['cluster']['enroot']['temporary_dir'] = enroot_temporary_dir
end
end

before do
ConvergeEnroot.setup(chef_run)
end

it 'creates the Enroot configuration' do
is_expected.to create_template('/etc/enroot/enroot.conf').with(
source: 'enroot/enroot.conf.erb',
owner: 'root',
group: 'root',
mode: '0644'
)
end

# it 'the Enroot configuration has the correct content' do
# is_expected.to render_file('/etc/enroot/enroot.conf')
# .with_content("ENROOT_RUNTIME_PATH #{enroot_temporary_dir}/runtime/user-$(id -u)")
# .with_content("ENROOT_DATA_PATH #{enroot_temporary_dir}/data/user-$(id -u)")
# .with_content("ENROOT_CONFIG_PATH #{enroot_persistent_dir}/config/user-$(id -u)")
# .with_content("ENROOT_CACHE_PATH #{enroot_persistent_dir}/cache/group-$(id -g)")
# end

it 'creates the Enroot persistent directory' do
is_expected.to create_directory(enroot_persistent_dir).with(
owner: 'root',
group: 'root',
mode: '1777',
recursive: true
)
end

it 'creates the Enroot temporary directory' do
is_expected.to create_directory(enroot_temporary_dir).with(
owner: 'root',
group: 'root',
mode: '1777',
recursive: true
)
end

it 'creates the Enroot tmpfiles.d configuration' do
is_expected.to create_template('/usr/lib/tmpfiles.d/enroot.conf').with(
source: 'enroot/tmpfiles/enroot.conf.erb',
owner: 'root',
group: 'root',
mode: '0644'
)
end

# it 'the Enroot tmpfile.d configuration has the correct content' do
# is_expected.to render_file('/usr/lib/tmpfiles.d/enroot.conf')
# .with_content("D #{enroot_temporary_dir} 0777 root root")
# end

context 'when nvidia is enabled' do
before do
stubs_for_provider('enroot') do |resource|
Expand Down Expand Up @@ -128,44 +178,3 @@ def self.configure(chef_run)
end
end
end

describe 'enroot:configure' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
let(:chef_run) do
runner(platform: platform, version: version, step_into: ['enroot'])
end

context 'when enroot is installed' do
before do
stubs_for_provider('enroot') do |resource|
allow(resource).to receive(:enroot_installed).and_return(true)
end
ConvergeEnroot.configure(chef_run)
end
it 'run configure enroot script' do
is_expected.to run_bash('Configure enroot')
.with(retries: 3)
.with(retry_delay: 5)
.with(user: 'root')
end
end

context 'when enroot is not installed' do
before do
stubs_for_provider('enroot') do |resource|
allow(resource).to receive(:enroot_installed).and_return(false)
end
ConvergeEnroot.configure(chef_run)
end

it 'does not run configure enroot script' do
is_expected.not_to run_bash('Configure enroot')
.with(retries: 3)
.with(retry_delay: 5)
.with(user: 'root')
end
end
end
end
end
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ENROOT_LIBRARY_PATH /usr/lib/enroot
#ENROOT_SYSCONF_PATH /etc/enroot
ENROOT_RUNTIME_PATH /tmp/enroot/user-$(id -u)
ENROOT_CONFIG_PATH ${ENROOT_CONFIG_PATH}
ENROOT_CACHE_PATH ${ENROOT_CACHE_PATH}
ENROOT_DATA_PATH /tmp/enroot/data/user-$(id -u)
ENROOT_RUNTIME_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/runtime/user-$(id -u)
ENROOT_DATA_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/data/user-$(id -u)
ENROOT_CONFIG_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/config/user-$(id -u)
ENROOT_CACHE_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/cache/group-$(id -g)
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}

# Gzip program used to uncompress digest layers.
Expand Down Expand Up @@ -68,4 +68,4 @@ ENROOT_RESTRICT_DEV no
#all_proxy
#no_proxy
#http_proxy
#https_proxy
#https_proxy
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Folders under /run do not survive a reboot, so the Enroot temporary directory
# is declared here to be recreated at boot.
# Mode 1777 (world-writable with the sticky bit) matches the mode the cookbook
# sets on this directory when it creates it.
D <%= node['cluster']['enroot']['temporary_dir'] %> 1777 root root
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,36 @@

expected_enroot_version = node['cluster']['enroot']['version']

describe "gdrcopy version is expected to be #{expected_enroot_version}" do
describe "enroot version is expected to be #{expected_enroot_version}" do
subject { command('enroot version').stdout.strip() }
it { should eq expected_enroot_version }
end

persistent_dirs = %w[/etc/enroot /var/enroot]
persistent_dirs.each do |path|
describe directory(path) do
it { should exist }
its('owner') { should eq 'root' }
its('group') { should eq 'root' }
its('mode') { should cmp '01777' }
end
end

temporary_dirs = [ "/run/enroot" ]
temporary_dirs.each do |path|
describe directory(path) do
it { should exist }
its('owner') { should eq 'root' }
its('group') { should eq 'root' }
its('mode') { should cmp '01777' }
end
end
end

control 'tag:config_enroot_enabled_on_graphic_instances' do
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }

describe file("/opt/parallelcluster/shared/enroot") do
describe file("/var/enroot/cache-group-1000") do
it { should exist }
its('group') { should eq 'root' }
end unless os_properties.redhat_on_docker?
Expand Down
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-shared/attributes/cluster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Directories derived from the cluster base directory (node['cluster']['base_dir']).
default['cluster']['license_dir'] = "#{node['cluster']['base_dir']}/licenses"
default['cluster']['configs_dir'] = "#{node['cluster']['base_dir']}/configs"
default['cluster']['shared_dir'] = "#{node['cluster']['base_dir']}/shared"
# NOTE(review): presumably holds example config files that ship disabled
# (e.g. the SPANK and Pyxis configs) until the user moves them to their expected location - confirm.
default['cluster']['examples_dir'] = "#{node['cluster']['base_dir']}/examples"
default['cluster']['shared_dir_login_nodes'] = "#{node['cluster']['base_dir']}/shared_login_nodes"
# Fixed absolute paths (not derived from base_dir).
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
default['cluster']['etc_dir'] = '/etc/parallelcluster'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,10 @@

# Slurmdbd
default['cluster']['slurmdbd_service_enabled'] = "true"

# SPANK: plugstack.conf.d directory under the Slurm install dir's etc folder
default['cluster']['slurm']['spank_config_dir'] = "#{node['cluster']['slurm']['install_dir']}/etc/plugstack.conf.d"

# Pyxis (a Slurm plugin, hence its attributes are defined alongside the Slurm ones)
default['cluster']['pyxis']['version'] = '0.20.0'
# Runtime path is placed under /run; NOTE(review): assumes /run is a tmpfs, as the Pyxis setup docs recommend - confirm.
default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'
Loading

0 comments on commit f293655

Please sign in to comment.