Skip to content

Commit

Permalink
[DRAFT] Pyxis fix.
Browse files Browse the repository at this point in the history
Signed-off-by: Giacomo Marciani <[email protected]>
  • Loading branch information
gmarciani committed Oct 15, 2024
1 parent 0963973 commit 6c68565
Show file tree
Hide file tree
Showing 15 changed files with 159 additions and 106 deletions.
5 changes: 3 additions & 2 deletions cookbooks/aws-parallelcluster-platform/attributes/platform.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
# ArmPL
default['conditions']['arm_pl_supported'] = arm_instance?

# Enroot + Pyxis
# Enroot
default['cluster']['enroot']['version'] = '3.4.1'
default['cluster']['pyxis']['version'] = '0.20.0'
default['cluster']['enroot']['temporary_dir'] = '/run/enroot'
default['cluster']['enroot']['persistent_dir'] = '/var/enroot'

# NVidia
default['cluster']['nvidia']['enabled'] = 'no'
Expand Down
3 changes: 0 additions & 3 deletions cookbooks/aws-parallelcluster-platform/recipes/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,3 @@
include_recipe 'aws-parallelcluster-platform::supervisord_config'
fetch_config 'Fetch and load cluster configs'
include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'
enroot 'Configure Enroot' do
action :configure
end
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
directory node['cluster']['license_dir']
directory node['cluster']['configs_dir']
directory node['cluster']['shared_dir']
directory node['cluster']['examples_dir']
directory node['cluster']['shared_dir_login_nodes']

# Create ParallelCluster log folder
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# frozen_string_literal: true
#
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
Expand All @@ -18,50 +18,36 @@
action :setup do
return if on_docker?
action_install_package
end

action :configure do
return if on_docker?
return unless enroot_installed

cookbook_file "/tmp/enroot.template.conf" do
source 'enroot/enroot.template.conf'
cookbook 'aws-parallelcluster-platform'
template "/etc/enroot/enroot.conf" do
source 'enroot/enroot.conf.erb'
owner 'root'
group 'root'
mode '0755'
action :create_if_missing
mode '0644'
end

bash "Configure enroot" do
user 'root'
code <<-ENROOT_CONFIGURE
set -e
ENROOT_CONFIG_RELEASE=pyxis
SHARED_DIR=#{node['cluster']['shared_dir']}
NONROOT_USER=#{node['cluster']['cluster_user']}
mkdir -p ${SHARED_DIR}/enroot
chown ${NONROOT_USER} ${SHARED_DIR}/enroot
ENROOT_CACHE_PATH=${SHARED_DIR}/enroot envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
mv /tmp/enroot.conf /etc/enroot/enroot.conf
chmod 0644 /etc/enroot/enroot.conf
mkdir -p /tmp/enroot
chmod 1777 /tmp/enroot
mkdir -p /tmp/enroot/data
chmod 1777 /tmp/enroot/data
chmod 1777 ${SHARED_DIR}/enroot
directory node['cluster']['enroot']['persistent_dir'] do
owner 'root'
group 'root'
mode '1777'
recursive true
end

mkdir -p ${SHARED_DIR}/pyxis/
chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
directory node['cluster']['enroot']['temporary_dir'] do
owner 'root'
group 'root'
mode '1777'
recursive true
end

ENROOT_CONFIGURE
retries 3
retry_delay 5
# We assume the Enroot temporary dir to be a temporary folder in /run.
# Folders in /run must be defined in /usr/lib/tmpfiles.d, otherwise they get
# deleted on node boot.
template "/usr/lib/tmpfiles.d/enroot.conf" do
source 'enroot/tmpfiles/enroot.conf.erb'
owner 'root'
group 'root'
mode '0644'
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,6 @@ def self.setup(chef_run)
end
end
end

def self.configure(chef_run)
chef_run.converge_dsl('aws-parallelcluster-platform') do
enroot 'configure' do
action :configure
end
end
end
end

describe 'enroot:package_version' do
Expand Down Expand Up @@ -128,44 +120,3 @@ def self.configure(chef_run)
end
end
end

describe 'enroot:configure' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
let(:chef_run) do
runner(platform: platform, version: version, step_into: ['enroot'])
end

context 'when enroot is installed' do
before do
stubs_for_provider('enroot') do |resource|
allow(resource).to receive(:enroot_installed).and_return(true)
end
ConvergeEnroot.configure(chef_run)
end
it 'run configure enroot script' do
is_expected.to run_bash('Configure enroot')
.with(retries: 3)
.with(retry_delay: 5)
.with(user: 'root')
end
end

context 'when enroot is not installed' do
before do
stubs_for_provider('enroot') do |resource|
allow(resource).to receive(:enroot_installed).and_return(false)
end
ConvergeEnroot.configure(chef_run)
end

it 'does not run configure enroot script' do
is_expected.not_to run_bash('Configure enroot')
.with(retries: 3)
.with(retry_delay: 5)
.with(user: 'root')
end
end
end
end
end
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#ENROOT_LIBRARY_PATH /usr/lib/enroot
#ENROOT_SYSCONF_PATH /etc/enroot
ENROOT_RUNTIME_PATH /tmp/enroot/user-$(id -u)
ENROOT_CONFIG_PATH ${ENROOT_CONFIG_PATH}
ENROOT_CACHE_PATH ${ENROOT_CACHE_PATH}
ENROOT_DATA_PATH /tmp/enroot/data/user-$(id -u)
ENROOT_RUNTIME_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/runtime/user-$(id -u)
ENROOT_DATA_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/data/user-$(id -u)
ENROOT_CONFIG_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/config/user-$(id -u)
ENROOT_CACHE_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/cache/group-$(id -g)
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}

# Gzip program used to uncompress digest layers.
Expand Down Expand Up @@ -68,4 +68,4 @@ ENROOT_RESTRICT_DEV no
#all_proxy
#no_proxy
#http_proxy
#https_proxy
#https_proxy
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Recreate the Enroot temporary directory at boot.
# Mode 1777 (world-writable + sticky bit) matches the mode set by the enroot resource.
D <%= node['cluster']['enroot']['temporary_dir'] %> 1777 root root
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,37 @@

expected_enroot_version = node['cluster']['enroot']['version']

describe "gdrcopy version is expected to be #{expected_enroot_version}" do
describe "enroot version is expected to be #{expected_enroot_version}" do
subject { command('enroot version').stdout.strip() }
it { should eq expected_enroot_version }
end

# The enroot resource creates exactly two directories, both root:root with
# mode 1777 (world-writable + sticky bit): the persistent dir and the temporary dir.
# Per-user/per-group sub-folders referenced by enroot.conf are not created by the
# cookbook, so they are not asserted here.
enroot_dirs = [
  node['cluster']['enroot']['persistent_dir'],
  node['cluster']['enroot']['temporary_dir'],
]

enroot_dirs.each do |path|
  describe directory(path) do
    it { should exist }
    its('mode') { should cmp '01777' }
    its('owner') { should eq 'root' }
    its('group') { should eq 'root' }
  end
end
end

control 'tag:config_enroot_enabled_on_graphic_instances' do
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }

describe file("/opt/parallelcluster/shared/enroot") do
describe file("/etc/enroot/enroot-cache") do
it { should exist }
its('group') { should eq 'root' }
end unless os_properties.redhat_on_docker?
Expand Down
1 change: 1 addition & 0 deletions cookbooks/aws-parallelcluster-shared/attributes/cluster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
default['cluster']['license_dir'] = "#{node['cluster']['base_dir']}/licenses"
default['cluster']['configs_dir'] = "#{node['cluster']['base_dir']}/configs"
default['cluster']['shared_dir'] = "#{node['cluster']['base_dir']}/shared"
default['cluster']['examples_dir'] = "#{node['cluster']['base_dir']}/examples"
default['cluster']['shared_dir_login_nodes'] = "#{node['cluster']['base_dir']}/shared_login_nodes"
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
default['cluster']['etc_dir'] = '/etc/parallelcluster'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,10 @@

# Slurmdbd
default['cluster']['slurmdbd_service_enabled'] = "true"

# Spank
default['cluster']['slurm']['spank_config_dir'] = "#{node['cluster']['slurm']['install_dir']}/etc/plugstack.conf.d"

# Pyxis
default['cluster']['pyxis']['version'] = '0.20.0'
default['cluster']['pyxis']['runtime_path'] = '/run/pyxis'
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Cookbook:: aws-parallelcluster-slurm
# Recipe:: install_pyxis
#
# Copyright:: Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
Expand All @@ -21,6 +21,9 @@
pyxis_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/pyxis/v#{pyxis_version}.tar.gz"
pyxis_tarball = "#{node['cluster']['sources_dir']}/pyxis-#{pyxis_version}.tar.gz"

spank_examples_dir = "#{node['cluster']['examples_dir']}/spank"
pyxis_examples_dir = "#{node['cluster']['examples_dir']}/pyxis"

remote_file pyxis_tarball do
source pyxis_url
mode '0644'
Expand All @@ -35,12 +38,59 @@
set -e
tar xf #{pyxis_tarball} -C /tmp
cd /tmp/pyxis-#{pyxis_version}
CPPFLAGS='-I /opt/slurm/include/' make
CPPFLAGS='-I /opt/slurm/include/' make install
mkdir -p /opt/slurm/etc/plugstack.conf.d
echo -e 'include /opt/slurm/etc/plugstack.conf.d/*' | tee /opt/slurm/etc/plugstack.conf
ln -fs /usr/local/share/pyxis/pyxis.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
CPPFLAGS='-I #{node['cluster']['slurm']['install_dir']}/include/' make
CPPFLAGS='-I #{node['cluster']['slurm']['install_dir']}/include/' make install
PYXIS_INSTALL
retries 3
retry_delay 5
end

# Spank configurations

# Spank plugin configuration directory (root:root, 0755), created with parents.
directory node['cluster']['slurm']['spank_config_dir'] do
  recursive true
  owner 'root'
  group 'root'
  mode '0755'
end

# Folder holding the example Spank configuration.
directory spank_examples_dir

# Render the example plugstack.conf into the Spank examples folder.
template "#{spank_examples_dir}/plugstack.conf" do
  source 'pyxis/plugstack.conf.erb'
  mode '0644'
  owner 'root'
  group 'root'
end

# Pyxis configurations

# Pyxis runtime directory: world-writable with the sticky bit, owned by root.
directory node['cluster']['pyxis']['runtime_path'] do
  recursive true
  mode '1777'
  owner 'root'
  group 'root'
end

# The Pyxis runtime path is assumed to be a temporary folder under /run.
# Anything under /run is wiped on node boot unless it is declared in
# /usr/lib/tmpfiles.d, hence the tmpfiles definition rendered below.
template '/usr/lib/tmpfiles.d/pyxis.conf' do
  source 'pyxis/tmpfiles/pyxis.conf.erb'
  mode '0644'
  owner 'root'
  group 'root'
end

# Creates a symlink located at /usr/local/share/pyxis/pyxis.conf whose target is
# pyxis.conf inside the Spank config directory.
# NOTE(review): the link lives in /usr/local/share/pyxis and points INTO the Spank
# config dir, not the other way round. Confirm this direction is intended and that
# the target file is expected to exist when the link is created.
link '/usr/local/share/pyxis/pyxis.conf' do
to "#{node['cluster']['slurm']['spank_config_dir']}/pyxis.conf"
end

# Folder holding the example Pyxis configuration.
directory pyxis_examples_dir

# Render the example pyxis.conf into the Pyxis examples folder.
template "#{pyxis_examples_dir}/pyxis.conf" do
  source 'pyxis/pyxis.conf.erb'
  mode '0644'
  owner 'root'
  group 'root'
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# THIS IS AN EXAMPLE OF Spank config file
# When you want to enable please move this to /opt/slurm/etc/plugstack.conf
include <%= node['cluster']['slurm']['spank_config_dir'] %>/*
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# THIS IS AN EXAMPLE OF pyxis.conf file
# When you want to enable please move this to /opt/slurm/etc/plugstack.conf.d/pyxis.conf
required /usr/local/lib/slurm/spank_pyxis.so runtime_path=<%= node['cluster']['pyxis']['runtime_path'] %>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Recreate the Pyxis runtime directory at boot.
# Mode 1777 (world-writable + sticky bit) matches the mode set by the install_pyxis recipe.
D <%= node['cluster']['pyxis']['runtime_path'] %> 1777 root root
32 changes: 31 additions & 1 deletion cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,37 @@

title 'Checks Pyxis has been installed'

describe file("/opt/slurm/etc/plugstack.conf.d/pyxis.conf") do
# The Slurm etc directory must exist as root:root with mode 0755.
describe directory('/opt/slurm/etc') do
  it { should exist }
  its('owner') { should eq 'root' }
  its('group') { should eq 'root' }
  its('mode') { should cmp '0755' }
end

# Example configs are rendered under the cluster examples dir, which is a sibling
# of the configs dir (not nested inside it), so read the path from the node
# attribute instead of hard-coding it.
base_dir = node['cluster']['examples_dir']
dirs = [ base_dir, "#{base_dir}/spank", "#{base_dir}/pyxis" ]
dirs.each do |path|
  describe directory(path) do
    it { should exist }
  end
end

# The install_pyxis recipe creates the Pyxis runtime path as root:root with
# mode 1777 (world-writable + sticky bit); assert exactly that.
describe directory(node['cluster']['pyxis']['runtime_path']) do
  it { should exist }
  its('mode') { should cmp '01777' }
  its('owner') { should eq 'root' }
  its('group') { should eq 'root' }
end

# The Spank plugstack include directory must exist and belong to root:root.
describe directory('/opt/slurm/etc/plugstack.conf.d') do
  it { should exist }
  its('group') { should eq 'root' }
  its('owner') { should eq 'root' }
end

# Both example configuration files must have been rendered.
%w(pyxis/pyxis.conf spank/plugstack.conf).each do |example|
  describe file("#{base_dir}/#{example}") do
    it { should exist }
  end
end
end

0 comments on commit 6c68565

Please sign in to comment.