diff --git a/VERSION b/VERSION
index 6f4eebd..100435b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.8.1
+0.8.2
diff --git a/ci/cscs.yml b/ci/cscs.yml
new file mode 100644
index 0000000..b9c1a00
--- /dev/null
+++ b/ci/cscs.yml
@@ -0,0 +1,132 @@
+# CSCS GitLab CI pipeline: for each architecture, build the Slurm base image,
+# build the plugin image on top of it, then build the RPM and upload it as a
+# GitHub release asset.
+include:
+- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
+
+stages:
+  - baseimage x86_64
+  - build x86_64
+  - build rpm x86_64
+  - baseimage aarch64
+  - build aarch64
+  - build rpm aarch64
+
+# dynamic name for sha on watched files, slurm version, uarch
+.my-dynamic-image-name:
+  extends: [.dynamic-image-name]
+  before_script:
+    - DOCKER_TAG=`echo $(eval cat $WATCH_FILECHANGES; echo -n $slurm_version) $(uname -m) | sha256sum | head -c 16`
+    - export PERSIST_IMAGE_NAME=$PERSIST_IMAGE_NAME:$DOCKER_TAG
+    - echo "BASE_IMAGE=$PERSIST_IMAGE_NAME" > build.env
+
+# Settings shared by the per-architecture Slurm base image builds.
+.build slurm base:
+  timeout: 10h
+  variables:
+    CSCS_NOTIFICATION_CONTEXT: "$slurm_version"
+    DOCKERFILE: ci/slurm_docker/Dockerfile.base
+    DOCKER_BUILD_ARGS: '["SLURM_VERSION=$slurm_version"]'
+    WATCH_FILECHANGES: ci/slurm_docker/Dockerfile.base ci/slurm_docker/cgroup.conf ci/slurm_docker/entrypoint.sh ci/slurm_docker/install_slurm.sh ci/slurm_docker/slurm.conf.in
+    PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/slurm-base
+
+build slurm base x86_64:
+  stage: baseimage x86_64
+  extends: [.my-dynamic-image-name, '.build slurm base', .container-builder-cscs-zen2]
+
+build slurm base aarch64:
+  stage: baseimage aarch64
+  extends: [.my-dynamic-image-name, '.build slurm base', .container-builder-cscs-gh200]
+
+# Settings shared by the plugin image builds.
+# NOTE(review): BASE_IMAGE presumably comes from the build.env written by the
+# base image job -- confirm against the included CI templates.
+.build:
+  variables:
+    CSCS_REBUILD_POLICY: always
+    CSCS_NOTIFICATION_CONTEXT: "$slurm_version"
+    DOCKERFILE: ci/slurm_docker/Dockerfile
+    DOCKER_BUILD_ARGS: '["BASE_IMAGE=${BASE_IMAGE}"]'
+
+build x86_64:
+  needs: ["build slurm base x86_64"]
+  stage: build x86_64
+  extends: [.build, .container-builder-cscs-zen2]
+  variables:
+    PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/slurm-uenv-mount-x86_64
+
+build aarch64:
+  needs: ["build slurm base aarch64"]
+  stage: build aarch64
+  extends: [.build, .container-builder-cscs-gh200]
+  variables:
+    PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/public/slurm-uenv-mount-aarch64
+
+# Builds the RPM with rpm/make-rpm.sh and uploads it as an asset of the GitHub
+# release addressed by CI_COMMIT_REF_NAME.
+# NOTE(review): the GitHub upload endpoint is documented with a numeric
+# release id in the URL -- confirm that a ref name is accepted here.
+.build rpm upload artifact:
+  variables:
+    CSCS_NOTIFICATION_CONTEXT: "$slurm_version"
+  script:
+    - |
+      _rpm_build_dir=./
+      mkdir -p ${_rpm_build_dir}
+      CXX=g++-12 CC=gcc-12 /src/rpm/make-rpm.sh --slurm-version "${slurm_version}" ${_rpm_build_dir}
+      binary_rpm=$(find RPMS -name '*.rpm')
+      # upload release
+      # curl stores the response body in response.json and prints only the
+      # HTTP status code (-w), which is captured in http_code.
+      http_code=$(curl -L \
+        -X POST \
+        -o response.json \
+        -w "%{http_code}" \
+        -H "Accept: application/vnd.github+json" \
+        -H "Authorization: Bearer ${GHUB_WRITE_TOKEN}" \
+        -H "X-GitHub-Api-Version: 2022-11-28" \
+        -H "Content-Type: application/octet-stream" \
+        "https://uploads.github.com/repos/eth-cscs/slurm-uenv-mount/releases/${CI_COMMIT_REF_NAME}/assets?name=$(basename ${binary_rpm})" \
+        --data-binary "@${binary_rpm}")
+      echo "http_code: $http_code"
+      # https://docs.github.com/en/rest/releases/assets?apiVersion=2022-11-28#upload-a-release-asset--status-codes
+      case "$http_code" in
+        201)
+          echo "$http_code: Successfully uploaded $(basename ${binary_rpm}) to release ${CI_COMMIT_REF_NAME}."
+          cat response.json
+          ;;
+        400)
+          echo "$http_code: Bad request, couldn't upload release"
+          exit 1
+          ;;
+        422)
+          # GitHub documents 422 for an upload whose file name matches an existing
+          # asset; nothing is replaced in that case. Kept non-fatal as before.
+          echo "$http_code: $(basename ${binary_rpm}) was not uploaded to release ${CI_COMMIT_REF_NAME}: an asset with this name already exists."
+          cat response.json
+          ;;
+        *)
+          # Any other status was silently ignored before; report it, still non-fatal.
+          echo "$http_code: Unexpected response while uploading $(basename ${binary_rpm}) to release ${CI_COMMIT_REF_NAME}."
+          if [ -f response.json ]; then cat response.json; fi
+          ;;
+      esac
+
+
+
+build rpm x86_64 and upload artifact:
+  needs: ["build x86_64"]
+  image: $CSCS_REGISTRY_PATH/base/public/slurm-uenv-mount-x86_64
+  stage: build rpm x86_64
+  extends: ['.build rpm upload artifact', .container-runner-eiger-mc]
+
+build rpm aarch64 and upload artifact:
+  needs: ["build aarch64"]
+  image: $CSCS_REGISTRY_PATH/base/public/slurm-uenv-mount-aarch64
+  stage: build rpm aarch64
+  extends: ['.build rpm upload artifact', .f7t-container-runner]
+  variables:
+    F7T_URL: 'https://firecrest-todi.v1.tds.cscs.ch'
+    FIRECREST_SYSTEM: 'todi'
+    ARCH: 'aarch64'
+    USE_CE: 'YES'
diff --git a/ci/slurm_docker/Dockerfile b/ci/slurm_docker/Dockerfile
new file mode 100644
index 0000000..37456f4
--- /dev/null
+++ b/ci/slurm_docker/Dockerfile
@@ -0,0 +1,10 @@
+# Builds and installs slurm-uenv-mount on top of the image passed as BASE_IMAGE.
+ARG BASE_IMAGE
+FROM $BASE_IMAGE
+
+COPY . /src
+
+# Build with GCC 12, install, and register the plugin in Slurm's plugstack.conf.
+RUN CXX=g++-12 CC=gcc-12 meson setup builddir /src \
+  && meson install -C builddir \
+  && echo "required /usr/local/lib64/libslurm-uenv-mount.so" > /etc/slurm/plugstack.conf
diff --git a/ci/slurm_docker/Dockerfile.base b/ci/slurm_docker/Dockerfile.base
new file mode 100644
index 0000000..2fd9bfe
--- /dev/null
+++ b/ci/slurm_docker/Dockerfile.base
@@ -0,0 +1,107 @@
+# Base image for CI: openSUSE Leap with Slurm built from source, plus the
+# build and test tooling (Python, meson/ninja, bats) installed below.
+FROM opensuse/leap:15.4
+
+ARG SLURM_VERSION=23.02.7
+ARG SLURM_ROOT=/usr
+ARG SLURM_CONFDIR=/etc/slurm
+
+# Keep the build arguments in the runtime environment; entrypoint.sh reads SLURM_ROOT and SLURM_CONFDIR.
+ENV SLURM_VERSION=${SLURM_VERSION}
+ENV SLURM_ROOT=${SLURM_ROOT}
+ENV SLURM_CONFDIR=${SLURM_CONFDIR}
+
+# The zypper caches are cleaned in the same layer so they do not stay in the image.
+RUN zypper install -y \
+  munge \
+  munge-devel \
+  libnuma1 \
+  libnuma-devel \
+  librrd8 \
+  readline-devel \
+  hwloc \
+  hwloc-devel \
+  lz4 \
+  liblz4-devel \
+  libz1 \
+  zlib-devel \
+  freeipmi \
+  freeipmi-devel \
+  dbus-1 \
+  dbus-1-devel \
+  make \
+  gcc12 \
+  gcc12-c++ \
+  curl \
+  tar \
+  bzip2 \
+  python3 \
+  vim \
+  ca-certificates \
+  less \
+  sudo \
+  fuse3-devel \
+  git \
+  sqlite3 \
+  sqlite3-devel \
+  libopenssl-devel \
+  util-linux \
+  util-linux-systemd \
+  squashfs \
+  rpm-build \
+  lua53 \
+  lua53-devel \
+  libmount-devel \
+  && zypper clean --all
+
+RUN useradd -M slurm
+
+RUN mkdir -p /var/log/slurm
+RUN mkdir -p /var/spool/slurmctld && chown slurm /var/spool/slurmctld && chmod u+rwx /var/spool/slurmctld
+RUN mkdir -p /var/spool/slurmd && chown slurm /var/spool/slurmd && chmod u+rwx /var/spool/slurmd
+
+
+COPY ci/slurm_docker/install_slurm.sh .
+
+RUN ./install_slurm.sh ${SLURM_VERSION} ${SLURM_ROOT} ${SLURM_CONFDIR} --enable-multiple-slurmd
+
+RUN mkdir -p ${SLURM_CONFDIR}
+COPY ci/slurm_docker/cgroup.conf ${SLURM_CONFDIR}
+COPY ci/slurm_docker/slurm.conf.in ${SLURM_CONFDIR}
+
+# slurm-uenv-mount
+# install python
+# -f fails on HTTP errors instead of saving an error page; the tarball is removed in the same layer.
+RUN curl -fLO https://www.python.org/ftp/python/3.10.11/Python-3.10.11.tgz \
+  && tar xzvf Python-3.10.11.tgz \
+  && cd Python-3.10.11 \
+  && ./configure \
+  && make install -j"$(nproc)" \
+  && cd ../ && rm -r Python-3.10.11 Python-3.10.11.tgz
+RUN zypper --non-interactive rm libopenssl-devel
+
+# rpmbuild > /usr/lib/rpm/macros.d/macros.meson are missing here ...
+RUN python3 -m pip install --no-cache-dir --upgrade pip && python3 -m pip install --no-cache-dir meson ninja
+RUN curl -fL https://raw.githubusercontent.com/mesonbuild/meson/master/data/macros.meson -o /usr/lib/rpm/macros.d/macros.meson
+# rpm build expects meson in /usr/bin/meson
+RUN ln -s /usr/local/bin/meson /usr/bin/meson
+
+# download bash-bats
+RUN curl -fL https://github.com/bats-core/bats-core/archive/refs/tags/v1.9.0.tar.gz | tar xz
+RUN ln -s /bats-core-1.9.0/bin/bats /usr/bin/bats
+RUN mkdir bats-helpers
+RUN git clone --depth 1 https://github.com/bats-core/bats-assert.git bats-helpers/bats-assert
+RUN git clone --depth 1 https://github.com/bats-core/bats-support.git bats-helpers/bats-support
+ENV BATS_LIB_PATH=/bats-helpers
+
+RUN mkdir /user-environment
+RUN mkdir /user-profilers
+RUN mkdir /user-tools
+
+RUN useradd testuser
+RUN mkdir -p /home/testuser
+RUN chown testuser /home/testuser
+
+COPY ci/tests /tests
+
+COPY ci/slurm_docker/entrypoint.sh .
diff --git a/ci/slurm_docker/cgroup.conf b/ci/slurm_docker/cgroup.conf
new file mode 100644
index 0000000..102e318
--- /dev/null
+++ b/ci/slurm_docker/cgroup.conf
@@ -0,0 +1,6 @@
+# Slurm cgroup settings copied into SLURM_CONFDIR by Dockerfile.base.
+CgroupAutomount=yes
+ConstrainCores=no
+ConstrainRAMSpace=no
+CgroupMountpoint=/sys/fs/cgroup
+CgroupPlugin=cgroup/v1
diff --git a/ci/slurm_docker/entrypoint.sh b/ci/slurm_docker/entrypoint.sh
new file mode 100755
index 0000000..111485c
--- /dev/null
+++ b/ci/slurm_docker/entrypoint.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# Container entrypoint: starts dbus and munge, generates slurm.conf from
+# slurm.conf.in, starts slurmctld plus one slurmd per configured node, and
+# finally execs the command given as arguments.
+#
+
+dbus-launch
+sudo -u munge munged
+
+: "${SLURM_CONF_IN=$SLURM_CONFDIR/slurm.conf.in}"
+: "${SLURM_CONF=$SLURM_CONFDIR/slurm.conf}"
+
+# Default number of slurm nodes
+: "${SLURM_NUMNODES=3}"
+
+# Default slurm controller
+: "${SLURMCTLD_HOST=$HOSTNAME}"
+: "${SLURMCTLD_ADDR=127.0.0.1}"
+
+# Default node info
+: "${NODE_HOST=$HOSTNAME}"
+: "${NODE_ADDR=127.0.0.1}"
+: "${NODE_BASEPORT=6001}"
+
+# Default hardware profile
+: "${NODE_HW=CPUs=4}"
+
+# Generate node names and associated ports
+NODE_NAMES=$(printf "nd[%05i-%05i]" 1 $SLURM_NUMNODES)
+NODE_PORTS=$(printf "%i-%i" $NODE_BASEPORT $(($NODE_BASEPORT+$SLURM_NUMNODES-1)))
+
+
+echo "INFO:"
+echo "INFO: Creating $SLURM_CONF with"
+echo "INFO: "
+column -t <<-EOF
+INFO: SLURMCTLD_HOST=$SLURMCTLD_HOST SLURMCTLD_ADDR=$SLURMCTLD_ADDR
+INFO: NODE_HOST=$NODE_HOST NODE_ADDR=$NODE_ADDR NODE_BASEPORT=$NODE_BASEPORT
+INFO: NODE_HW=$NODE_HW
+INFO: SLURM_NUMNODES=$SLURM_NUMNODES
+EOF
+echo "INFO: "
+echo "INFO: Derived values:"
+echo "INFO:"
+column -t <<-EOF
+INFO: NODE_NAMES=$NODE_NAMES
+INFO: NODE_PORTS=$NODE_PORTS
+EOF
+echo "INFO:"
+echo "INFO: Override any of the non-derived values by setting the respective environment variable"
+echo "INFO: when starting Docker."
+echo "INFO:"
+
+export PATH=$SLURM_ROOT/bin:$PATH
+export LD_LIBRARY_PATH=$SLURM_ROOT/lib:$LD_LIBRARY_PATH
+export MANPATH=$SLURM_ROOT/man:$MANPATH
+
+(
+    echo "NodeName=${NODE_NAMES} NodeHostname=${NODE_HOST} NodeAddr=${NODE_ADDR} Port=${NODE_PORTS} State=UNKNOWN ${NODE_HW}"
+    echo "PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP"
+) \
+| sed -e "s/SLURMCTLDHOST/${SLURMCTLD_HOST}/" \
+      -e "s/SLURMCTLDADDR/${SLURMCTLD_ADDR}/" \
+      "$SLURM_CONF_IN" - \
+> "$SLURM_CONF"
+
+NODE_NAME_LIST=$(scontrol show hostnames $NODE_NAMES)
+
+for n in $NODE_NAME_LIST
+do
+    echo "$NODE_ADDR $n" >> /etc/hosts
+done
+
+echo
+echo "Starting Slurm services..."
+echo
+
+$SLURM_ROOT/sbin/slurmctld
+
+for n in $NODE_NAME_LIST
+do
+    $SLURM_ROOT/sbin/slurmd -N $n
+done
+
+echo
+sinfo
+echo
+echo
+
+exec "$@"
diff --git a/ci/slurm_docker/install_slurm.sh b/ci/slurm_docker/install_slurm.sh
new file mode 100755
index 0000000..26e0202
--- /dev/null
+++ b/ci/slurm_docker/install_slurm.sh
@@ -0,0 +1,75 @@
+#!/bin/bash -x
+#
+# Usage: install_slurm.sh SLURM_VERSION INSTALL_PREFIX CONFDIR [configure-args]
+#
+# Downloads and unpacks the Slurm sources under /opt/src, then configures,
+# builds and installs them from /opt/build. Passing NO_BUILD as the only
+# configure argument stops after unpacking.
+#
+
+SLURM_VERSION=$1
+SLURM_ROOT=$2
+SLURM_CONFDIR=$3
+shift; shift; shift
+ARGS=$*
+
+slurm_tar_file=slurm-${SLURM_VERSION}.tar.bz2
+slurm_url=https://download.schedmd.com/slurm/${slurm_tar_file}
+
+
+if [ -z "$SLURM_VERSION" -o -z "$SLURM_ROOT" -o -z "$SLURM_CONFDIR" ];
+then
+    echo "Usage: install_slurm.sh SLURM_VERSION INSTALL_PREFIX CONFDIR [configure-args]"
+    echo "No Slurm version or install-prefix specified on command line. Aborting."
+    exit 1
+fi
+
+#
+# Download slurm tarball and unpack it
+#
+if true; then
+
+    mkdir -p /opt/src || exit 1
+    # "exit" inside the subshell only leaves the subshell, so its status is
+    # checked explicitly to abort the script on a failed download or unpack.
+    (
+        cd /opt/src || exit 1
+
+        if ! stat $slurm_tar_file; then
+            echo "=== downloading slurm ${SLURM_VERSION} from ${slurm_url}"
+            curl --fail --output ${slurm_tar_file} ${slurm_url} || exit 1
+        fi
+
+        echo "=== unpacking $slurm_tar_file"
+        tar -xjf ${slurm_tar_file} || exit 1
+    ) || exit 1
+
+fi
+
+if [ "$ARGS" = "NO_BUILD" ];
+then
+    exit 0
+fi
+
+#
+# Remove any old build directory.
+# Run configure, make, make install
+#
+
+stat /opt/build/slurm-${SLURM_VERSION} && rm -rf /opt/build/slurm-${SLURM_VERSION}
+mkdir -p /opt/build/slurm-${SLURM_VERSION} || exit 1
+(
+    cd /opt/build/slurm-${SLURM_VERSION} || exit 1
+    # Export the compilers so they apply to the real configure run (and to
+    # make), not only to the "configure --help" diagnostic below.
+    export CXX=g++-12 CC=gcc-12
+    /opt/src/slurm-${SLURM_VERSION}/configure --help
+    /opt/src/slurm-${SLURM_VERSION}/configure \
+        --prefix=${SLURM_ROOT} \
+        --sysconfdir=${SLURM_CONFDIR} \
+        --disable-dependency-tracking \
+        $ARGS || exit 1
+
+    make -j4 && make install
+)
+
diff --git a/ci/slurm_docker/slurm.conf.in b/ci/slurm_docker/slurm.conf.in
new file mode 100644
index 0000000..471ae3f
--- /dev/null
+++ b/ci/slurm_docker/slurm.conf.in
@@ -0,0 +1,155 @@
+#
+# Example slurm.conf file. Please run configurator.html
+# (in doc/html) to build a configuration file customized
+# for your environment.
+#
+#
+# slurm.conf file generated by configurator.html.
+# Put this file on all nodes of your cluster.
+# See the slurm.conf man page for more information.
+#
+ClusterName=cluster
+SlurmctldHost=SLURMCTLDHOST(SLURMCTLDADDR)
+#SlurmctldHost=
+#
+#DisableRootJobs=NO
+#EnforcePartLimits=NO
+#Epilog=
+#EpilogSlurmctld=
+#FirstJobId=1
+#MaxJobId=67043328
+#GresTypes=
+#GroupUpdateForce=0
+#GroupUpdateTime=600
+#JobFileAppend=0
+#JobRequeue=1
+#JobSubmitPlugins=lua
+#KillOnBadExit=0
+#LaunchType=launch/slurm
+#Licenses=foo*4,bar
+#MailProg=/bin/mail
+#MaxJobCount=10000
+#MaxStepCount=40000
+#MaxTasksPerNode=512
+MpiDefault=pmi2
+#MpiParams=ports=#-#
+#PluginDir=
+#PlugStackConfig=
+#PrivateData=jobs
+#ProctrackType=proctrack/cgroup
+ProctrackType=proctrack/linuxproc
+#Prolog=
+#PrologFlags=
+#PrologSlurmctld=
+#PropagatePrioProcess=0
+#PropagateResourceLimits=
+#PropagateResourceLimitsExcept=
+#RebootProgram=
+ReturnToService=1
+SlurmctldPidFile=/var/run/slurmctld.pid
+SlurmctldPort=6817
+SlurmdPidFile=/var/run/slurmd.%n.pid
+SlurmdPort=6818
+SlurmdSpoolDir=/var/spool/slurmd.%n
+SlurmUser=slurm
+#SlurmdUser=root
+#SrunEpilog=
+#SrunProlog=
+StateSaveLocation=/var/spool/slurmctld
+SwitchType=switch/none
+#TaskEpilog=
+TaskPlugin=task/affinity
+#TaskProlog=
+#TopologyPlugin=topology/tree
+#TmpFS=/tmp
+#TrackWCKey=no
+#TreeWidth=
+#UnkillableStepProgram=
+#UsePAM=0
+#
+#
+# TIMERS
+#BatchStartTimeout=10
+#CompleteWait=0
+#EpilogMsgTime=2000
+#GetEnvTimeout=2
+#HealthCheckInterval=0
+#HealthCheckProgram=
+InactiveLimit=0
+KillWait=30
+#MessageTimeout=10
+#ResvOverRun=0
+MinJobAge=300
+#OverTimeLimit=0
+SlurmctldTimeout=120
+SlurmdTimeout=300
+#UnkillableStepTimeout=60
+#VSizeFactor=0
+Waittime=0
+#
+#
+# SCHEDULING
+#DefMemPerCPU=0
+#MaxMemPerCPU=0
+#SchedulerTimeSlice=30
+SchedulerType=sched/backfill
+SelectType=select/cons_tres
+SelectTypeParameters=CR_CPU
+#
+#
+# JOB PRIORITY
+#PriorityFlags=
+#PriorityType=priority/basic
+#PriorityDecayHalfLife=
+#PriorityCalcPeriod=
+#PriorityFavorSmall=
+#PriorityMaxAge=
+#PriorityUsageResetPeriod=
+#PriorityWeightAge=
+#PriorityWeightFairshare=
+#PriorityWeightJobSize=
+#PriorityWeightPartition=
+#PriorityWeightQOS=
+#
+#
+# LOGGING AND ACCOUNTING
+#AccountingStorageEnforce=0
+#AccountingStorageHost=
+#AccountingStoragePass=
+#AccountingStoragePort=
+AccountingStorageType=accounting_storage/none
+#AccountingStorageUser=
+#AccountingStoreFlags=
+#JobCompHost=
+#JobCompLoc=
+#JobCompPass=
+#JobCompPort=
+JobCompType=jobcomp/none
+#JobCompUser=
+#JobContainerType=job_container/none
+JobAcctGatherFrequency=30
+JobAcctGatherType=jobacct_gather/none
+SlurmctldDebug=debug2
+SlurmctldLogFile=/var/log/slurm/slurmctld.log
+SlurmdDebug=debug2
+SlurmdLogFile=/var/log/slurm/slurmd.%n.log
+#SlurmSchedLogFile=
+#SlurmSchedLogLevel=
+#DebugFlags=
+#
+#
+# POWER SAVE SUPPORT FOR IDLE NODES (optional)
+#SuspendProgram=
+#ResumeProgram=
+#SuspendTimeout=
+#ResumeTimeout=
+#ResumeRate=
+#SuspendExcNodes=
+#SuspendExcParts=
+#SuspendRate=
+#SuspendTime=
+#
+#
+# COMPUTE NODES
+#NodeName=nd[1-3] NodeHostname=DOCKER_HOSTNAME NodeAddr=127.0.0.1 Port=[6001-6003] CPUs=4 State=UNKNOWN
+#PartitionName=dkr Nodes=ALL Default=YES MaxTime=INFINITE State=UP
diff --git a/rpm/macros.meson b/rpm/macros.meson
new file mode 100644
index 0000000..21dd77a
--- /dev/null
+++ b/rpm/macros.meson
@@ -0,0 +1,36 @@
+# RPM macros that run meson setup, compile and install for the package build.
+# rpm/make-rpm.sh passes this file to its build command with --load.
+%__sourcedir .
+%__builddir %{_target_platform}
+%__meson_wrap_mode nodownload
+
+%meson_setup \
+    mkdir -p %{__builddir} \
+    CFLAGS="${CFLAGS:-%optflags}"; export CFLAGS; \
+    CXXFLAGS="${CXXFLAGS:-%optflags}"; export CXXFLAGS; \
+    FFLAGS="${FFLAGS:-%optflags}"; export FFLAGS; \
+    FCFLAGS="${FCFLAGS:-%optflags}"; export FCFLAGS; \
+    meson setup %{__sourcedir} %{__builddir} \\\
+        %{?_enable_debug:-Ddebug=true} \\\
+        --prefix=%{_prefix} \\\
+        --bindir=%{_bindir} \\\
+        --sbindir=%{_sbindir} \\\
+        --libexecdir=%{_libexecdir} \\\
+        --libdir=%{_libdir} \\\
+        --localstatedir=%{_var} \\\
+        --sharedstatedir=%{_sharedstatedir} \\\
+        --includedir=%{_includedir} \\\
+        --datadir=%{_datadir} \\\
+        --sysconfdir=%{_sysconfdir} \\\
+        --mandir=%{_mandir} \\\
+        --infodir=%{_infodir} \\\
+        --localedir=%{_datadir}/locale \\\
+        -Dcli=false \\\
+        -Dslurm_plugin=true \\\
+        %{nil}
+
+%meson_build \
+meson compile %_smp_mflags -C %{__builddir}
+
+%meson_install \
+DESTDIR=%buildroot meson install --no-rebuild --skip-subprojects -C %{__builddir}
diff --git a/rpm/make-rpm.sh b/rpm/make-rpm.sh
index 89175d5..116c28b 100755
--- a/rpm/make-rpm.sh
+++ b/rpm/make-rpm.sh
@@ -104,6 +104,7 @@ mkdir -p "${dstdir}"
         --define "set_build_flags CXXFLAGS=\"-O2 -Wall -Wpedantic\"" \
         --define "_smp_build_ncpus 1" \
         --define "_vpath_srcdir slurm-uenv-mount-${SLURM_UENV_MOUNT_VERSION}" \
+        --load "${_scriptdir}/macros.meson" \
         --rebuild SRPMS/slurm-uenv-mount-*.src.rpm
     fi
 )