Skip to content

Commit

Permalink
Move (conditional) installation of cuda compat libs to external script
Browse files Browse the repository at this point in the history
Only install cuda compat libs when either they are not installed yet
or they are outdated
  • Loading branch information
huebner-m committed May 18, 2022
1 parent 7d6af69 commit 2cc5ce9
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 60 deletions.
74 changes: 14 additions & 60 deletions gpu_support/add_gpu_support.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,76 +90,30 @@ if [ $ret -ne 0 ]; then
echo $latest_cuda_compat_url
exit 1
fi
latest_driver_version="${latest_cuda_compat_url%-*}"
latest_driver_version="${latest_driver_version##*_}"

# Create a general space for our NVIDIA compat drivers
install_compat_libs=false
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
mkdir -p ${host_injections_dir}
cd ${host_injections_dir}
# libcuda.so is a symlink to the actual CUDA compat library, whose file name ends in the driver version;
# if it exists, the CUDA compat libs are installed and we can compare that version with the latest one
if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then
eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so)
eessi_driver_version="${eessi_driver_version##*so.}"
else
echo "Cannot write to eessi host_injections space, exiting now..." >&2
exit 1
eessi_driver_version=0
fi

# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest

driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi

# Check if our target CUDA is satisfied by what is installed already
# TODO: Find required CUDA version and see if we need an update

# If not, grab the latest compat library RPM or deb
# download and unpack in temporary directory, easier cleanup after installation
tmpdir=$(mktemp -d)
cd $tmpdir
compat_file=${latest_cuda_compat_url##*/}
wget ${latest_cuda_compat_url}

# Unpack it
# (the requirements here are OS dependent, can we get around that?)
# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e)
# (deb files can be unpacked with ar and tar)
file_extension=${compat_file##*.}
if [[ ${file_extension} == "rpm" ]]; then
rpm2cpio ${compat_file} | cpio -idmv
elif [[ ${file_extension} == "deb" ]]; then
ar x ${compat_file}
tar xf data.tar.*
if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then
install_compat_libs=true
else
echo "File extension of cuda compat lib not supported, exiting now..." >&2
exit 1
echo "CUDA compat libs are up-to-date, skip installation."
fi
cd $host_injections_dir
# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
mv -n ${tmpdir}/usr/local/cuda-* .
rm -r ${tmpdir}

# Add a symlink that points to the latest version
latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1)
ln -sf ${latest_cuda_dir} latest

if [ ! -e latest ] ; then
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
exit 1
fi

# Create the space to host the libraries
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
mkdir -p ${host_injection_libs_dir}
# Symlink in the path to the latest libraries
if [ ! -d "${host_injection_libs_dir}/lib" ]; then
ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
exit 1
if [ "${install_compat_libs}" == true ]; then
source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url
fi

# return to initial dir
cd $current_dir

###############################################################################################
###############################################################################################
# Install CUDA
Expand Down
73 changes: 73 additions & 0 deletions gpu_support/install_cuda_compatlibs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#
# Download the CUDA compatibility libraries package and install it into the
# EESSI host_injections space.
#
# Usage (sourced from add_gpu_support.sh):
#   source install_cuda_compatlibs.sh <url-of-cuda-compat-rpm-or-deb>
#
# Arguments:
#   $1 - URL of the CUDA compat package; the extension must be "rpm" or "deb".
#
# Variables read but not set here (they must be provided by the calling
# script or the environment):
#   host_injections_dir  - directory that receives the cuda-* directories
#   EESSI_PILOT_VERSION, os_family, eessi_cpu_family
#                        - used to build the per-stack "lib" symlink location
#
# NOTE: every failure path calls `exit 1`; when this file is sourced that
# terminates the calling script as well.

libs_url=$1

# When sourced, $0 is the calling script, so this resolves to the caller's
# directory and the assignment is visible to the caller.
current_dir=$(dirname "$(realpath "$0")")

if [ -z "${libs_url}" ]; then
  echo "No URL for the cuda compat libs given, exiting now..." >&2
  exit 1
fi

# An empty value would make the `cd` below silently go to $HOME.
if [ -z "${host_injections_dir}" ]; then
  echo "host_injections_dir is not set, exiting now..." >&2
  exit 1
fi

# Create a general space for our NVIDIA compat drivers
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
  if ! mkdir -p "${host_injections_dir}" || ! cd "${host_injections_dir}"; then
    echo "Cannot create or enter ${host_injections_dir}, exiting now..." >&2
    exit 1
  fi
else
  echo "Cannot write to eessi host_injections space, exiting now..." >&2
  exit 1
fi

# Compare the CUDA version reported with the host driver alone against the one
# reported when the compat libs under latest/compat are put first on
# LD_LIBRARY_PATH. The first dot is stripped so the values compare as integers.
driver_cuda_version=$(nvidia-smi -q --display=COMPUTE \
  | grep CUDA | awk 'NF>1{print $NF}' | sed 's/\.//')
eessi_cuda_version=$(LD_LIBRARY_PATH="${host_injections_dir}/latest/compat/:${LD_LIBRARY_PATH}" \
  nvidia-smi -q --display=COMPUTE \
  | grep CUDA | awk 'NF>1{print $NF}' | sed 's/\.//')
# Only compare when both values are plain integers; an empty value (e.g.
# nvidia-smi unavailable) would otherwise make `-gt` raise an error.
if [[ "${driver_cuda_version}" =~ ^[0-9]+$ && "${eessi_cuda_version}" =~ ^[0-9]+$ ]]; then
  if [ "${driver_cuda_version}" -gt "${eessi_cuda_version}" ]; then
    echo "You need to update your CUDA compatability libraries"
  fi
else
  echo "Could not determine CUDA versions via nvidia-smi, skipping version check" >&2
fi

# Check if our target CUDA is satisfied by what is installed already
# TODO: Find required CUDA version and see if we need an update

# Grab the compat library RPM or deb: download and unpack it in a temporary
# directory for easier cleanup after installation.
tmpdir=$(mktemp -d) || {
  echo "Cannot create temporary directory, exiting now..." >&2
  exit 1
}
compat_file=${libs_url##*/}
file_extension=${compat_file##*.}

# Unpack it
# (the requirements here are OS dependent, can we get around that?)
# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e)
# (deb files can be unpacked with ar and tar)
#
# Download and unpack run in a subshell: the working directory of this shell
# stays at ${host_injections_dir}, and any failure surfaces as a single exit
# status so the temporary directory is always removed.
if ! (
  cd "${tmpdir}" || exit 1
  if ! wget "${libs_url}"; then
    echo "Failed to download ${libs_url}, exiting now..." >&2
    exit 1
  fi
  case "${file_extension}" in
    rpm)
      rpm2cpio "${compat_file}" | cpio -idmv
      ;;
    deb)
      # data.tar.* is deliberately left unquoted: the compression suffix varies
      ar x "${compat_file}" && tar xf data.tar.*
      ;;
    *)
      echo "File extension of cuda compat lib not supported, exiting now..." >&2
      exit 1
      ;;
  esac
); then
  rm -rf -- "${tmpdir:?}"
  exit 1
fi

# Make sure the package really contained the expected usr/local/cuda-* tree
cuda_payload=("${tmpdir}"/usr/local/cuda-*)
if [ ! -e "${cuda_payload[0]}" ]; then
  echo "No usr/local/cuda-* directory found in ${compat_file}, exiting now..." >&2
  rm -rf -- "${tmpdir:?}"
  exit 1
fi

# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
mv -n "${cuda_payload[@]}" .
rm -rf -- "${tmpdir:?}"

# Add a symlink that points to the latest version.
# sort -V orders version numbers numerically (cuda-11.10 after cuda-11.7);
# a plain lexicographic sort would pick the wrong directory.
latest_cuda_dir=$(find . -maxdepth 1 -type d -iname '*cuda*' | sort -V | tail -n1)
if [ -z "${latest_cuda_dir}" ]; then
  echo "No cuda compat lib directory found in ${host_injections_dir}, exiting now..." >&2
  exit 1
fi
# -n: replace an existing "latest" symlink itself; without it ln follows the
# old symlink and creates the new link inside the previous version directory.
ln -sfn "${latest_cuda_dir}" latest

if [ ! -e latest ]; then
  echo "Symlink to latest cuda compat lib version is broken, exiting now..."
  exit 1
fi

# Create the space to host the libraries
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
mkdir -p "${host_injection_libs_dir}"
# Symlink in the path to the latest libraries. Create the link only when
# nothing (not even a dangling symlink) is there yet; anything else that does
# not resolve to latest/compat is reported for manual fixing.
if [ ! -e "${host_injection_libs_dir}/lib" ] && [ ! -L "${host_injection_libs_dir}/lib" ]; then
  ln -s "${host_injections_dir}/latest/compat" "${host_injection_libs_dir}/lib"
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
  echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
  echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
  exit 1
fi

# return to initial dir
cd "${current_dir}"

0 comments on commit 2cc5ce9

Please sign in to comment.