Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{lib,mpi}[GCCcore/13.3.0,NVHPC/24.9] Add NCCL 2.22.3, UCC-CUDA 1.3.0, OpenMPI 5.0.3 w/CUDA 12.6.0 #21546

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# EasyBuild easyconfig for NCCL 2.22.3, built with the GCCcore 13.3.0 toolchain
# against the system-level CUDA 12.6.0 module.
# NOTE(review): no `easyblock` is assigned in this file; presumably a
# software-specific easyblock is selected from `name` — confirm.
name = 'NCCL'
version = '2.22.3'
# %(cudaver)s is a template placeholder; presumably it resolves to the version
# of the CUDA dependency listed below (12.6.0) — confirm.
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://developer.nvidia.com/nccl'
description = """The NVIDIA Collective Communications Library (NCCL) implements multi-GPU and multi-node collective
communication primitives that are performance optimized for NVIDIA GPUs."""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}

# Sources are fetched via the GITHUB_SOURCE template using the 'NVIDIA' account.
# The tarball name carries a '-1' suffix after the version (v2.22.3-1.tar.gz).
github_account = 'NVIDIA'
source_urls = [GITHUB_SOURCE]
sources = ['v%(version)s-1.tar.gz']
# One checksum for the single source tarball (64 hex digits, presumably SHA-256).
checksums = ['45151629a9494460e73375281e8b0fe379141528879301899ece9b776faca024']

builddependencies = [('binutils', '2.42')]

# CUDA is taken from the SYSTEM toolchain with an empty versionsuffix;
# UCX-CUDA reuses this easyconfig's own '-CUDA-%(cudaver)s' suffix.
dependencies = [
('CUDA', '12.6.0', '', SYSTEM),
('UCX-CUDA', '1.16.0', versionsuffix),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# EasyBuild easyconfig for OpenMPI 5.0.3, built with the NVHPC 24.9 (CUDA 12.6.0)
# toolchain, with CUDA-aware dependencies (UCX-CUDA, UCC-CUDA) enabled.
# NOTE(review): no `easyblock` is assigned in this file; presumably a
# software-specific easyblock is selected from `name` — confirm.
name = 'OpenMPI'
version = '5.0.3'

homepage = 'https://www.open-mpi.org/'
description = """The Open MPI Project is an open source MPI-3 implementation."""

toolchain = {'name': 'NVHPC', 'version': '24.9-CUDA-12.6.0'}

# The download directory is keyed on the major.minor version (v5.0).
source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads']
sources = [SOURCELOWER_TAR_BZ2]
# The third patch is given as a (filename, 1) tuple; presumably the second
# element is the patch strip level — confirm. It is named for 5.0.2 but is
# applied to 5.0.3 here.
patches = [
'OpenMPI-5.0.3_fix_hle_make_errors.patch',
'OpenMPI-5.0.3_disable_opal_path_nfs_test.patch',
('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1)
]
# One {filename: checksum} entry per source/patch, in the same order as
# `sources` followed by `patches`.
checksums = [
{'openmpi-5.0.3.tar.bz2':
'990582f206b3ab32e938aa31bbf07c639368e4405dca196fabe7f0f76eeda90b'},
{'OpenMPI-5.0.3_fix_hle_make_errors.patch':
'881c907a9f5901d5d6af41cd33dffdcecba4a67a9e5123e602542aea57a80895'},
{'OpenMPI-5.0.3_disable_opal_path_nfs_test.patch':
'75d4417e35252ea3a19b2792f1b06e9aeb408c253aa4921d77226d57b71dee45'},
{'OpenMPI-5.0.2_build-with-internal-cuda-header.patch':
'f52dc470543f35efef10d651dd159c771ae25f8f76a420d20d87abf4dc769ed7'},
]

# Autotools is needed at build time because ./autogen.pl is re-run below.
builddependencies = [
('pkgconf', '2.2.0'),
('Perl', '5.38.2'),
('Autotools', '20231222'),
]

# Both the plain and the '-CUDA-%(cudaver)s' variants of UCX and UCC are listed.
dependencies = [
('zlib', '1.3.1'),
('hwloc', '2.10.0'),
('libevent', '2.1.12'),
('UCX', '1.16.0'),
('UCX-CUDA', '1.16.0', '-CUDA-%(cudaver)s'),
('libfabric', '1.21.0'),
('PMIx', '5.0.2'),
('PRRTE', '3.0.5'),
('UCC', '1.3.0'),
('UCC-CUDA', '1.3.0', '-CUDA-%(cudaver)s'),
]

# CUDA related patches and custom configure option can be removed if CUDA support isn't wanted.
# Step 1 (before configure): compile opal/mca/cuda/lib/cuda.c into a shared
# libcuda.so inside the source tree, using headers from opal/mca/cuda/include.
# Presumably cuda.c and those headers are added by the
# 'build-with-internal-cuda-header' patch — confirm.
# Each fragment ends in ' && ' so the fragments chain into one shell command.
preconfigopts = 'nvc -Iopal/mca/cuda/include -shared opal/mca/cuda/lib/cuda.c -o opal/mca/cuda/lib/libcuda.so && '
# Update configure to include changes from the "disable_opal_path_nfs_test" patch
preconfigopts += './autogen.pl --force && '

# Point --with-cuda at the in-tree directory populated by the preconfigopts step above.
# Each fragment ends in a space because the fragments are concatenated with +=.
configopts = '--with-cuda=%(start_dir)s/opal/mca/cuda '
# Required to prevent internal compiler error in opal.
configopts += '--enable-alt-short-float=no '
# Set PGI compilers manually, as NVHPC compilers are not correctly detected
# NOTE(review): preconfigopts calls `nvc`, while configure is given the
# pgcc/pgc++/pgfortran names — confirm these aliases exist in NVHPC 24.9.
configopts += 'CC=pgcc CXX=pgc++ FC=pgfortran '

# site specific options
# configopts += '--without-psm2 '
# configopts += '--disable-oshmem '
# configopts += '--with-gpfs '
configopts += '--with-slurm '

moduleclass = 'mpi'
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Disable opal_path_nfs test in OpenMPI 5.0.3 as this test can easily fail on some systems,
when NFS mounts are used. Generally, this test is flaky, which may prevent users
from installing OpenMPI for no apparent reason.

diff --git a/test/util/Makefile.am b/test/util/Makefile.am
index e5ad472..33d63c4 100644
--- a/test/util/Makefile.am
+++ b/test/util/Makefile.am
@@ -38,7 +38,6 @@ AM_CPPFLAGS = -I$(top_srcdir)/test/support

check_PROGRAMS = \
opal_bit_ops \
- opal_path_nfs \
bipartite_graph \
opal_sha256

@@ -80,11 +79,11 @@ opal_bit_ops_LDADD = \
$(top_builddir)/test/support/libsupport.a
opal_bit_ops_DEPENDENCIES = $(opal_path_nfs_LDADD)

-opal_path_nfs_SOURCES = opal_path_nfs.c
-opal_path_nfs_LDADD = \
- $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
- $(top_builddir)/test/support/libsupport.a
-opal_path_nfs_DEPENDENCIES = $(opal_path_nfs_LDADD)
+# opal_path_nfs_SOURCES = opal_path_nfs.c
+# opal_path_nfs_LDADD = \
+# $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \
+# $(top_builddir)/test/support/libsupport.a
+# opal_path_nfs_DEPENDENCIES = $(opal_path_nfs_LDADD)

#opal_os_path_SOURCES = opal_os_path.c
#opal_os_path_LDADD = \
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
On JUWELS, building OpenMPI 5.0.5 with NVHPC 24.9 fails with errors
related to `__ATOMIC_HLE_ACQUIRE` and `__ATOMIC_HLE_RELEASE` not being
defined. Add an additional macro check to let the build succeed.

--- a/opal/include/opal/sys/gcc_builtin/atomic.h 2024-07-23 01:23:20.567556032 +0200
+++ a/opal/include/opal/sys/gcc_builtin/atomic.h 2024-10-02 12:19:53.130698758 +0200
@@ -187,7 +187,7 @@
*
*********************************************************************/

-#if defined(__HLE__)
+#if defined(__HLE__) && defined(__ATOMIC_HLE_ACQUIRE) && defined(__ATOMIC_HLE_RELEASE)

# include <immintrin.h>

@@ -225,7 +225,7 @@
__ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}

-#else /* #if defined(__HLE__) */
+#else /* #if defined(__HLE__) && defined(__ATOMIC_HLE_ACQUIRE) && defined(__ATOMIC_HLE_RELEASE) */

#include "opal/sys/atomic_impl_spinlock.h"


Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# EasyBuild easyconfig for UCC-CUDA 1.3.0: builds only the CUDA (mc/cuda) and
# NCCL (tl/nccl) components of UCC and installs them alongside an existing UCC
# module of the same version.
easyblock = 'ConfigureMake'

name = 'UCC-CUDA'
version = '1.3.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://www.openucx.org/'
description = """UCC (Unified Collective Communication) is a collective
communication operations API and library that is flexible, complete, and
feature-rich for current and emerging programming models and runtimes.

This module adds the UCC CUDA support.
"""

toolchain = {'name': 'GCCcore', 'version': '13.3.0'}
toolchainopts = {'pic': True}

# Same upstream tarball as plain UCC (openucx/ucc tag v1.3.0).
source_urls = ['https://github.com/openucx/ucc/archive/refs/tags']
sources = ['v%(version)s.tar.gz']
# The patch name is built from the %(name)s and %(version)s templates; the
# checksum entry below spells it out as
# 'UCC-CUDA-1.3.0_link_against_existing_UCC_libs.patch'.
patches = [
'%(name)s-%(version)s_link_against_existing_UCC_libs.patch',
]
checksums = [
{'v1.3.0.tar.gz': 'b56379abe5f1c125bfa83be305d78d81a64aa271b7b5fff0ac17b86725ff3acf'},
{'UCC-CUDA-1.3.0_link_against_existing_UCC_libs.patch':
'758228357ce2a6ae50fb26a0b43e9176feaf379e266365f38205ce679267fc0d'},
]

# Autotools is needed at build time because ./autogen.sh is run before configure.
builddependencies = [
('binutils', '2.42'),
('Autotools', '20231222'),
]

# UCC is pinned to this easyconfig's own version; CUDA comes from the SYSTEM
# toolchain; UCX-CUDA and NCCL use the matching '-CUDA-%(cudaver)s' suffix.
dependencies = [
('UCC', version),
('CUDA', '12.6.0', '', SYSTEM),
('UCX-CUDA', '1.16.0', '-CUDA-%(cudaver)s'),
('NCCL', '2.22.3', '-CUDA-%(cudaver)s'),
]

preconfigopts = "./autogen.sh && "

# Both strings start with arguments (not a command) and then chain a second,
# explicit `make`; presumably they are appended to the `make` / `make install`
# command issued by the easyblock — confirm. Net effect: only the
# src/components/mc/cuda and src/components/tl/nccl subdirectories are built
# and installed.
buildopts = '-C src/components/mc/cuda V=1 && make -C src/components/tl/nccl V=1'
installopts = '-C src/components/mc/cuda && make -C src/components/tl/nccl install'

# The two plugin libraries produced by the component builds above must exist under lib/ucc.
sanity_check_paths = {
'files': ['lib/ucc/libucc_mc_cuda.%s' % SHLIB_EXT, 'lib/ucc/libucc_tl_nccl.%s' % SHLIB_EXT],
'dirs': ['lib']
}

# ucc_info is not built by this easyconfig; presumably it comes from the UCC dependency — confirm.
sanity_check_commands = ["ucc_info -c"]

# Adds this installation's lib/ucc directory to EB_UCC_EXTRA_COMPONENT_PATH in
# the generated module; presumably the UCC module consumes that variable to
# locate these extra components — confirm.
modextrapaths = {'EB_UCC_EXTRA_COMPONENT_PATH': 'lib/ucc'}

moduleclass = 'lib'
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Make CUDA/NCCL plugins link against the existing ucc libraries.

Bart Oldeman, 2022-08-02
Mikael OEhman, 2023-06-16
diff -ur ucc-1.3.0.orig/src/components/mc/cuda/Makefile.am ucc-1.3.0/src/components/ec/cuda/Makefile.am
--- ucc-1.3.0.orig/src/components/mc/cuda/Makefile.am.orig 2023-06-16 12:56:53.205939925 +0200
+++ ucc-1.3.0/src/components/mc/cuda/Makefile.am 2023-06-16 13:02:21.716110609 +0200
@@ -14,7 +14,7 @@
libucc_mc_cuda_la_CFLAGS = $(BASE_CFLAGS)
libucc_mc_cuda_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(CUDA_LDFLAGS)
libucc_mc_cuda_la_LIBADD = $(CUDA_LIBS) \
- $(UCC_TOP_BUILDDIR)/src/libucc.la
+ -lucc

include $(top_srcdir)/config/module.am
endif
diff -ur ucc-1.0.0.orig/src/components/tl/nccl/Makefile.am ucc-1.0.0/src/components/tl/nccl/Makefile.am
--- ucc-1.0.0.orig/src/components/tl/nccl/Makefile.am 2022-04-15 12:43:33.000000000 +0000
+++ ucc-1.0.0/src/components/tl/nccl/Makefile.am 2022-08-02 12:13:59.334795989 +0000
@@ -21,6 +21,6 @@
libucc_tl_nccl_la_CPPFLAGS = $(AM_CPPFLAGS) $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) $(NCCL_CPPFLAGS)
libucc_tl_nccl_la_CFLAGS = $(BASE_CFLAGS)
libucc_tl_nccl_la_LDFLAGS = -version-info $(SOVERSION) --as-needed $(CUDA_LDFLAGS) $(NCCL_LDFLAGS)
-libucc_tl_nccl_la_LIBADD = $(CUDA_LIBS) $(NCCL_LIBADD) $(UCC_TOP_BUILDDIR)/src/libucc.la
+libucc_tl_nccl_la_LIBADD = $(CUDA_LIBS) $(NCCL_LIBADD) -lucc

include $(top_srcdir)/config/module.am
Loading