Skip to content

Commit

Permalink
Merge branch 'main' into device_op_cuda
Browse files Browse the repository at this point in the history
  • Loading branch information
devreal authored Nov 6, 2023
2 parents 6418ae5 + 33a11af commit f14fe7b
Show file tree
Hide file tree
Showing 111 changed files with 4,643 additions and 1,498 deletions.
19 changes: 11 additions & 8 deletions .github/workflows/compile-rocm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,26 @@ env:
ROCM_VER: 5-4
jobs:
compile-rocm:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends wget lsb-core software-properties-common gpg curl
sudo apt update
sudo apt install -y --no-install-recommends wget lsb-core software-properties-common gpg curl
- name: Install extra dependencies
run: |
curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg
echo 'deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/debian focal main' | sudo tee /etc/apt/sources.list.d/rocm.list
sudo apt-get update
sudo apt-get install -y rocm-hip-sdk
sudo mkdir --parents --mode=0755 /etc/apt/keyrings
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/5.7.1/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7.1 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
sudo apt install -y rocm-hip-runtime
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Build Open MPI
run: |
./autogen.pl
./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran
make -j
LD_LIBRARY_PATH=/opt/rocm/lib make -j
11 changes: 8 additions & 3 deletions .github/workflows/ompi_nvidia.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
name: ompi_NVIDIA CI
on: [pull_request, push]

on: [pull_request]
jobs:

deployment:
if: github.repository == 'open-mpi/ompi'
runs-on: [self-hosted, linux, x64, nvidia]
steps:
- name: Checkout
Expand All @@ -29,7 +30,11 @@ jobs:
- name: Running tests
run: /start test
clean:
if: ${{ always() }}
# always() is required so that "clean" still runs when the workflow was
# canceled (provided the repository name matches).
# The repository check on its own is not enough: without always(), the
# job does not run once the workflow has been canceled.

if: always() && (github.repository == 'open-mpi/ompi')
needs: [deployment, build, test]
runs-on: [self-hosted, linux, x64, nvidia]
steps:
Expand Down
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -534,3 +534,12 @@ docs/_templates
# Common Python virtual environment directory names
venv
py??

# Copies of PRRTE RST files (i.e., not source controlled in this tree)
docs/prrte-rst-content
docs/schizo-ompi-rst-content

# Copies of the built HTML docs and man pages (for distribution
# tarballs)
docs/html
docs/man
1 change: 1 addition & 0 deletions .mailmap
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
Jeff Squyres <[email protected]> <[email protected]>
Jeff Squyres <[email protected]> --quiet <--quiet>
Jeff Squyres <[email protected]>
Jeff Squyres <[email protected]>

George Bosilca <[email protected]> <[email protected]>

Expand Down
36 changes: 36 additions & 0 deletions .readthedocs-pre-create-environment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Stage the PRRTE-provided RST sources under docs/ so that Sphinx can
# find them during a ReadTheDocs build.
#
# Must be run from the top of the source tree (all paths below are
# relative), with the 3rd-party/prrte submodule checked out.

# -e: abort on the first failing command; -u: unset variables are
# errors; -x: trace commands in the build log; pipefail: a pipeline
# fails if any stage fails.
set -euxo pipefail

# The ReadTheDocs build process does not run autogen/configure/make.
# Hence, we have to copy the PRRTE RST files (from the 3rd-party/prrte
# tree) to our docs/ tree manually.

# Ensure that we're in the RTD CI environment

if [[ "${READTHEDOCS:-no}" == "no" ]]; then
    echo "This script is only intended to be run in the ReadTheDocs CI environment" >&2
    exit 1
fi

SCHIZO_SRC_DIR=3rd-party/prrte/src/mca/schizo/ompi
SCHIZO_TARGET_DIR=docs/schizo-ompi-rst-content

PRRTE_RST_SRC_DIR=3rd-party/prrte/src/docs/prrte-rst-content
PRRTE_RST_TARGET_DIR=docs/prrte-rst-content

# Copy the OMPI schizo directory from PRRTE.
#
# NOTE(review): this assumes $SCHIZO_TARGET_DIR does not exist yet
# (fresh checkout); if it did, cp would nest the source directory
# inside it instead of replacing it.

cp -rp "$SCHIZO_SRC_DIR" "$SCHIZO_TARGET_DIR"

# Only copy the PRRTE RST source files in prrte-rst-content that are
# referenced by ".. include::" in the schizo-ompi-cli.rstxt file.  We do
# this because Sphinx complains if there are .rst files that are not
# referenced. :-(

mkdir -p "$PRRTE_RST_TARGET_DIR"

# The third whitespace-separated field of each ".. include:: <path>"
# line is the included path.  Because of "set -e" and pipefail, the
# script aborts here if grep finds no include lines at all.
files=$(grep -F '.. include::' "$SCHIZO_TARGET_DIR/schizo-ompi-cli.rstxt" | awk '{ print $3 }')

# $files is intentionally unquoted: it holds one path per
# whitespace-separated word.
for file in $files; do
    # Only the base name is used: the include paths are looked up in
    # the PRRTE source directory, wherever they point in the .rstxt.
    filename=$(basename "$file")
    cp -pf "$PRRTE_RST_SRC_DIR/$filename" "$PRRTE_RST_TARGET_DIR"
done
12 changes: 12 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,20 @@ build:
os: ubuntu-22.04
tools:
python: "3.10"
jobs:
# RTD doesn't run configure or make. So we have to manually copy
# in the PRRTE RST files to docs/.
pre_create_environment:
- ./.readthedocs-pre-create-environment.sh

python:
install:
- requirements: docs/requirements.txt

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
fail_on_warning: true

submodules:
include: all
2 changes: 1 addition & 1 deletion 3rd-party/openpmix
Submodule openpmix updated 108 files
2 changes: 1 addition & 1 deletion 3rd-party/prrte
Submodule prrte updated 186 files
9 changes: 9 additions & 0 deletions Makefile.ompi-rules
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2020 Intel, Inc. All rights reserved.
# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
Expand All @@ -26,6 +27,14 @@ OMPI_V_GEN = $(ompi__v_GEN_$V)
ompi__v_GEN_ = $(ompi__v_GEN_$AM_DEFAULT_VERBOSITY)
ompi__v_GEN_0 = @echo " GENERATE" $@;

OMPI_V_COPYALL = $(ompi__v_COPYALL_$V)
ompi__v_COPYALL_ = $(ompi__v_COPYALL_$AM_DEFAULT_VERBOSITY)
ompi__v_COPYALL_0 = @echo " COPY tree $@";

OMPI_V_SPHINX_COPYRST = $(ompi__v_SPHINX_COPYRST_$V)
ompi__v_SPHINX_COPYRST_ = $(ompi__v_SPHINX_COPYRST_$AM_DEFAULT_VERBOSITY)
ompi__v_SPHINX_COPYRST_0 = @echo " COPY RST source files";

OMPI_V_SPHINX_HTML = $(ompi__v_SPHINX_HTML_$V)
ompi__v_SPHINX_HTML_ = $(ompi__v_SPHINX_HTML_$AM_DEFAULT_VERBOSITY)
ompi__v_SPHINX_HTML_0 = @echo " GENERATE HTML docs";
Expand Down
4 changes: 2 additions & 2 deletions VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ mpi_standard_subversion=1

# OMPI required dependency versions.
# List in x.y.z format.
pmix_min_version=4.1.2
prte_min_version=2.0.2
pmix_min_version=4.2.0
prte_min_version=3.0.0
hwloc_min_version=1.11.0
event_min_version=2.0.21
automake_min_version=1.13.4
Expand Down
2 changes: 1 addition & 1 deletion autogen.pl
Original file line number Diff line number Diff line change
Expand Up @@ -923,7 +923,7 @@ sub patch_autotools_output {
'# ICC 10 doesn\047t accept -KPIC any more.\n.*\n\s+' .
"lt_prog_compiler_wl${tag}=";
my $replace_string = "# Flang compiler
*flang)
*flang*)
lt_prog_compiler_wl${tag}='-Wl,'
lt_prog_compiler_pic${tag}='-fPIC -DPIC'
lt_prog_compiler_static${tag}='-static'
Expand Down
7 changes: 5 additions & 2 deletions config/ompi_check_ucx.m4
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[
UCP_ATOMIC_FETCH_OP_FXOR,
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN,
UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK,
UCP_OP_ATTR_FLAG_MULTI_SEND],
UCP_OP_ATTR_FLAG_MULTI_SEND,
UCS_MEMORY_TYPE_RDMA,
UCP_MEM_MAP_SYMMETRIC_RKEY],
[], [],
[#include <ucp/api/ucp.h>])
AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS],
Expand All @@ -123,7 +125,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[
[#include <ucp/api/ucp.h>])
AC_CHECK_DECLS([ucp_tag_send_nbx,
ucp_tag_send_sync_nbx,
ucp_tag_recv_nbx],
ucp_tag_recv_nbx,
ucp_rkey_compare],
[], [],
[#include <ucp/api/ucp.h>])
AC_CHECK_TYPES([ucp_request_param_t],
Expand Down
59 changes: 52 additions & 7 deletions config/ompi_setup_prrte.m4
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dnl Copyright (c) 2019-2020 Intel, Inc. All rights reserved.
dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
dnl Copyright (c) 2021 Nanook Consulting. All rights reserved.
dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved.
dnl Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
Expand All @@ -35,10 +36,25 @@ dnl
dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the
dnl results of the build.
AC_DEFUN([OMPI_SETUP_PRRTE],[
OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy])
AC_REQUIRE([AC_PROG_LN_S])
OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir])
opal_show_subtitle "Configuring PRRTE"
# We *must* have set up Sphinx before invoking this macro (i.e., it
# is a programming error -- not a run-time error -- if Sphinx was
# not previously set up).
OAC_ASSERT_BEFORE([OAC_SETUP_SPHINX], [OMPI_SETUP_PRRTE])
# These are sym links to folders with PRRTE's RST files that we'll
# slurp into mpirun.1.rst. We'll remove these links (or even
# accidental full copies) now and replace them with new links to
# the PRRTE that we find, below.
target_rst_dir="$OMPI_TOP_BUILDDIR/docs"
rm -rf "$target_rst_dir/prrte-rst-content"
rm -rf "$target_rst_dir/schizo-ompi-rst-content"
OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1])
AC_ARG_WITH([prrte-bindir],
Expand Down Expand Up @@ -101,12 +117,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
[$OMPI_USING_INTERNAL_PRRTE],
[Whether or not we are using the internal PRRTE])
OPAL_SUMMARY_ADD([Miscellaneous], [prrte], [], [$opal_prrte_mode])
AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR)
AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)
AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1])
OPAL_SUMMARY_ADD([Miscellaneous], [PRRTE], [], [$opal_prrte_mode])
OPAL_VAR_SCOPE_POP
])


dnl _OMPI_SETUP_PRRTE_INTERNAL([action-if-success], [action-if-not-success])
dnl
dnl Attempt to configure the built-in PRRTE.
Expand Down Expand Up @@ -220,7 +239,15 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL], [
[AC_MSG_ERROR([PRRTE configuration failed. Cannot continue.])])
AS_IF([test "$internal_prrte_happy" = "yes"],
[$1], [$2])
[AC_MSG_CHECKING([for internal PRRTE RST files])
AS_IF([test -n "$SPHINX_BUILD"],
[OMPI_HAVE_PRRTE_RST=1
OMPI_PRRTE_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/docs/prrte-rst-content"
OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/mca/schizo/ompi"
AC_MSG_RESULT([found])],
[AC_MSG_RESULT([not found])])
$1],
[$2])
OPAL_VAR_SCOPE_POP
])
Expand Down Expand Up @@ -284,9 +311,27 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
[AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])])
AS_IF([test "$setup_prrte_external_happy" = "yes"],
[$1], [$2])
[ # Determine if this external PRRTE has installed the RST
# directories that we care about
AC_MSG_CHECKING([for external PRRTE RST files])
prrte_install_dir=${with_prrte}/share/prte/rst
AS_IF([test -n "$SPHINX_BUILD"],
[AS_IF([test -d "$prrte_install_dir/prrte-rst-content" && \
test -d "$prrte_install_dir/schizo-ompi-rst-content"],
[OMPI_HAVE_PRRTE_RST=1
OMPI_PRRTE_RST_CONTENT_DIR="$prrte_install_dir/prrte-rst-content"
OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$prrte_install_dir/schizo-ompi-rst-content"
AC_MSG_RESULT([found])
],
[ # This version of PRRTE doesn't have installed RST
# files.
AC_MSG_RESULT([not found])
OMPI_HAVE_PRRTE_RST=0
])
])
$1],
[$2])
OPAL_VAR_SCOPE_POP
])


13 changes: 12 additions & 1 deletion config/opal_check_ofi.m4
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
dnl reserved.
dnl Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
dnl Copyright (c) 2023 Triad National Security, LLC. All rights
dnl reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
Expand Down Expand Up @@ -155,7 +157,16 @@ AC_DEFUN([OPAL_CHECK_OFI],[
AC_DEFINE_UNQUOTED([OPAL_OFI_HAVE_FI_HMEM_ROCR],
[${opal_check_fi_hmem_rocr}],
[check if FI_HMEM_ROCR avaiable in fi_hmem_iface])])
[check if FI_HMEM_ROCR avaiable in fi_hmem_iface])
AC_CHECK_DECL([FI_HMEM_ZE],
[opal_check_fi_hmem_ze=1],
[opal_check_fi_hmem_ze=0],
[#include <rdma/fi_domain.h>])
AC_DEFINE_UNQUOTED([OPAL_OFI_HAVE_FI_HMEM_ZE],
[${opal_check_fi_hmem_ze}],
[check if FI_HMEM_ZE avaiable in fi_hmem_iface])])
CPPFLAGS=${opal_check_ofi_save_CPPFLAGS}
LDFLAGS=${opal_check_ofi_save_LDFLAGS}
Expand Down
12 changes: 4 additions & 8 deletions config/opal_check_rocm.m4
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
[ with_rocm="/opt/rocm"] )
rocm_CPPFLAGS="-D__HIP_PLATFORM_AMD__"
rocm_LDFLAGS="-L${with_rocm}/lib/hip"
rocm_LDFLAGS="-L${with_rocm}/lib/"
AS_IF([ test -n "$with_rocm" && test "$with_rocm" != "no" ],
[ OPAL_APPEND([CPPFLAGS], [$rocm_CPPFLAGS])
Expand All @@ -52,15 +52,11 @@ AC_DEFUN([OPAL_CHECK_ROCM],[
LDFLAGS="$rocm_save_LDFLAGS"
LIBS="$rocm_save_LIBS"
OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] )
OPAL_APPEND([LDFLAGS], [${$1_LDFLAGS}] )
OPAL_APPEND([LIBS], [${$1_LIBS}] )
AS_IF([ test "$opal_check_rocm_happy" = "no" ],
[ CPPFLAGS="$rocm_save_CPPFLAGS"])
CPPFLAGS="$rocm_save_CPPFLAGS"
AS_IF([ test "$opal_check_rocm_happy" = "yes" ],
[ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support])
[ OPAL_APPEND([$1_CPPFLAGS], [$rocm_CPPFLAGS])
AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support])
ROCM_SUPPORT=1 ],
[ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support])
ROCM_SUPPORT=0 ])
Expand Down
Loading

0 comments on commit f14fe7b

Please sign in to comment.