From 0bb10c42bbafaaf35d9a0d3b9bad5724a00fc1c8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 19 Dec 2022 16:51:55 +0100 Subject: [PATCH 01/73] coll/han: reorder free calls and avoid read-after-free in debug builds Inside the main loop, whenever we read a new string we free it first. The first iteration will be free(NULL), which is legal. At the end, we free all strings in all paths. This removes a potential read-after-free in a debug build and removes some calls to free from the error paths. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/han/coll_han_dynamic_file.c | 38 ++++++++--------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index c41cf6280fc..0500cb99a90 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018-2020 The University of Tennessee and The University + * Copyright (c) 2018-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved @@ -65,6 +65,7 @@ mca_coll_han_init_dynamic_rules(void) int algorithm_id; char * coll_name = NULL; char * algorithm_name = NULL; + char * target_comp_name = NULL; collective_rule_t *coll_rules; /* Topo information */ @@ -135,6 +136,7 @@ mca_coll_han_init_dynamic_rules(void) mca_coll_han_component.dynamic_rules.nb_collectives = i+1; /* Get the collective identifier */ + free(coll_name); if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." 
@@ -155,9 +157,7 @@ mca_coll_han_init_dynamic_rules(void) coll_name, fileline, ALLGATHER, COLLCOUNT); goto file_reading_error; } - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); coll_name = strdup(mca_coll_base_colltype_to_str(coll_id)); } @@ -321,7 +321,6 @@ mca_coll_han_init_dynamic_rules(void) /* Iterate on message size rules */ for( l = 0; l < nb_msg_size; l++ ) { - char* target_comp_name = NULL; conf_rules[k].nb_msg_size = l+1; /* Get the message size */ @@ -338,6 +337,7 @@ mca_coll_han_init_dynamic_rules(void) } /* Get the component identifier for this message size rule */ + free(target_comp_name); if( getnext_string(fptr, &target_comp_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " @@ -353,8 +353,6 @@ mca_coll_han_init_dynamic_rules(void) "reader encountered an unexpected EOF. Collective component id must be at " "least %d and less than %d\n", fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } @@ -362,13 +360,13 @@ mca_coll_han_init_dynamic_rules(void) algorithm_id = 0; // default for all collectives if ((component == HAN) && (1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '@')) ) { + free(algorithm_name); + algorithm_name = NULL; if( getnext_string(fptr, &algorithm_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " "at line %d: cannot read the name/id of an algorithm\n", fname, fileline); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } algorithm_id = mca_coll_han_algorithm_name_to_id(coll_id, algorithm_name); @@ -376,15 +374,11 @@ mca_coll_han_init_dynamic_rules(void) char *endp; algorithm_id = (int)strtol(algorithm_name, &endp, 10); char endc = *endp; - free(algorithm_name); - algorithm_name = 
NULL; if (('\0' != endc ) || !mca_coll_han_algorithm_id_is_valid(coll_id, algorithm_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " "at line %d: unknown algorithm '%s' for %s\n", fname, fileline, algorithm_name, coll_name); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } } @@ -422,19 +416,13 @@ mca_coll_han_init_dynamic_rules(void) "file %s line %d found end of file while reading the optional list " "of segment lengths for collective %s component %s\n", fname, fileline, coll_name, target_comp_name); - free(target_comp_name); goto file_reading_error; } } } - free(target_comp_name); } } } - if( NULL != coll_name ) { - free(coll_name); - coll_name = NULL; - } } if( getnext_long(fptr, &nb_coll) > 0 ) { @@ -455,7 +443,9 @@ mca_coll_han_init_dynamic_rules(void) fclose(fptr); check_dynamic_rules(); + free(coll_name); free(algorithm_name); + free(target_comp_name); return OMPI_SUCCESS; cannot_allocate: @@ -465,10 +455,9 @@ mca_coll_han_init_dynamic_rules(void) opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "cannot allocate dynamic rules\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); @@ -481,10 +470,9 @@ mca_coll_han_init_dynamic_rules(void) "Will use mca parameters defined rules. 
" "To see error detail, please set " "collective verbosity level over 5\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); From 30ec7e1e68560f94dcd48d45e848149ae2296bbb Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 14 Jul 2023 09:26:55 -0500 Subject: [PATCH 02/73] docs: update list of deprecated/removed functions This commit address issue #10099 Update documentation with the list of depreacted functions in MPI 2.2 and MPI-4.0. Add some examples and explanation on how to convert code from the deprecated interfaces to the new ones. Add a new section on ABI compatibility to Open MPI 4.1 Add text about Fortran ABI issues. Signed-off-by: Edgar Gabriel Co-authored-by: Jeff Squyres --- docs/building-apps/abi-compatibility.rst | 59 +++++ docs/building-apps/deprecation-warnings.rst | 250 +++++++++++++++++- docs/building-apps/index.rst | 1 + docs/building-apps/removed-mpi-constructs.rst | 50 ++-- 4 files changed, 325 insertions(+), 35 deletions(-) create mode 100644 docs/building-apps/abi-compatibility.rst diff --git a/docs/building-apps/abi-compatibility.rst b/docs/building-apps/abi-compatibility.rst new file mode 100644 index 00000000000..50fbf621b5e --- /dev/null +++ b/docs/building-apps/abi-compatibility.rst @@ -0,0 +1,59 @@ +.. _label-binary-compatibility: + +ABI compatibility to previous versions of Open MPI +================================================== + +The Open MPI |ompi_series| series maintains Application Binary Interface (ABI) +compatibility for the C MPI bindings to the last major Open MPI release. Specifically, an +application compiled with Open MPI v4.x can be executed with Open MPI +|ompi_series| without having to recompile the application. + +.. important:: ABI is maintained for *most* of the Fortran MPI bindings, too -- see below for additional information. 
+ +There are however a few scenarios where an application compiled with +Open MPI v4.x might not execute correctly with Open MPI 5.0. + +- Fortran compilers provide varying degrees of ABI guarantees between + their releases. As such, Open MPI can only provide ABI guarantees + with MPI applications that use the Fortran MPI bindings when all + three entities |mdash| Open MPI v4.x, Open MPI |ompi_series|, and + the MPI application |mdash| were built with exactly the same Fortran + compiler and environment. + + If these conditions are met, Open MPI's ABI guarantees between the + Open MPI v4.x and |ompi_series| series are in effect, with the + exception of the following cases: + + * When using the Fortran ``mpi`` module bindings with GCC compiler + versions prior to v4.8, non-compliant Fortran interfaces for + ``MPI_Comm_spawn_multiple``, ``MPI_Testall``, ``MPI_Testsome``, + ``MPI_Waitall``, and ``MPI_Waitsome`` were removed starting with + Open MPI v5.0.0. + + * When using the Fortran ``mpi`` module with modern Fortran + compilers (e.g., GNU Fortran >= v4.9), Open MPI v5.0.0 removed the + names from the MPI interfaces when there is only a single + subroutine in the interface, and that subroutine name exactly + matches the interface name. This change is likely to make Open MPI + |ompi_series|'s ``mpi`` module bindings *less* restrictive than + Open MPI v4.x, but it *may* also have ABI implications, depending + on your Fortran compiler. + + `See this git commit message + `_ + for more details. + + .. important:: This is likely a compiler-specific issue, and may + need to be tested in your environment. + + * When using the Fortran ``mpi_f08`` module bindings in an + environment where a Fortran ``INTEGER`` is 8 bytes but a C ``int`` + is 4 bytes, the size of a ``Type(MPI_Status)`` was increased + starting with Open MPI v5.0.0. + +- Open MPI v5.0.0 removed support for the MPI C++ bindings. 
If an application + was using the deprecated and now removed C++ bindings, it will not + be able to compile or execute with Open MPI v5.0.0. For details on deprecated and + removed functions see :ref:`Removed MPI constructs + ` and :ref:`Deprecation warnings + ` diff --git a/docs/building-apps/deprecation-warnings.rst b/docs/building-apps/deprecation-warnings.rst index c0801f5b661..a7f3f46de9a 100644 --- a/docs/building-apps/deprecation-warnings.rst +++ b/docs/building-apps/deprecation-warnings.rst @@ -1,9 +1,11 @@ +.. _label-deprecated-functions: + Deprecation warnings while compiling MPI applications ===================================================== If you see deprecation warnings when compiling MPI applications, it is because your application is symbols / functions that are deprecated in -MPI. For example: +MPI. For example: .. code-block:: sh @@ -25,16 +27,240 @@ advises you to use ``MPI_Comm_delete_attr()`` instead of Also, note that even if Open MPI was configured with ``--enable-mpi1-compatibility`` to re-enable removed MPI-1 symbols, you will still get compiler warnings when you use the removed symbols. -For example: -.. code-block:: sh +The following is a list of functions that have been deprecated in MPI, +and the function that is replacing them. Some functions have been +deprecated and removed from the MPI specification, these functions are +listed :ref:`here `. + +.. 
list-table:: + :header-rows: 1 + + * - Deprecated symbol + + (click for more details, below) + - Replaced with + + (click to go to the corresponding man page) + - MPI version deprecating the symbol + + * - :ref:`MPI_KEYVAL_CREATE ` + - :ref:`MPI_COMM_CREATE_KEYVAL ` + - MPI-2.0 (1996) + + * - :ref:`MPI_KEYVAL_FREE ` + - :ref:`MPI_COMM_FREE_KEYVAL ` + - MPI-2.0 (1996) + + * - :ref:`MPI_COPY_FUNCTION ` + - :ref:`MPI_COMM_COPY_ATTR_FUNCTION ` + - MPI-2.0 (1996) + + * - :ref:`MPI_DELETE_FUNCTION ` + - :ref:`MPI_COMM_DELETE_ATTR_FUNCTION ` + - MPI-2.0 (1996) + + * - :ref:`MPI_ATTR_PUT ` + - :ref:`MPI_COMM_SET_ATTR ` + - MPI-2.0 (1996) + + * - :ref:`MPI_ATTR_GET ` + - :ref:`MPI_COMM_GET_ATTR ` + - MPI-2.0 (1996) + + * - :ref:`MPI_ATTR_DELETE ` + - :ref:`MPI_COMM_DELETE_ATTR ` + - MPI-2.0 (1996) + + * - :ref:`MPI_Comm_errhandler_fn ` + - :ref:`MPI_Comm_errhandler_function ` + - MPI-2.2 (2009) + + * - :ref:`MPI_File_errhandler_fn ` + - :ref:`MPI_File_errhandler_function ` + - MPI-2.2 (2009) + + * - :ref:`MPI_Win_errhandler_fn ` + - :ref:`MPI_Win_errhandler_function ` + - MPI-2.2 (2009) + + * - :ref:`MPI_INFO_GET ` + - :ref:`MPI_INFO_GET_STRING ` + - MPI-4.0 (2021) + + * - :ref:`MPI_INFO_GET_VALUELEN ` + - :ref:`MPI_INFO_GET_STRING ` + - MPI-4.0 (2021) + + * - :ref:`MPI_SIZEOF ` + - Fortran intrinsics ``c_sizeof`` or ``storage_size`` + - MPI-4.0 (2021) + +.. _label-mpi-keyval-create: + +MPI_Keyval_create +----------------- + +``MPI_Keyval_create`` has been deprecated and replaced by +``MPI_Comm_create_keyval``. The C binding of the new function is +identical to the deprecated version. Hence, applications can simply +replace the function that is being invoked. + +The Fortran binding differs in that the ``extra_state`` argument is an +address-sized integer in the new interfaces (vs. a regular integer in +the old interfaces). Also, the copy and delete callback functions have +Fortran bindings that are consistent with address-sized attributes. + +.. 
code-block:: Fortran + + USE mpi + EXTERNAL my_copy_attr_function + EXTERNAL my_copy_delete_function + INTEGER ierror + INTEGER comm_keyval + INTEGER old_extra_state + INTEGER(KIND=MPI_ADDRESS_KIND) new_extra_state + + ! Old way + CALL MPI_KEYVAL_CREATE(my_copy_attr_function, my_copy_delete_function, + comm_keyval, old_extra_state, ierror) + + ! New way + CALL MPI_COMM_CREATE_KEYVAL(my_copy_attr_function, my_delete_attr_function, + comm_keyval, new_extra_state, ierror) + + +.. _label-mpi-keyval-free: + +MPI_Keyval_free +---------------- + +The bindings of ``MPI_Keyval_free`` and ``MPI_Comm_free_keyval`` are identical +for both C and Fortran. Users can directly replace the deprecated function with its +new version. + +.. _label-mpi-copy-delete-function: + +MPI_Copy_function and MPI_Delete_function +------------------------------------------ + +The ``MPI_Copy_function`` and ``MPI_Delete_function`` are only used in the +deprecated function ``MPI_Keyval_create()``, as described in the +:ref:`MPI_COMM_CREATE_KEYVAL `. + +For C codes, developers can simply use the new, exactly-equivalent +type name (i.e., the return type, number, and type of parameters +didn't change) ``MPI_Comm_copy_attr_function``, and +``MPI_Comm_delete_attr_function`` respectively. + +For Fortran applications, the only difference lies in the required integer type for the +``extra_state`` argument, which now has to be an address-sized integer. + +.. _label-mpi-attr-put: + +MPI_Attr_put +------------ + +The C binding for the deprecated ``MPI_Attr_put`` is identical to its +replacement, ``MPI_Comm_set_attr``. The Fortran bindings differ in the +usage of an address-sized integer for the attribute value in the new +``MPI_Comm_set_attr`` vs. a regular integer in ``MPI_Attr_put``. + +.. code-block:: Fortran + + USE mpi + INTEGER ierror + INTEGER comm_keyval + INTEGER old_attr_val + INTEGER(KIND=MPI_ADDRESS_KIND) new_attr_val + + ! 
Old way + CALL MPI_ATTR_PUT(MPI_COMM_WORLD, comm_keyval, + old_attr_val, ierror) + + ! New way + CALL MPI_COMM_SET_ATTR(MPI_COMM_WORLD, comm_keyval, + new_attr_val, ierror) + +.. _label-mpi-attr-get: + +MPI_Attr_get +------------ + +The C bindings of the old and the new interfaces are identical. +The Fortran bindings differ in the usage of an address-sized integer for +the attribute value in the new ``MPI_Comm_get_attr`` vs. a regular +integer in ``MPI_Attr_get``. + +.. _label-mpi-attr-delete: + +MPI_Attr_delete +--------------- + +C and Fortran bindings are identical for ``MPI_Attr_delete`` and +``MPI_Comm_delete_attr``, hence developers should be able to just +directly substitute one function call by the other. + + +.. _label-mpi-info-get: + +MPI_Info_get +------------ + +Applications should replace the use of ``MPI_Info_get`` with ``MPI_Info_get_string``, +but the usage differs slightly. See the example below. + +.. code-block:: c++ + + MPI_Info info; + + // Create an info object using MPI_Info_create() + ... + + // Retrieve the value of a provided key later in the code + char key[] = "my_key"; + char value[64]; + int valuelen=64; + int flag; + + // Old way + MPI_Info_get(info, key, valuelen, &value, &flag); + + // New way + // Note that we pass the address of valuelen with + // the new interfaces, since the variable will + // contain the length of the value string after + // the function call. + MPI_Info_get_string(info, key, &valuelen, &value, &flag); + } + +.. _label-mpi-info-get-valuelen: + +MPI_Info_get_valuelen +--------------------- + +``MPI_Info_get_valuelen`` has been deprecated since the new function +``MPI_Info_get_string`` also returns the length of the value string. +Please refer to the example shown in :ref:`MPI_INFO_GET `. + +.. 
_label-mpi-sizeof: + +MPI_Sizeof +---------- + +The ``MPI_SIZEOF`` construct in Fortran has been deprecated since there +are standard Fortran language constructs such as ``c_sizeof`` and +``storage_size`` that can be used instead. + +.. _label-errhandler-fn: + +MPI_Comm_errhandler_fn, MPI_File_errhandler_fn, MPI_Win_errhandler_fn +--------------------------------------------------------------------- + +The following function typedefs have been deprecated and are superseded by new +names. Other than the typedef names, the function signatures are exactly the same; the +names were updated to match conventions of other function typedef names. - shell$ mpicc deleted-example.c -c - deleted-example.c: In function 'foo': - deleted-example.c:8:5: warning: 'MPI_Address' is deprecated: MPI_Address was removed in MPI-3.0; use MPI_Get_address instead. [-Wdeleted-declarations] - MPI_Address(buffer, &address); - ^~~~~~~~~~~ - In file included from deleted-example.c:2: - /usr/local/openmpi/include/mpi.h:2689:20: note: declared here - OMPI_DECLSPEC int MPI_Address(void *location, MPI_Aint *address) - ^~~~~~~~~~~ +* ``MPI_Comm_errhandler_fn`` |rarrow| ``MPI_Comm_errhandler_function`` +* ``MPI_File_errhandler_fn`` |rarrow| ``MPI_File_errhandler_function`` +* ``MPI_Win_errhandler_fn`` |rarrow| ``MPI_Win_errhandler_function`` diff --git a/docs/building-apps/index.rst b/docs/building-apps/index.rst index 649dedee626..5c140be17e0 100644 --- a/docs/building-apps/index.rst +++ b/docs/building-apps/index.rst @@ -10,6 +10,7 @@ Open MPI "wrapper" compilers. 
quickstart customizing-wrappers extracting-wrapper-flags + abi-compatibility removed-mpi-constructs deprecation-warnings building-static-apps diff --git a/docs/building-apps/removed-mpi-constructs.rst b/docs/building-apps/removed-mpi-constructs.rst index 81229c7cb72..4dfa5b77293 100644 --- a/docs/building-apps/removed-mpi-constructs.rst +++ b/docs/building-apps/removed-mpi-constructs.rst @@ -3,9 +3,6 @@ Removed MPI constructs ====================== -.. error:: **TODO This section needs to be renamed/updated for the - 5.0.0 behavior.** - Starting with v4.0.0, Open MPI |mdash| by default |mdash| removes the prototypes from ``mpi.h`` for MPI symbols that were deprecated in 1996 in the MPI-2.0 standard, and finally removed from the MPI-3.0 standard @@ -110,18 +107,23 @@ default: - MPI-3.0 (2012) Although these symbols are no longer prototyped in ``mpi.h``, *they are -still present in the MPI library in Open MPI v4.0.x*. This enables +still present in the MPI library in Open MPI* |ompi_series|. This enables legacy MPI applications to *link and run* successfully with Open MPI -v4.0.x, even though they will fail to *compile*. +|ompi_series|, even though they will fail to *compile*. + +Furthermore, the MPI C++ interfaces were deprecated in version +2.2, and then were removed in MPI-3.0. Starting +from v5.0.0, Open MPI does not support the C++ interfaces +any more. Users who would like to continue using the C++ interfaces of +MPI will need to use an older release of Open MPI. .. warning:: The Open MPI team **strongly** encourages all MPI application developers to stop using these constructs that were first deprecated over 20 years ago, and finally removed from the MPI specification in MPI-3.0 (in 2012). -The FAQ items in this category -show how to update your application to stop using these removed -symbols. +The FAQ items in this category show how to update your application to +stop using these removed symbols. 
All that being said, if you are unable to immediately update your application to stop using these removed MPI-1 symbols, you can @@ -131,22 +133,24 @@ re-enable them in ``mpi.h`` by configuring Open MPI with the .. note:: Future releases of Open MPI may remove these symbols altogether. -Why on earth are you breaking the compilation of MPI applications? +Why is Open MPI breaking the compilation of MPI applications? ------------------------------------------------------------------ -.. error:: **TODO This section needs to be renamed/updated (or - deleted?) for the 5.0.0 behavior.** - The Open MPI developer community decided to take a first step of -removing the prototypes for these symbols from ``mpi.h`` starting with -the Open MPI v4.0.x series for the following reasons: - -#. These symbols have been deprecated since *1996.* It's time to start - raising awareness for developers who are inadvertently still using - these removed symbols. -#. The MPI Forum removed these symbols from the MPI-3.0 specification - in 2012. This is a sign that the Forum itself recognizes that - these removed symbols are no longer needed. +removing prototypes of deprecated functions from ``mpi.h`` starting +with the Open MPI v4.0.x series for the following reasons: + +#. The first set of symbols have been deprecated since *1996.* It's + time to start raising awareness for developers who are + inadvertently still using these removed symbols. +#. The MPI Forum removed a substantial set of symbols from the MPI-3.0 + specification in 2012. This is a sign that the Forum itself + recognizes that these removed symbols are no longer needed. +#. More functions have been deprecated in MPI 2.2 and MPI 4.0, and + additional functions are expected to be deprecated and removed in + future MPI versions. It is in the interest of both, developers and + end-users, to minimize confusion as much as possible, and stick + closely to the MPI specification. #. 
Note that Open MPI *did not fully remove* these removed symbols: we just made it slightly more painful to get to them. This is an attempt to raise awareness so that MPI application developers can @@ -189,7 +193,7 @@ to change: MPI_Get_address(buffer, &address); In Fortran, the type of the parameter changed from ``INTEGER`` -$right_arrow ``INTEGER(KIND=MPI_ADDRESS_KIND)`` so that it can hold +|rarrow| ``INTEGER(KIND=MPI_ADDRESS_KIND)`` so that it can hold larger values (e.g., 64 bit pointers): .. code-block:: Fortran @@ -251,7 +255,7 @@ In Fortran, only the subroutine name changed: ! Old way CALL MPI_ERRHANDLER_CREATE(my_errhandler_function, my_handler, ierror) - ! Old way + ! New way CALL MPI_COMM_CREATE_ERRHANDLER(my_errhandler_function, my_handler, ierror) .. _label-mpi-errhandler-get: From 0c2fe123061ba88bf052fc5043c7aa7b406ee3f9 Mon Sep 17 00:00:00 2001 From: Lisandro Dalcin Date: Mon, 24 Jul 2023 21:13:44 +0300 Subject: [PATCH 03/73] datatype: Fix MPI_Type_dup() to propagate errors from inner calls Signed-off-by: Lisandro Dalcin --- ompi/mpi/c/type_dup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ompi/mpi/c/type_dup.c b/ompi/mpi/c/type_dup.c index bd9e125e443..27d3fae39cf 100644 --- a/ompi/mpi/c/type_dup.c +++ b/ompi/mpi/c/type_dup.c @@ -41,6 +41,8 @@ static const char FUNC_NAME[] = "MPI_Type_dup"; int MPI_Type_dup (MPI_Datatype type, MPI_Datatype *newtype) { + int ret; + MEMCHECKER( memchecker_datatype(type); ); @@ -54,10 +56,9 @@ int MPI_Type_dup (MPI_Datatype type, } } - if (OMPI_SUCCESS != ompi_datatype_duplicate( type, newtype)) { + if (OMPI_SUCCESS != (ret = ompi_datatype_duplicate( type, newtype))) { ompi_datatype_destroy( newtype ); - OMPI_ERRHANDLER_RETURN (MPI_ERR_INTERN, MPI_COMM_WORLD, - MPI_ERR_INTERN, FUNC_NAME ); + OMPI_ERRHANDLER_NOHANDLE_RETURN( ret, ret, FUNC_NAME ); } ompi_datatype_set_args( *newtype, 0, NULL, 0, NULL, 1, &type, MPI_COMBINER_DUP ); @@ -69,13 +70,12 @@ int MPI_Type_dup (MPI_Datatype 
type, copy attributes. Really. */ if (NULL != type->d_keyhash) { ompi_attr_hash_init(&(*newtype)->d_keyhash); - if (OMPI_SUCCESS != ompi_attr_copy_all(TYPE_ATTR, - type, *newtype, - type->d_keyhash, - (*newtype)->d_keyhash)) { + if (OMPI_SUCCESS != (ret = ompi_attr_copy_all(TYPE_ATTR, + type, *newtype, + type->d_keyhash, + (*newtype)->d_keyhash))) { ompi_datatype_destroy(newtype); - OMPI_ERRHANDLER_NOHANDLE_RETURN( MPI_ERR_INTERN, - MPI_ERR_INTERN, FUNC_NAME ); + OMPI_ERRHANDLER_NOHANDLE_RETURN( ret, ret, FUNC_NAME ); } } From 15ad2ec98ca8475f6852b84067e49fdfb1f24291 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Mon, 31 Jul 2023 14:11:24 +0900 Subject: [PATCH 04/73] configury: support flang-new patch auto-generated configure in order to support flang-new Signed-off-by: Gilles Gouaillardet --- autogen.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogen.pl b/autogen.pl index 5af4704f2a1..9a7527bfe6c 100755 --- a/autogen.pl +++ b/autogen.pl @@ -923,7 +923,7 @@ sub patch_autotools_output { '# ICC 10 doesn\047t accept -KPIC any more.\n.*\n\s+' . "lt_prog_compiler_wl${tag}="; my $replace_string = "# Flang compiler - *flang) + *flang*) lt_prog_compiler_wl${tag}='-Wl,' lt_prog_compiler_pic${tag}='-fPIC -DPIC' lt_prog_compiler_static${tag}='-static' From 5691d80585fd43a08927247031fd64680ecbf093 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 7 Aug 2023 09:28:12 -0400 Subject: [PATCH 05/73] docs: update readthedocs.org config RTD is removing the option to use pre-installed packages. Instead, we just need to tell them where our Python requirements.txt file is located so that they'll install exactly those packages. 
Signed-off-by: Jeff Squyres --- .readthedocs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f26f84cf5f7..44e0bbac5a7 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,6 +13,10 @@ build: tools: python: "3.10" +python: + install: + - requirements: docs/requirements.txt + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py From 21e3d1170e57ffe39fb6310ea2d33b3c1bd62fef Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 21 Jun 2023 00:33:15 -0400 Subject: [PATCH 06/73] Allow the monitoring infrastructure to be compiled. Signed-off-by: George Bosilca --- ompi/mca/coll/monitoring/configure.m4 | 5 +---- ompi/mca/common/monitoring/configure.m4 | 11 ++++------- ompi/mca/osc/monitoring/configure.m4 | 5 +---- ompi/mca/pml/monitoring/configure.m4 | 5 +---- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/ompi/mca/coll/monitoring/configure.m4 b/ompi/mca/coll/monitoring/configure.m4 index 008bff46994..1d0b88fcb26 100644 --- a/ompi/mca/coll/monitoring/configure.m4 +++ b/ompi/mca/coll/monitoring/configure.m4 @@ -15,9 +15,6 @@ # ------------------------------------------------ AC_DEFUN([MCA_ompi_coll_monitoring_CONFIG],[ AC_CONFIG_FILES([ompi/mca/coll/monitoring/Makefile]) - - AS_IF([test "$MCA_BUILD_ompi_common_monitoring_DSO_TRUE" = ''], - [$1], - [$2]) + [$1] ])dnl diff --git a/ompi/mca/common/monitoring/configure.m4 b/ompi/mca/common/monitoring/configure.m4 index b7632bd4b8d..11cd33b3e8c 100644 --- a/ompi/mca/common/monitoring/configure.m4 +++ b/ompi/mca/common/monitoring/configure.m4 @@ -16,13 +16,10 @@ AC_DEFUN([MCA_ompi_common_monitoring_CONFIG],[ AC_CONFIG_FILES([ompi/mca/common/monitoring/Makefile]) - m4_ifdef([project_ompi], [ - m4_ifdef([MCA_BUILD_ompi_common_monitoring_DSO_TRUE], - [AC_CONFIG_LINKS(profile2mat.pl:test/monitoring/profile2mat.pl - aggregate_profile.pl:test/monitoring/aggregate_profile.pl)])]) + m4_ifdef([project_ompi], + 
[AC_CONFIG_LINKS(test/monitoring/profile2mat.pl:ompi/mca/common/monitoring/profile2mat.pl + test/monitoring/aggregate_profile.pl:ompi/mca/common/monitoring/aggregate_profile.pl)]) - AS_IF([test "$MCA_BUILD_ompi_common_monitoring_DSO_TRUE" = ''], - [$1], - [$2]) + [$1] ])dnl diff --git a/ompi/mca/osc/monitoring/configure.m4 b/ompi/mca/osc/monitoring/configure.m4 index f3cd355b8d7..8ef5dc01e27 100644 --- a/ompi/mca/osc/monitoring/configure.m4 +++ b/ompi/mca/osc/monitoring/configure.m4 @@ -91,10 +91,7 @@ EOF AC_DEFUN( [MCA_ompi_osc_monitoring_CONFIG], [AC_CONFIG_FILES([ompi/mca/osc/monitoring/Makefile]) - - AS_IF([test "$MCA_BUILD_ompi_common_monitoring_DSO_TRUE" = ''], - [$1], - [$2]) + [$1] MCA_OMPI_OSC_MONITORING_GENERATE_TEMPLATES( [ompi/mca/osc/monitoring/osc_monitoring_template_gen.h], diff --git a/ompi/mca/pml/monitoring/configure.m4 b/ompi/mca/pml/monitoring/configure.m4 index 27815f22957..ffc2bcf5fc4 100644 --- a/ompi/mca/pml/monitoring/configure.m4 +++ b/ompi/mca/pml/monitoring/configure.m4 @@ -15,10 +15,7 @@ # ------------------------------------------------ AC_DEFUN([MCA_ompi_pml_monitoring_CONFIG],[ AC_CONFIG_FILES([ompi/mca/pml/monitoring/Makefile]) - - AS_IF([test "$MCA_BUILD_ompi_common_monitoring_DSO_TRUE" = ''], - [$1], - [$2]) + [$1] ])dnl From ba0bce4b3c9c942f0aa2e0a0fd598cb6b62d2a25 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 5 Jun 2023 23:49:30 -0400 Subject: [PATCH 07/73] Ask for the modex of all jobs connected. The original code was merging the local modex with the modex of the local processes on the first jobid. This lead to incorrect, and mismatched, information among processes when joining multiple jobid processes (such as on the second spawn merged). This patch iterate over all the jobid on the list of "to connect" processes and adds their information to the local modex. Fixes #11724. 
Signed-off-by: George Bosilca --- ompi/dpm/dpm.c | 124 +++++++++++++++++++++++++++---------------------- 1 file changed, 69 insertions(+), 55 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 5dfcd67ce8b..656a45d8a41 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -259,6 +259,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, * fail. */ if (0 >= rportlen) { rc = rportlen; + /* no need to free here, the root has already done it and everyone else has not yet allocated the rport array */ goto exit; } @@ -406,72 +407,85 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root, OPAL_LIST_DESTRUCT(&rlist); goto exit; } - if (0 < opal_list_get_size(&ilist)) { - uint32_t *peer_ranks = NULL; + if (!opal_list_is_empty(&ilist)) { int prn, nprn = 0; char *val; - uint16_t u16; opal_process_name_t wildcard_rank; + i = 0; /* start from the begining */ + /* convert the list of new procs to a proc_t array */ new_proc_list = (ompi_proc_t**)calloc(opal_list_get_size(&ilist), sizeof(ompi_proc_t *)); - /* get the list of local peers for the new procs */ - cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist); - proc = cd->p; - wildcard_rank.jobid = proc->super.proc_name.jobid; - wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; - /* retrieve the local peers */ - OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, - &wildcard_rank, &val, PMIX_STRING); - if (OPAL_SUCCESS == rc && NULL != val) { - char **peers = opal_argv_split(val, ','); - free(val); - nprn = opal_argv_count(peers); - peer_ranks = (uint32_t*)calloc(nprn, sizeof(uint32_t)); - for (prn = 0; NULL != peers[prn]; prn++) { - peer_ranks[prn] = strtoul(peers[prn], NULL, 10); - } - opal_argv_free(peers); - } - - i = 0; - OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) { + /* Extract the modex info for the first proc on the ilist, and then + * remove all processors in the same jobid from the list by getting + * their connection information and moving them into the proc array. 
+ */ + do { + uint32_t *local_ranks_in_jobid = NULL; + ompi_dpm_proct_caddy_t* next = NULL; + cd = (ompi_dpm_proct_caddy_t*)opal_list_get_first(&ilist); proc = cd->p; - new_proc_list[i] = proc ; - /* ompi_proc_complete_init_single() initializes and optionally retrieves - * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without - * them, we are just fine */ - ompi_proc_complete_init_single(proc); - /* if this proc is local, then get its locality */ - if (NULL != peer_ranks) { - for (prn=0; prn < nprn; prn++) { - if (peer_ranks[prn] == proc->super.proc_name.vpid) { - /* get their locality string */ - val = NULL; - OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, - &proc->super.proc_name, &val, PMIX_STRING); - if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) { - u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val); - free(val); - } else { - /* all we can say is that it shares our node */ - u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + wildcard_rank.jobid = proc->super.proc_name.jobid; + wildcard_rank.vpid = OMPI_NAME_WILDCARD->vpid; + /* retrieve the local peers for the specified jobid */ + OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_PEERS, + &wildcard_rank, &val, PMIX_STRING); + if (OPAL_SUCCESS == rc && NULL != val) { + char **peers = opal_argv_split(val, ','); + free(val); + nprn = opal_argv_count(peers); + local_ranks_in_jobid = (uint32_t*)calloc(nprn, sizeof(uint32_t)); + for (prn = 0; NULL != peers[prn]; prn++) { + local_ranks_in_jobid[prn] = strtoul(peers[prn], NULL, 10); + } + opal_argv_free(peers); + } + + OPAL_LIST_FOREACH_SAFE(cd, next, &ilist, ompi_dpm_proct_caddy_t) { + proc = cd->p; + if( proc->super.proc_name.jobid != wildcard_rank.jobid ) + continue; /* not a proc from this jobid */ + + new_proc_list[i] = proc; + opal_list_remove_item(&ilist, (opal_list_item_t*)cd); // TODO: do we need to release cd ? 
+ OBJ_RELEASE(cd); + /* ompi_proc_complete_init_single() initializes and optionally retrieves + * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without + * them, we are just fine */ + ompi_proc_complete_init_single(proc); + /* if this proc is local, then get its locality */ + if (NULL != local_ranks_in_jobid) { + uint16_t u16; + for (prn=0; prn < nprn; prn++) { + if (local_ranks_in_jobid[prn] == proc->super.proc_name.vpid) { + /* get their locality string */ + val = NULL; + OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING, + &proc->super.proc_name, &val, PMIX_STRING); + if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) { + u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val); + free(val); + } else { + /* all we can say is that it shares our node */ + u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; + } + proc->super.proc_flags = u16; + /* save the locality for later */ + OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name); + pval.type = PMIX_UINT16; + pval.data.uint16 = proc->super.proc_flags; + PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval); + break; } - proc->super.proc_flags = u16; - /* save the locality for later */ - OPAL_PMIX_CONVERT_NAME(&pxproc, &proc->super.proc_name); - pval.type = PMIX_UINT16; - pval.data.uint16 = proc->super.proc_flags; - PMIx_Store_internal(&pxproc, PMIX_LOCALITY, &pval); - break; } } + ++i; } - ++i; - } - if (NULL != peer_ranks) { - free(peer_ranks); - } + if (NULL != local_ranks_in_jobid) { + free(local_ranks_in_jobid); + } + } while (!opal_list_is_empty(&ilist)); + /* call add_procs on the new ones */ rc = MCA_PML_CALL(add_procs(new_proc_list, opal_list_get_size(&ilist))); free(new_proc_list); From 3484445b76e4e075270cb2178206d7b9bd1bb1af Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 16 May 2023 21:18:29 -0400 Subject: [PATCH 08/73] Allow MPI_IN_PLACE for MPI_Allreduce on intercomms. 
Signed-off-by: George Bosilca --- ompi/mca/coll/inter/coll_inter_allreduce.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/ompi/mca/coll/inter/coll_inter_allreduce.c b/ompi/mca/coll/inter/coll_inter_allreduce.c index 91ca00ff858..661143b8355 100644 --- a/ompi/mca/coll/inter/coll_inter_allreduce.c +++ b/ompi/mca/coll/inter/coll_inter_allreduce.c @@ -48,7 +48,7 @@ mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count, mca_coll_base_module_t *module) { int err, rank, root = 0; - char *tmpbuf = NULL, *pml_buffer = NULL; + char *tmpbuf = NULL, *pml_buffer = NULL, *source; ptrdiff_t gap, span; rank = ompi_comm_rank(comm); @@ -58,20 +58,21 @@ mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count, tmpbuf = (char *) malloc(span); if (NULL == tmpbuf) { - return OMPI_ERR_OUT_OF_RESOURCE; + return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = tmpbuf - gap; + source = (MPI_IN_PLACE == sbuf) ? rbuf : sbuf; - err = comm->c_local_comm->c_coll->coll_reduce(sbuf, pml_buffer, count, - dtype, op, root, - comm->c_local_comm, - comm->c_local_comm->c_coll->coll_reduce_module); + err = comm->c_local_comm->c_coll->coll_reduce(source, pml_buffer, count, + dtype, op, root, + comm->c_local_comm, + comm->c_local_comm->c_coll->coll_reduce_module); if (OMPI_SUCCESS != err) { - goto exit; + goto exit; } if (rank == root) { - /* Do a send-recv between the two root procs. to avoid deadlock */ + /* Do a send-recv between the two root procs. 
to avoid deadlock */ err = ompi_coll_base_sendrecv_actual(pml_buffer, count, dtype, 0, MCA_COLL_BASE_TAG_ALLREDUCE, rbuf, count, dtype, 0, @@ -84,8 +85,8 @@ mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count, /* bcast the message to all the local processes */ err = comm->c_local_comm->c_coll->coll_bcast(rbuf, count, dtype, - root, comm->c_local_comm, - comm->c_local_comm->c_coll->coll_bcast_module); + root, comm->c_local_comm, + comm->c_local_comm->c_coll->coll_bcast_module); if (OMPI_SUCCESS != err) { goto exit; } From 74de3364afb2d54a13db3da8c60f0427b84b7a1d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 16 May 2023 21:19:06 -0400 Subject: [PATCH 09/73] Create the remote_group to allow the creation of c_local_comm Without the remote_group the communicator is considered as an intracomm, and will lack the proper infrastructure necessary during the MPI_Comm_split_type. Signed-off-by: George Bosilca --- ompi/communicator/comm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 8721062bda5..3b66d416cea 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -241,6 +241,16 @@ int ompi_comm_set_nb (ompi_communicator_t **ncomm, ompi_communicator_t *oldcomm, newcomm->c_assertions = 0; /* Set remote group and duplicate the local comm, if applicable */ + if ((NULL == remote_group) && (NULL != remote_ranks)) { + /* determine how the list of local_rank can be stored most + efficiently */ + ret = ompi_group_incl(oldcomm->c_remote_group, remote_size, + remote_ranks, &newcomm->c_remote_group); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + remote_group = newcomm->c_remote_group; + } if ( NULL != remote_group ) { ompi_communicator_t *old_localcomm; From 544d573d3a5e0bbc01e2296e23f1bce04c388d56 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 18 Aug 2023 20:10:46 -0400 Subject: [PATCH 10/73] docs: remove a unicode character from a man page 
Signed-off-by: Jeff Squyres --- docs/man-openmpi/man3/MPI_Session_finalize.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/man-openmpi/man3/MPI_Session_finalize.3.rst b/docs/man-openmpi/man3/MPI_Session_finalize.3.rst index 7937c079f1a..81c4b367890 100644 --- a/docs/man-openmpi/man3/MPI_Session_finalize.3.rst +++ b/docs/man-openmpi/man3/MPI_Session_finalize.3.rst @@ -70,7 +70,7 @@ communications initiated by other processes. The call to :ref:`MPI_Session_finalize` does not free objects created by MPI calls; these objects are freed using MPI_XXX_FREE calls. :ref:`MPI_Session_finalize` may be synchronizing on any or all of the groups associated with communicators, -windows, or  les derived from the session and not disconnected, freed, +windows, or files derived from the session and not disconnected, freed, or closed, respectively, before the call to :ref:`MPI_Session_finalize` procedure. :ref:`MPI_Session_finalize` behaves as if all such synchronizations occur concurrently. As :ref:`MPI_Comm_free` may mark a communicator for freeing From 8e9a24f7fd255043cc035339333566d1a7ac4f51 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 21 Aug 2023 09:32:59 -0400 Subject: [PATCH 11/73] docs: fix HTML word wapping in table cells The sphinx_rtd_theme does not properly handle wrapping long lines in table cells when rendering to HTML due to a CSS issue (see https://github.com/readthedocs/sphinx_rtd_theme/issues/1505). Until the issue is fixed upstream in sphinx_rtd_theme, we can simply override the CSS here. This commit overrides the CSS in conf.py and also touches up some places where we previously tried to work around the lack of word wrapping. 
Signed-off-by: Jeff Squyres --- docs/building-apps/deprecation-warnings.rst | 2 +- docs/conf.py | 13 ++++ .../required-support-libraries.rst | 39 +++++------ docs/launching-apps/ssh.rst | 67 ++++++++++--------- 4 files changed, 69 insertions(+), 52 deletions(-) diff --git a/docs/building-apps/deprecation-warnings.rst b/docs/building-apps/deprecation-warnings.rst index a7f3f46de9a..dabf456978d 100644 --- a/docs/building-apps/deprecation-warnings.rst +++ b/docs/building-apps/deprecation-warnings.rst @@ -93,7 +93,7 @@ listed :ref:`here `. - MPI-4.0 (2021) * - :ref:`MPI_SIZEOF ` - - Fortran intrinsics``c_sizeof`` or ``storage_size`` + - Fortran intrinsics``c_sizeof`` or ``storage_size`` - MPI-4.0 (2021) .. _label-mpi-keyval-create: diff --git a/docs/conf.py b/docs/conf.py index fc7034fa40a..bf192f5356b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -225,3 +225,16 @@ def _doit(topdir): .. |deprecated_favor| replace:: this routine is deprecated in favor of """ + +# The sphinx_rtd_theme does not properly handle wrapping long lines in +# table cells when rendering to HTML due to a CSS issue (see +# https://github.com/readthedocs/sphinx_rtd_theme/issues/1505). Until +# the issue is fixed upstream in sphinx_rtd_theme, we can simply +# override the CSS here. +rst_prolog += """ +.. raw:: html + + +""" diff --git a/docs/installing-open-mpi/required-support-libraries.rst b/docs/installing-open-mpi/required-support-libraries.rst index 049c030d208..9e02297998b 100644 --- a/docs/installing-open-mpi/required-support-libraries.rst +++ b/docs/installing-open-mpi/required-support-libraries.rst @@ -7,37 +7,38 @@ Open MPI requires the following support libraries with the minimum listed versio .. list-table:: :header-rows: 1 - :widths: 10 10 25 * - Library - Minimum version - Notes * - `Hardware Locality `_ - |hwloc_min_version| - - | This library is required; Open MPI will not build without it. + - This library is required; Open MPI will not build without it. 
* - `Libevent `_ - |event_min_version| - - | This library is required; Open MPI will not build without it. + - This library is required; Open MPI will not build without it. * - `PMIx `_ - |pmix_min_version| - - | This library is required; Open MPI will not build without it. + - This library is required; Open MPI will not build without it. * - `PRRTE `_ - |prte_min_version| - - | This library is optional in some environments. PRRTE provides - | Open MPI's full-featured ``mpirun`` / ``mpiexec`` MPI - | application launchers (the two are identical; they are symbolic - | links to the same executable). - - * | If your environment uses another MPI application launcher - | (e.g., Slurm users can use the ``srun`` launcher to "direct - | launch" Open MPI applications), then the use of PRRTE is - | optional. - * | If your environment has no other MPI application launcher, then - | you need to install PRRTE and build Open MPI with PRRTE - | support. - * | Open MPI can use the copy of PRRTE embedded in its source code - | tree, or compile/link against an external PRRTE installation. - | :ref:`See this section for details about how to specify each method `. + - This library is optional in some environments. PRRTE provides + Open MPI's full-featured ``mpirun`` / ``mpiexec`` MPI + application launchers (the two are identical; they are symbolic + links to the same executable). + + * If your environment uses another MPI application launcher + (e.g., Slurm users can use the ``srun`` launcher to "direct + launch" Open MPI applications), then the use of PRRTE is + optional. + * If your environment has no other MPI application launcher, then + you need to install PRRTE and build Open MPI with PRRTE + support. + * Open MPI can use the copy of PRRTE embedded in its source + code tree, or compile/link against an external PRRTE + installation. :ref:`See this section for details about how + to specify each method + `. 
Since these support libraries are fundamental to Open MPI's operation and not universally available in all environments, they are directly diff --git a/docs/launching-apps/ssh.rst b/docs/launching-apps/ssh.rst index 6a800c30eb3..969e55fa826 100644 --- a/docs/launching-apps/ssh.rst +++ b/docs/launching-apps/ssh.rst @@ -203,35 +203,38 @@ shells are picky about the permissions of the startup file, for example). The list below contains some common shells and the startup files that they read/execute upon login: -.. error:: TODO This rendering sucks, but I couldn't make it play nice - with list-table, either. :-( - -* ``bash`` or ``zsh``: - - * **Non-interactive login:** ``$HOME/.bashrc`` if it exists. - * **Interactive login**: ``$HOME/.bash_profile`` if it exists, or - ``$HOME/.bash_login`` if it exists, or ``$HOME/.profile`` if it - exists (in that order). Note that some Linux distributions - automatically come with ``$HOME/.bash_profile`` scripts for users - that automatically execute ``$HOME/.bashrc`` as well. Consult the - bash man page for more information. - -* ``sh``: - - * **Non-interactive login:** This shell does not execute any file - automatically, so Open MPI will execute the ``$HOME/.profile`` - script before invoking Open MPI executables on remote nodes - * **Interactive login:** ``$HOME/.profile`` - -* ``csh``: - - * **Non-interactive login:** ``$HOME/.cshrc`` - * **Interactive login:** ``$HOME/.cshrc`` followed by - ``$HOME/.login`` - -* ``tcsh``: - - * **Non-interactive login:** ``$HOME/.tcshrc`` if it exists, - ``$HOME/.cshrc`` if it does not - * **Interactive login:** ``$HOME/.tcshrc`` if it exists, - ``$HOME/.cshrc`` if it does not, followed by ``$HOME/.login`` +.. list-table:: + :header-rows: 1 + + * - Shell + - Non-interactive login + - Interactive login + + * - ``bash`` or ``zsh`` + - ``$HOME/.bashrc`` if it exists. + - #. ``$HOME/.bash_profile`` if it exists, or + #. ``$HOME/.bash_login`` if it exists, or + #. 
``$HOME/.profile`` if it exists (in that order). + + Note that some Linux distributions automatically come + with ``$HOME/.bash_profile`` scripts for users that + automatically execute ``$HOME/.bashrc`` as well. Consult the + bash man page for more information. + + * - ``sh`` + - This shell does not execute any file automatically, so Open MPI + will execute the ``$HOME/.profile`` script before invoking Open + MPI executables on remote nodes + - ``$HOME/.profile`` + + * - ``csh`` + - ``$HOME/.cshrc`` + - ``$HOME/.cshrc`` followed by ``$HOME/.login`` + + * - ``tcsh`` + - #. ``$HOME/.tcshrc`` if it exists, or + #. ``$HOME/.cshrc`` if it does not + - #. ``$HOME/.tcshrc`` if it exists, or + #. ``$HOME/.cshrc`` if it does not + + Afterwards, execute ``$HOME/.login`` From 5d236e9fb322a34618f7fe64cc23f620d728ce1d Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Mon, 14 Aug 2023 18:58:20 +0000 Subject: [PATCH 12/73] Address Github issue #11532 by translating legacy parameters for direct launches Borrow code from the OMPI schizo module in PRRTE that translates legacy MCA parameters when an application is direct launched (PRRTE will translate legacy parameters when natively launched). 
Signed-off-by: Quincey Koziol --- opal/mca/pmix/base/pmix_base_fns.c | 157 ++++++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 1 deletion(-) diff --git a/opal/mca/pmix/base/pmix_base_fns.c b/opal/mca/pmix/base/pmix_base_fns.c index 8bc2b6d0e48..d4e6bef6188 100644 --- a/opal/mca/pmix/base/pmix_base_fns.c +++ b/opal/mca/pmix/base/pmix_base_fns.c @@ -30,12 +30,18 @@ #include "opal/class/opal_pointer_array.h" #include "opal/util/argv.h" +#include "opal/util/opal_environ.h" +#include "opal/util/os_path.h" #include "opal/util/output.h" +#include "opal/util/printf.h" #include "opal/util/proc.h" #include "opal_stdint.h" +#include "opal/mca/base/mca_base_vari.h" #include "opal/mca/pmix/base/base.h" +#include "src/include/pmix_frameworks.h" + int opal_pmix_base_exchange(pmix_info_t *indat, pmix_pdata_t *outdat, int timeout) { pmix_status_t rc; @@ -73,6 +79,150 @@ int opal_pmix_base_exchange(pmix_info_t *indat, pmix_pdata_t *outdat, int timeou return opal_pmix_convert_status(rc); } +static bool check_pmix_param(char *param) +{ + char *p; + size_t n; + int len; + + p = strchr(param, '_'); + len = (int)(p - param); + + if (0 == strncmp(param, "pmix", len)) { + return true; + } + for (n=0; NULL != pmix_framework_names[n]; n++) { + if (0 == strncmp(param, pmix_framework_names[n], len)) { + return true; + } + } + return false; +} + +static bool check_pmix_overlap(char *var, char *value) +{ + char *tmp; + + if (0 == strncmp(var, "dl_", 3)) { + opal_asprintf(&tmp, "PMIX_MCA_pdl_%s", &var[3]); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, value, false); + free(tmp); + return true; + } else if (0 == strncmp(var, "oob_", 4)) { + opal_asprintf(&tmp, "PMIX_MCA_ptl_%s", &var[4]); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, value, false); + free(tmp); + return true; + } else if (0 == strncmp(var, "hwloc_", 6)) { + opal_asprintf(&tmp, "PMIX_MCA_%s", var); + // set it, 
but don't overwrite if they already + // have a value in our environment + setenv(tmp, value, false); + free(tmp); + return true; + } else if (0 == strncmp(var, "if_", 3)) { + // need to convert if to pif + opal_asprintf(&tmp, "PMIX_MCA_pif_%s", &var[3]); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, value, false); + free(tmp); + return true; + } + return false; +} + +// NOTE: This code is fundamentally the same (module PMIX <-> OPAL) +// as the translate_params() routine in the PRRTE repo's +// src/mca/schizo/ompi/schizo_ompi.c file. If there are +// changes here, there are likely to be changes there. +static void translate_params(void) +{ + char *evar, *tmp, *e2; + char *file; + const char *home; + opal_list_t params; + mca_base_var_file_value_t *fv; + bool pmix_overlap; + int n, len; + + /* Since we are direct launched, we need to check the OMPI default + * MCA params to see if there is something relating to PRRTE + * in them - this would be "old" references to things from + * ORTE, as well as a few OPAL references that also impact us + * + * NOTE: we do this in the following precedence order. Note + * that we do not overwrite at any step - this is so that we + * don't overwrite something previously set by the user. So + * the order to execution is the opposite of the intended + * precedence order. + * + * 1. check the environmental paramaters for OMPI_MCA values + * that need to be translated + * + * 2. the user's home directory file as it should + * overwrite the system default file, but not the + * envars + * + * 3. 
the system default parameter file + */ + len = strlen("OMPI_MCA_"); + for (n=0; NULL != environ[n]; n++) { + if (0 == strncmp(environ[n], "OMPI_MCA_", len)) { + e2 = strdup(environ[n]); + evar = strrchr(e2, '='); + *evar = '\0'; + ++evar; + pmix_overlap = check_pmix_overlap(&e2[len], evar); + if (!pmix_overlap && check_pmix_param(&e2[len])) { + opal_asprintf(&tmp, "PMIX_MCA_%s", &e2[len]); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, evar, false); + free(tmp); + } + free(e2); + } + } + + /* try to get user's home directory */ + home = opal_home_directory(); + if (NULL != home) { + file = opal_os_path(false, home, ".openmpi", "mca-params.conf", NULL); + OBJ_CONSTRUCT(&params, opal_list_t); + mca_base_parse_paramfile(file, &params); + free(file); + OPAL_LIST_FOREACH (fv, &params, mca_base_var_file_value_t) { + pmix_overlap = check_pmix_overlap(&e2[len], evar); + if (!pmix_overlap && check_pmix_param(fv->mbvfv_var)) { + opal_asprintf(&tmp, "PMIX_MCA_%s", fv->mbvfv_var); + // set it, but don't overwrite if they already + // have a value in our environment + setenv(tmp, fv->mbvfv_value, false); + free(tmp); + } + } + OPAL_LIST_DESTRUCT(&params); + } + + /* check if the user has set OMPIHOME in their environment */ + if (NULL != (evar = getenv("OMPIHOME"))) { + /* look for the default MCA param file */ + file = opal_os_path(false, evar, "etc", "openmpi-mca-params.conf", NULL); + OBJ_CONSTRUCT(&params, opal_list_t); + mca_base_parse_paramfile(file, &params); + free(file); + OPAL_LIST_FOREACH (fv, &params, mca_base_var_file_value_t) { + check_pmix_overlap(fv->mbvfv_var, fv->mbvfv_value); + } + OPAL_LIST_DESTRUCT(&params); + } +} + typedef struct { opal_list_item_t super; pmix_nspace_t nspace; @@ -85,8 +235,13 @@ static opal_list_t localnspaces; void opal_pmix_setup_nspace_tracker(void) { /* check if we were launched by PRRTE */ - if (NULL != getenv("PRRTE_LAUNCHED")) { + if (NULL != getenv("PRTE_LAUNCHED")) { opal_process_info.nativelaunch = true; +
} else { + // When direct launched, translate MCA parameters from older releases + // into newer versions here, since PRRTE isn't involved. (When + // natively launched, PRRTE will already have translated the params) + translate_params(); } OBJ_CONSTRUCT(&localnspaces, opal_list_t); From f5f3b93483958ce6196cb394635093d6673fbd92 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Wed, 7 Jun 2023 01:23:55 +0000 Subject: [PATCH 13/73] opal/common/ofi: refactor NIC selection logic This patch refactors the OFI NIC selection logic. It foremost improves the NIC search algorithm. Instead of searching for the closest NICs on the system, this patch directly compares the distances of the given providers and selects the nearest NIC. This change also makes it explicit that if the process is unbound, or the distance cannot be reliably calculated, a provider will be selected in round-robin fashion. Signed-off-by: Wenduo Wang --- opal/mca/common/ofi/common_ofi.c | 411 +++++++++++++++---------------- 1 file changed, 199 insertions(+), 212 deletions(-) diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 1957bc80a67..f0da0f4a52c 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -440,12 +440,24 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr && !check_ep_attr(provider_info->ep_attr, provider->ep_attr) && !(provider_info->caps & ~(provider->caps)) && !(provider_info->mode & ~(provider->mode)) && provider_info->addr_format == provider->addr_format) { - return 0; + return OPAL_SUCCESS; } else { return OPAL_ERROR; } } +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_nic_pci(struct fi_info *provider, struct fi_pci_attr *pci) +{ + if (NULL != provider->nic && NULL != provider->nic->bus_attr + && FI_BUS_PCI == provider->nic->bus_attr->bus_type) { + *pci = provider->nic->bus_attr->attr.pci; + return OPAL_SUCCESS; + } + return OPAL_ERR_NOT_AVAILABLE; +} +#endif /* 
OPAL_OFI_PCI_DATA_AVAILABLE */ + /** * Calculate device distances * @@ -510,177 +522,220 @@ static int compute_dev_distances(pmix_device_distance_t **distances, } /** - * Find the nearest devices to the current thread + * @brief Get the provider distance from the provided distance metrics + * + * @param[in] topology hwloc topology + * @param[in] provider Provider object + * @param[in] distances List of known device distances + * @param[in] num_distances Length of distances + * @param[out] distance Pointer to store the provider distance + * @return OPAL_SUCCESS if and only if the distance is found in the provided list + */ +#if OPAL_OFI_PCI_DATA_AVAILABLE +static int get_provider_distance(hwloc_topology_t topology, struct fi_info *provider, + pmix_device_distance_t *distances, int num_distances, + uint16_t *distance) +{ + hwloc_obj_t pcidev, osdev; + struct fi_pci_attr pci = {0}; + + if (OPAL_SUCCESS != get_provider_nic_pci(provider, &pci)) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot determine PCI attributes of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + + pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, pci.bus_id, pci.device_id, + pci.function_id); + if (!pcidev) { + opal_output_verbose(1, opal_common_ofi.output, "Cannot locate PCI device of provider %s", + provider->domain_attr->name); + return OPAL_ERROR; + } + +#if HWLOC_API_VERSION < 0x00020000 + osdev = pcidev->first_child; +#else + osdev = pcidev->io_first_child; +#endif /* HWLOC_API_VERSION */ + for (; osdev != NULL; osdev = osdev->next_sibling) { + int i; + + if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { + const char *nguid = hwloc_obj_get_info_by_name(osdev, "NodeGUID"); + const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); + + if (!nguid && !sguid) + continue; + + for (i = 0; i < num_distances; i++) { + char lsguid[20], lnguid[20]; + int ret; + + if (PMIX_DEVTYPE_OPENFABRICS != distances[i].type) { + continue; + } + 
+ if (!distances[i].osname || !osdev->name + || strcmp(distances[i].osname, osdev->name)) + continue; + + ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); + if (ret != 2) + continue; + + if ((nguid && (0 == strcasecmp(lnguid, nguid))) + || (sguid && (0 == strcasecmp(lsguid, sguid)))) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { + const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); + if (!address) + continue; + for (i = 0; i < num_distances; i++) { + if (PMIX_DEVTYPE_NETWORK != distances[i].type) { + continue; + } + char *addr = strstr(distances[i].uuid, "://"); + if (!addr || addr + 3 > distances[i].uuid + strlen(distances[i].uuid)) + continue; + if (!strcmp(addr + 3, address)) { + *distance = distances[i].mindist; + return OPAL_SUCCESS; + } + } + } + } + + return OPAL_ERROR; +} +#else +static int get_provider_distance(struct fi_info *provider, hwloc_topology_t topology, + pmix_device_distance_t *distances, size_t num_distances, + uint16_t *distance) +{ + return OPAL_ERROR; +} +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ + +/** + * @brief Get the nearest device to the current thread * * Use the PMIx server or calculate the device distances, then out of the set of * returned distances find the subset of the nearest devices. This can be - * 1 or more. - * - * @param num_distances (OUT) number of entries in the returned array + * 0 or more. + * If there are multiple equidistant devices, break the tie using the rank. * - * @return An array of device distances which are nearest this thread - * or NULL if we fail to get the distances. In this case we will just - * revert to round robin. 
+ * @param[in] topoloy hwloc topology + * @param[in] provider_list List of providers to select from + * @param[in] num_providers Number of providers in provider_list + * @param[in] rank local rank of the process + * @param[out] provider pointer to the selected provider * + * @return OPAL_SUCCESS if and only if a nearest provider is found. */ -static pmix_device_distance_t * -get_nearest_nics(int *num_distances, pmix_value_t **valin) +static int get_nearest_nic(hwloc_topology_t topology, struct fi_info *provider_list, + size_t num_providers, uint32_t rank, struct fi_info **provider) { - size_t ndist, i; - int ret, idx = 0; + int ret; pmix_data_array_t *dptr; - uint16_t near = USHRT_MAX; + pmix_device_distance_t *distances; pmix_info_t directive; pmix_value_t *val = NULL; - pmix_device_distance_t *distances, *nearest = NULL; + size_t ndist, num_nearest = 0; + struct fi_info *current_provider = NULL; + uint16_t dists[num_providers], *dist = NULL, min_dist = USHRT_MAX; + uint32_t provider_rank = 0; PMIx_Info_load(&directive, PMIX_OPTIONAL, NULL, PMIX_BOOL); - ret = PMIx_Get(&opal_process_info.myprocid, - PMIX_DEVICE_DISTANCES, &directive, 1, &val); + ret = PMIx_Get(&opal_process_info.myprocid, PMIX_DEVICE_DISTANCES, &directive, 1, &val); PMIx_Info_destruct(&directive); if (ret != PMIX_SUCCESS || !val) { ret = compute_dev_distances(&distances, &ndist); if (ret) { + ret = OPAL_ERROR; goto out; } goto find_nearest; } if (PMIX_DATA_ARRAY != val->type) { + ret = OPAL_ERROR; goto out; } dptr = val->data.darray; if (NULL == dptr) { + ret = OPAL_ERROR; goto out; } if (PMIX_DEVICE_DIST != dptr->type) { + ret = OPAL_ERROR; goto out; } - distances = (pmix_device_distance_t*)dptr->array; + distances = (pmix_device_distance_t *) dptr->array; ndist = dptr->size; find_nearest: - nearest = calloc(sizeof(*distances), ndist); - if (!nearest) { - goto out; - } - - for (i = 0; i < ndist; i++) { - if (distances[i].type != PMIX_DEVTYPE_NETWORK && - distances[i].type != 
PMIX_DEVTYPE_OPENFABRICS) + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next, ++dist) { + if (OPAL_SUCCESS != check_provider_attr(provider_list, current_provider)) { continue; - if (distances[i].mindist < near) { - idx = 0; - near = distances[i].mindist; - nearest[idx] = distances[i]; - idx++; - } else if (distances[i].mindist == near) { - nearest[idx] = distances[i]; - idx++; + } + if (OPAL_SUCCESS != get_provider_distance(topology, current_provider, distances, ndist, dist)) { + *dist = USHRT_MAX; + } + + if (*dist < min_dist) { + min_dist = *dist; + num_nearest = 1; + } else if (*dist == min_dist) { + ++num_nearest; + } + + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider)) { + opal_output_verbose(1, opal_common_ofi.output, "provider: %s dist: %d", + current_provider->domain_attr->name, *dist); } } - *num_distances = idx; + ret = OPAL_ERROR; + if (0 >= num_nearest) { + return ret; + } + provider_rank = rank % num_nearest; + num_nearest = 0; + for (current_provider = provider_list, dist = dists; NULL != current_provider; + current_provider = current_provider->next) { + if (OPAL_SUCCESS == check_provider_attr(provider_list, current_provider) + && min_dist == *(dist++) && provider_rank == num_nearest++) { + *provider = current_provider; + ret = OPAL_SUCCESS; + goto out; + } + } out: - *valin = val; - return nearest; -} + if (val) + PMIx_Value_free(val, 1); -#if OPAL_OFI_PCI_DATA_AVAILABLE -/** - * Determine if a device is nearest - * - * Given a device distances array of the nearest pci devices, - * determine if one of these device distances refers to the pci - * device passed in - * - * @param distances (IN) distances array - * @param num_distances (IN) number of entries in the distances array - * @param topology (IN) topology of the node - * @param pci (IN) PCI device being examined - * - * @return true if the PCI device is in the distances array or if the - * 
distances array is not provided. False otherwise. - * - */ -#if HWLOC_API_VERSION < 0x00020000 -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - return true; + return ret; } -#else -static bool is_near(pmix_device_distance_t *distances, - int num_distances, - hwloc_topology_t topology, - struct fi_pci_attr pci) -{ - hwloc_obj_t pcidev, osdev; - - /* if we failed to find any distances, then we consider all interfaces - * to be of equal distances and let the caller decide how to handle - * them - */ - if (!distances) - return true; - - pcidev = hwloc_get_pcidev_by_busid(topology, pci.domain_id, - pci.bus_id, pci.device_id, - pci.function_id); - if (!pcidev) - return false; - - for(osdev = pcidev->io_first_child; osdev != NULL; osdev = osdev->next_sibling) { - int i; - - if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) { - const char *nguid = hwloc_obj_get_info_by_name(osdev,"NodeGUID"); - const char *sguid = hwloc_obj_get_info_by_name(osdev, "SysImageGUID"); - - if (!nguid && !sguid) - continue; - - for (i = 0; i < num_distances; i++) { - char lsguid[20], lnguid[20]; - int ret; - if (!distances[i].osname || !osdev->name - || strcmp(distances[i].osname, osdev->name)) - continue; +static struct fi_info *select_provider_round_robin(struct fi_info *provider_list, uint32_t rank, + size_t num_providers) +{ + uint32_t provider_rank = rank % num_providers; + struct fi_info *current_provider = provider_list; - ret = sscanf(distances[i].uuid, "fab://%19s::%19s", lnguid, lsguid); - if (ret != 2) - continue; - if (nguid && (0 == strcasecmp(lnguid, nguid))) { - return true; - } else if (sguid && (0 == strcasecmp(lsguid, sguid))) { - return true; - } - } - } else if (osdev->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK) { - const char *address = hwloc_obj_get_info_by_name(osdev, "Address"); - if (!address) - continue; - for (i = 0; i < num_distances; i++) { - char *addr = 
strstr(distances[i].uuid, "://"); - if (!addr || addr + 3 > distances[i].uuid - + strlen(distances[i].uuid)) - continue; - if (!strcmp(addr+3, address)) { - return true; - } - } - } + for (uint32_t i = 0; i < provider_rank; ++i) { + current_provider = current_provider->next; } - return false; + return current_provider; } -#endif -#endif // OPAL_OFI_PCI_DATA_AVAILABLE /* Count providers returns the number of providers present in an fi_info list * @param (IN) provider_list struct fi_info* list of providers available @@ -791,109 +846,41 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) } struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info) + opal_process_info_t *process_info) { - struct fi_info *provider = provider_list, *current_provider = provider_list; - struct fi_info **provider_table; -#if OPAL_OFI_PCI_DATA_AVAILABLE - pmix_device_distance_t *distances = NULL; - pmix_value_t *pmix_val; - struct fi_pci_attr pci; - int num_distances = 0; -#endif - bool near = false; - int ret; - unsigned int num_provider = 0, provider_limit = 0; - bool provider_found = false; + int ret, num_providers = 0; + struct fi_info *provider = NULL; uint32_t package_rank = 0; + num_providers = count_providers(provider_list); + if (!process_info->proc_is_bound || 2 > num_providers) { + goto round_robin; + } + /* Initialize opal_hwloc_topology if it is not already */ ret = opal_hwloc_base_get_topology(); if (0 > ret) { /* Provider selection can continue but there is no guarantee of locality */ - opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology\n", + opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Failed to initialize topology", __FILE__, __LINE__); } - provider_limit = count_providers(provider_list); - - /* Allocate memory for provider table */ - provider_table = calloc(provider_limit, sizeof(struct fi_info *)); - if (NULL == provider_table) { - opal_output_verbose(1, 
opal_common_ofi.output, - "%s:%d:Failed to allocate memory for provider table\n", __FILE__, - __LINE__); - return provider_list; - } + package_rank = get_package_rank(process_info); #if OPAL_OFI_PCI_DATA_AVAILABLE - /* find all the nearest devices to this thread, then out of these - * determine which device we should bind to. - */ - distances = get_nearest_nics(&num_distances, &pmix_val); -#endif - - current_provider = provider; - - /* Cycle through remaining fi_info objects, looking for alike providers */ - while (NULL != current_provider) { - if (!check_provider_attr(provider, current_provider)) { - near = false; -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != current_provider->nic - && NULL != current_provider->nic->bus_attr - && current_provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = current_provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); - } -#endif - /* We could have multiple near providers */ - if (near && !provider_found) { - provider_found = true; - num_provider = 0; - } - - /* Add the provider to the provider list if the cpusets match or if - * no other provider was found on the same cpuset as the process. 
- */ - if (near || !provider_found) { - provider_table[num_provider] = current_provider; - num_provider++; - } - } - current_provider = current_provider->next; - } - - /* Select provider from local rank % number of providers */ - if (num_provider >= 2) { - // If there are multiple NICs "close" to the process, try to calculate package_rank - package_rank = get_package_rank(process_info); - provider = provider_table[package_rank % num_provider]; - } else if (num_provider == 1) { - provider = provider_table[num_provider - 1]; - } - -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (NULL != provider->nic - && NULL != provider->nic->bus_attr - && provider->nic->bus_attr->bus_type == FI_BUS_PCI) { - pci = provider->nic->bus_attr->attr.pci; - near = is_near(distances, num_distances, - opal_hwloc_topology, pci); + ret = get_nearest_nic(opal_hwloc_topology, provider_list, num_providers, package_rank, + &provider); + if (OPAL_SUCCESS == ret) { + goto out; } -#endif +#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */ +round_robin: + provider = select_provider_round_robin(provider_list, package_rank, num_providers); +out: #if OPAL_ENABLE_DEBUG - opal_output_verbose(1, opal_common_ofi.output, - "package rank: %d device: %s near: %s\n", package_rank, - provider->domain_attr->name, near ? "true" : "false"); -#endif - - free(provider_table); -#if OPAL_OFI_PCI_DATA_AVAILABLE - if (pmix_val) - PMIx_Value_free(pmix_val, 1); + opal_output_verbose(1, opal_common_ofi.output, "package rank: %d device: %s", package_rank, + provider->domain_attr->name); #endif return provider; } From 5a7f814ac2ac4f8bf1cfe10442ca119ba04a8442 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Wed, 23 Aug 2023 00:50:07 +0000 Subject: [PATCH 14/73] coll/base,tuned: introduce allgather_reduce allreduce algorithm This patch introduces a new allreduce algorithm implemented as an allgather followed by local reduction. The change is motivated by the longer latency of tcp/EFA traffic. 
Current allreduce algorithms require a round trip to and from a selected root process. This algorithm avoids the round trip over network and therefore reduces total latency. However, this communication pattern is not scalable for large communicators, and should only be used for inter-node allreduce. Co-authored-by: Matt Koop Co-authored-by: Wenduo Wang Signed-off-by: Wenduo Wang --- ompi/mca/coll/base/coll_base_allreduce.c | 114 ++++++++++++++++++ ompi/mca/coll/base/coll_base_functions.h | 1 + .../tuned/coll_tuned_allreduce_decision.c | 3 + 3 files changed, 118 insertions(+) diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c index 95468a5c885..30ab0a4f869 100644 --- a/ompi/mca/coll/base/coll_base_allreduce.c +++ b/ompi/mca/coll/base/coll_base_allreduce.c @@ -18,6 +18,8 @@ * Copyright (c) 2018 Siberian State University of Telecommunications * and Information Science. All rights reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. + * Copyright (c) Amazon.com, Inc. or its affiliates. + * All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1245,4 +1247,116 @@ int ompi_coll_base_allreduce_intra_redscat_allgather( return err; } +/** + * A greedy algorithm to exchange data among processes in the communicator via + * an allgather pattern, followed by a local reduction on each process. This + * avoids the round trip in a rooted communication pattern, e.g. reduce on the + * root and then broadcast to peers. + * + * This algorithm supports both commutative and non-commutative MPI operations. + * For non-commutative operations the reduction is applied to the data in the + * same rank order, e.g. rank 0, rank 1, ... rank N, on each process. + * + * This algorithm benefits inter-node allreduce over a high-latency network. + * Caution is needed on larger communicators(n) and data sizes(m), which will + * result in m*n^2 total traffic and potential network congestion. 
+ */ +int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf, int count, + struct ompi_datatype_t *dtype, + struct ompi_op_t *op, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) +{ + char *send_buf = (void *) sbuf; + int comm_size = ompi_comm_size(comm); + int err = MPI_SUCCESS; + int rank = ompi_comm_rank(comm); + bool commutative = ompi_op_is_commute(op); + ompi_request_t **reqs; + + if (sbuf == MPI_IN_PLACE) { + send_buf = rbuf; + } + + /* Allocate a large-enough buffer to receive from everyone else */ + char *tmp_buf = NULL, *tmp_buf_raw = NULL, *tmp_recv = NULL; + ptrdiff_t lb, extent, dsize, gap = 0; + ompi_datatype_get_extent(dtype, &lb, &extent); + dsize = opal_datatype_span(&dtype->super, count * comm_size, &gap); + tmp_buf_raw = (char *) malloc(dsize); + if (NULL == tmp_buf_raw) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + + if (commutative) { + ompi_datatype_copy_content_same_ddt(dtype, count, (char *) rbuf, (char *) send_buf); + } + + tmp_buf = tmp_buf_raw - gap; + + /* Requests for send to AND receive from everyone else */ + int reqs_needed = (comm_size - 1) * 2; + reqs = ompi_coll_base_comm_get_reqs(module->base_data, reqs_needed); + + ptrdiff_t incr = extent * count; + tmp_recv = (char *) tmp_buf; + + /* Exchange data with peer processes */ + int req_index = 0, peer_rank = 0; + for (int i = 1; i < comm_size; ++i) { + peer_rank = (rank + i) % comm_size; + tmp_recv = tmp_buf + (peer_rank * incr); + err = MCA_PML_CALL(irecv(tmp_recv, count, dtype, peer_rank, MCA_COLL_BASE_TAG_ALLREDUCE, + comm, &reqs[req_index++])); + if (MPI_SUCCESS != err) { + goto err_hndl; + } + + err = MCA_PML_CALL(isend(send_buf, count, dtype, peer_rank, MCA_COLL_BASE_TAG_ALLREDUCE, + MCA_PML_BASE_SEND_STANDARD, comm, &reqs[req_index++])); + if (MPI_SUCCESS != err) { + goto err_hndl; + } + } + + err = ompi_request_wait_all(req_index, reqs, MPI_STATUSES_IGNORE); + + /* Prepare for local reduction */ + peer_rank = 0; + if (!commutative) 
{ + /* For non-commutative operations, ensure the reduction always starts from Rank 0's data */ + memcpy(rbuf, 0 == rank ? send_buf : tmp_buf, incr); + peer_rank = 1; + } + + char *inbuf; + for (; peer_rank < comm_size; peer_rank++) { + inbuf = rank == peer_rank ? send_buf : tmp_buf + (peer_rank * incr); + ompi_op_reduce(op, (void *) inbuf, rbuf, count, dtype); + } + +err_hndl: + if (NULL != tmp_buf_raw) + free(tmp_buf_raw); + + if (NULL != reqs) { + if (MPI_ERR_IN_STATUS == err) { + for (int i = 0; i < reqs_needed; i++) { + if (MPI_REQUEST_NULL == reqs[i]) + continue; + if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) + continue; + if (MPI_SUCCESS != reqs[i]->req_status.MPI_ERROR) { + err = reqs[i]->req_status.MPI_ERROR; + break; + } + } + } + ompi_coll_base_free_reqs(reqs, reqs_needed); + } + + /* All done */ + return err; +} + /* copied function (with appropriate renaming) ends here */ diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h index 32714445904..1c73d01d37e 100644 --- a/ompi/mca/coll/base/coll_base_functions.h +++ b/ompi/mca/coll/base/coll_base_functions.h @@ -210,6 +210,7 @@ int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS); int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize); int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS); int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS); +int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS); /* AlltoAll */ int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS); diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c index eabe6f17378..3711cdb8eb1 100644 --- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c +++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c @@ -42,6 +42,7 @@ static const mca_base_var_enum_value_t allreduce_algorithms[] = { {4, "ring"}, {5, "segmented_ring"}, {6, "rabenseifner"}, + {7, 
"allgather_reduce"}, {0, NULL} }; @@ -146,6 +147,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, int co return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize); case (6): return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module); + case (7): + return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module); } /* switch */ OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?", algorithm, ompi_coll_tuned_forced_max_algorithms[ALLREDUCE])); From 76a444127081e74b3fb9d19d8694172c7c044b43 Mon Sep 17 00:00:00 2001 From: Andrii Bilokur Date: Mon, 28 Aug 2023 00:36:55 +0300 Subject: [PATCH 15/73] Prevent runs NVIDIA_CI on the fork Signed-off-by: Andrii Bilokur --- .github/workflows/ompi_nvidia.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ompi_nvidia.yaml b/.github/workflows/ompi_nvidia.yaml index 6492d09f275..c60a3257cd0 100644 --- a/.github/workflows/ompi_nvidia.yaml +++ b/.github/workflows/ompi_nvidia.yaml @@ -1,6 +1,5 @@ name: ompi_NVIDIA CI -on: [pull_request, push] - +on: [pull_request] jobs: deployment: runs-on: [self-hosted, linux, x64, nvidia] From bc35052bb6b599b38e340d985fe932ebd6f27d23 Mon Sep 17 00:00:00 2001 From: "GERMAIN, FLORENT" Date: Tue, 29 Aug 2023 15:05:25 +0200 Subject: [PATCH 16/73] Fix: Assert that MPI_BYTE is 1 byte large Signed-off-by: Florent Germain --- ompi/datatype/ompi_datatype_internal.h | 38 +++----------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/ompi/datatype/ompi_datatype_internal.h b/ompi/datatype/ompi_datatype_internal.h index 21759be2733..31f3cf287c4 100644 --- a/ompi/datatype/ompi_datatype_internal.h +++ b/ompi/datatype/ompi_datatype_internal.h @@ -133,27 +133,14 @@ * of being redefined as independent types, they will be made 
synonyms to * the most basic type. */ -#if SIZEOF_CHAR == 1 +#if SIZEOF_CHAR != 1 +#error Open MPI asserts that sizeof(char) is 1. This seems not to be the case here. Please report on Open MPI github (see issue #11815) +#endif + #define OMPI_DATATYPE_MPI_CHAR OMPI_DATATYPE_MPI_INT8_T #define OMPI_DATATYPE_MPI_SIGNED_CHAR OMPI_DATATYPE_MPI_INT8_T #define OMPI_DATATYPE_MPI_UNSIGNED_CHAR OMPI_DATATYPE_MPI_UINT8_T #define OMPI_DATATYPE_MPI_BYTE OMPI_DATATYPE_MPI_UINT8_T -#elif SIZEOF_CHAR == 2 -#define OMPI_DATATYPE_MPI_CHAR OMPI_DATATYPE_MPI_INT16_T -#define OMPI_DATATYPE_MPI_SIGNED_CHAR OMPI_DATATYPE_MPI_INT16_T -#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR OMPI_DATATYPE_MPI_UINT16_T -#define OMPI_DATATYPE_MPI_BYTE OMPI_DATATYPE_MPI_UINT16_T -#elif SIZEOF_CHAR == 4 -#define OMPI_DATATYPE_MPI_CHAR OMPI_DATATYPE_MPI_INT32_T -#define OMPI_DATATYPE_MPI_SIGNED_CHAR OMPI_DATATYPE_MPI_INT32_T -#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR OMPI_DATATYPE_MPI_UINT32_T -#define OMPI_DATATYPE_MPI_BYTE OMPI_DATATYPE_MPI_UINT32_T -#elif SIZEOF_CHAR == 8 -#define OMPI_DATATYPE_MPI_CHAR OMPI_DATATYPE_MPI_INT64_T -#define OMPI_DATATYPE_MPI_SIGNED_CHAR OMPI_DATATYPE_MPI_INT64_T -#define OMPI_DATATYPE_MPI_UNSIGNED_CHAR OMPI_DATATYPE_MPI_UINT64_T -#define OMPI_DATATYPE_MPI_BYTE OMPI_DATATYPE_MPI_UINT64_T -#endif #if SIZEOF_SHORT == 1 #define OMPI_DATATYPE_MPI_SHORT OMPI_DATATYPE_MPI_INT8_T @@ -519,27 +506,10 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX #define OMPI_DATATYPE_INITIALIZER_INT64_T OPAL_DATATYPE_INITIALIZER_INT8 #define OMPI_DATATYPE_INITIALIZER_UINT64_T OPAL_DATATYPE_INITIALIZER_UINT8 -#if SIZEOF_CHAR == 1 #define OMPI_DATATYPE_INITIALIZER_CHAR OPAL_DATATYPE_INITIALIZER_INT1 #define OMPI_DATATYPE_INITIALIZER_UNSIGNED_CHAR OPAL_DATATYPE_INITIALIZER_UINT1 #define OMPI_DATATYPE_INITIALIZER_SIGNED_CHAR OPAL_DATATYPE_INITIALIZER_INT1 #define OMPI_DATATYPE_INITIALIZER_BYTE OPAL_DATATYPE_INITIALIZER_UINT1 -#elif SIZEOF_CHAR == 2 -#define 
OMPI_DATATYPE_INITIALIZER_CHAR OPAL_DATATYPE_INITIALIZER_INT2 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_CHAR OPAL_DATATYPE_INITIALIZER_UINT2 -#define OMPI_DATATYPE_INITIALIZER_SIGNED_CHAR OPAL_DATATYPE_INITIALIZER_INT2 -#define OMPI_DATATYPE_INITIALIZER_BYTE OPAL_DATATYPE_INITIALIZER_UINT2 -#elif SIZEOF_CHAR == 4 -#define OMPI_DATATYPE_INITIALIZER_CHAR OPAL_DATATYPE_INITIALIZER_INT4 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_CHAR OPAL_DATATYPE_INITIALIZER_UINT4 -#define OMPI_DATATYPE_INITIALIZER_SIGNED_CHAR OPAL_DATATYPE_INITIALIZER_INT4 -#define OMPI_DATATYPE_INITIALIZER_BYTE OPAL_DATATYPE_INITIALIZER_UINT4 -#elif SIZEOF_CHAR == 8 -#define OMPI_DATATYPE_INITIALIZER_CHAR OPAL_DATATYPE_INITIALIZER_INT8 -#define OMPI_DATATYPE_INITIALIZER_UNSIGNED_CHAR OPAL_DATATYPE_INITIALIZER_UINT8 -#define OMPI_DATATYPE_INITIALIZER_SIGNED_CHAR OPAL_DATATYPE_INITIALIZER_INT8 -#define OMPI_DATATYPE_INITIALIZER_BYTE OPAL_DATATYPE_INITIALIZER_UINT8 -#endif #if SIZEOF_SHORT == 2 #define OMPI_DATATYPE_INITIALIZER_SHORT OPAL_DATATYPE_INITIALIZER_INT2 From 234ad2a39c6ee11afd2fec8987af468427e8cdc1 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Fri, 1 Sep 2023 09:23:40 +0900 Subject: [PATCH 17/73] mpif-h: add missing declaration in session_get_nth_pset_f() Add a missing OMPI_SINGLE_NAME_DECL() to correctly support 8 bytes Fortran INTEGER Thanks Neil Mehta for the report. Ref. open-mpi/ompi#11887 Signed-off-by: Gilles Gouaillardet --- ompi/mpi/fortran/mpif-h/session_get_nth_pset_f.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ompi/mpi/fortran/mpif-h/session_get_nth_pset_f.c b/ompi/mpi/fortran/mpif-h/session_get_nth_pset_f.c index 9fb5718cfbd..c0baed9abb4 100644 --- a/ompi/mpi/fortran/mpif-h/session_get_nth_pset_f.c +++ b/ompi/mpi/fortran/mpif-h/session_get_nth_pset_f.c @@ -13,8 +13,8 @@ * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. 
- * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2015-2023 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. * * $COPYRIGHT$ @@ -78,6 +78,7 @@ void ompi_session_get_nth_pset_f(MPI_Fint *session, MPI_Fint *info, MPI_Fint *n, int c_ierr; MPI_Session c_session; char c_name[MPI_MAX_PSET_NAME_LEN]; + OMPI_SINGLE_NAME_DECL(pset_len); c_session = PMPI_Session_f2c(*session); From 63391af84cb6b18c4c573d1c19d142c8923452e6 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Sun, 3 Sep 2023 21:45:41 +0000 Subject: [PATCH 18/73] Revert "pml/cm: fix buffer usage in MCA_PML_CM_HVY_SEND_REQUEST_BSEND_ALLOC()" This reverts commit d71fe934c435a77480fabd4b21aeffce33bc770f. The revert fixes a bug revealed by mtt ibm test suite. The send buffer, instead of the attached user buffer, was used for MPI_Bsend. This violates the MPI_Bsend semantic and makes it unsafe to reuse the send buffer after the function returns. 
The revert is also needed for 4.x Signed-off-by: Wenduo Wang --- ompi/mca/pml/cm/pml_cm_sendreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/pml/cm/pml_cm_sendreq.h b/ompi/mca/pml/cm/pml_cm_sendreq.h index ad99474d74b..56ade0719f9 100644 --- a/ompi/mca/pml/cm/pml_cm_sendreq.h +++ b/ompi/mca/pml/cm/pml_cm_sendreq.h @@ -381,7 +381,7 @@ do { \ &max_data ); \ opal_convertor_prepare_for_send( &sendreq->req_send.req_base.req_convertor, \ &(ompi_mpi_packed.dt.super), \ - max_data, sendreq->req_addr ); \ + max_data, sendreq->req_buff ); \ } \ } \ } while(0); From 63e25b4aa920cc179df61e59ec90e6834f481b53 Mon Sep 17 00:00:00 2001 From: Andrii Bilokur Date: Tue, 5 Sep 2023 19:21:14 +0300 Subject: [PATCH 19/73] Prevent runs NVIDIA_CI in case triggered by PR in the fork Signed-off-by: Andrii Bilokur --- .github/workflows/ompi_nvidia.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ompi_nvidia.yaml b/.github/workflows/ompi_nvidia.yaml index c60a3257cd0..38fc1401d87 100644 --- a/.github/workflows/ompi_nvidia.yaml +++ b/.github/workflows/ompi_nvidia.yaml @@ -1,7 +1,9 @@ name: ompi_NVIDIA CI on: [pull_request] jobs: + deployment: + if: github.repository == 'open-mpi/ompi' runs-on: [self-hosted, linux, x64, nvidia] steps: - name: Checkout @@ -28,7 +30,11 @@ jobs: - name: Running tests run: /start test clean: - if: ${{ always() }} +# always() should be used to run "clean" even when the workflow was canceled +# ( in case of the right repository name) +# The second condition doesn't work when the workflow was canceled + + if: always() && (github.repository == 'open-mpi/ompi') needs: [deployment, build, test] runs-on: [self-hosted, linux, x64, nvidia] steps: From 2029df2a3f1281ac9b036e6d5aa9705ce97053d0 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Thu, 25 May 2023 09:22:02 -0600 Subject: [PATCH 20/73] accel/ze: initial pass at intel zero-level api support for Intel PV and other accelerators related 
to issue #11763 Signed-off-by: Howard Pritchard --- config/opal_check_ze.m4 | 71 ++ opal/mca/accelerator/ze/Makefile.am | 45 ++ opal/mca/accelerator/ze/accelerator_ze.h | 56 ++ .../accelerator/ze/accelerator_ze_component.c | 405 ++++++++++ .../accelerator/ze/accelerator_ze_module.c | 697 ++++++++++++++++++ opal/mca/accelerator/ze/configure.m4 | 28 + opal/runtime/opal_params_core.c | 3 + opal/runtime/opal_params_core.h | 7 + 8 files changed, 1312 insertions(+) create mode 100644 config/opal_check_ze.m4 create mode 100644 opal/mca/accelerator/ze/Makefile.am create mode 100644 opal/mca/accelerator/ze/accelerator_ze.h create mode 100644 opal/mca/accelerator/ze/accelerator_ze_component.c create mode 100644 opal/mca/accelerator/ze/accelerator_ze_module.c create mode 100644 opal/mca/accelerator/ze/configure.m4 diff --git a/config/opal_check_ze.m4 b/config/opal_check_ze.m4 new file mode 100644 index 00000000000..d1d47bb67c1 --- /dev/null +++ b/config/opal_check_ze.m4 @@ -0,0 +1,71 @@ +dnl +dnl Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +dnl Copyright (c) 2023 Triad National Security, LLC. All rights reserved. +dnl $COPYRIGHT$ +dnl +dnl Additional copyrights may follow +dnl +dnl $HEADER$ +dnl + + +# OPAL_CHECK_ZE(prefix, [action-if-found], [action-if-not-found]) +# -------------------------------------------------------- +# check if Intel ZE support can be found. 
sets prefix_{CPPFLAGS, +# LDFLAGS, LIBS} as needed and runs action-if-found if there is +# support, otherwise executes action-if-not-found + + +# +# Check for ZE support +# +AC_DEFUN([OPAL_CHECK_ZE],[ + + OPAL_VAR_SCOPE_PUSH([opal_check_ze_happy ze_save_CPPFLAGS ze_save_LDFLAGS ze_save_LIBS ze_CPPFLAGS ze_LDFLAGS ze_LIBS]) + + ze_save_CPPFLAGS="$CPPFLAGS" + ze_save_LDFLAGS="$LDFLAGS" + ze_save_LIBS="$LIBS" + + # Get some configuration information + AC_ARG_WITH([ze], + [AS_HELP_STRING([--with-ze(=DIR)], + [Build Intel ZE support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])]) + + + AS_IF([ test -n "$with_ze" && test "$with_ze" = "yes" ], + [ with_ze="/opt/ze"] ) + + m4_define([ze_pkgconfig_module], [level-zero]) + OAC_CHECK_PACKAGE([ze], + [$1], + [level_zero/ze_api.h], + [ze_loader], + [zeInit], + [opal_check_ze_happy="yes"], + [opal_check_ze_happy="no"]) + + LDFLAGS="$ze_save_LDFLAGS" + LIBS="$ze_save_LIBS" + OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] ) + OPAL_APPEND([LDFLAGS], [${$1_LDFLAGS}] ) + OPAL_APPEND([LIBS], [${$1_LIBS}] ) + + AS_IF([ test "$opal_check_ze_happy" = "no" ], + [ CPPFLAGS="$ze_save_CPPFLAGS"]) + + AS_IF([ test "$opal_check_ze_happy" = "yes" ], + [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [1], [Enable Intel ZE support]) + ZE_SUPPORT=1 ], + [ AC_DEFINE_UNQUOTED([OPAL_ZE_SUPPORT], [0], [Disable Intel ZE support]) + ZE_SUPPORT=0 ]) + + AS_IF([ test "$opal_check_ze_happy" = "yes" ], + [$2], + [AS_IF([test -n "$with_ze" && test "$with_ze" != "no"], + [AC_MSG_ERROR([Intel ZE support requested but not found. Aborting])]) + $3]) + + AM_CONDITIONAL([OPAL_ze_support], [test "$opal_check_ze_happy" = "yes"]) + OPAL_VAR_SCOPE_POP +]) diff --git a/opal/mca/accelerator/ze/Makefile.am b/opal/mca/accelerator/ze/Makefile.am new file mode 100644 index 00000000000..0d3c40c4b12 --- /dev/null +++ b/opal/mca/accelerator/ze/Makefile.am @@ -0,0 +1,45 @@ +# +# Copyright (c) 2014 Intel, Inc. All rights reserved. 
+# Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. +# Copyright (c) 2022 Advanced Micro Devices, Inc. +# All Rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + + +AM_CPPFLAGS = $(common_ze_CPPFLAGS) + +sources = \ + accelerator_ze.h \ + accelerator_ze_component.c \ + accelerator_ze_module.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds). + +if MCA_BUILD_opal_accelerator_ze_DSO +component_noinst = +component_install = mca_accelerator_ze.la +else +component_noinst = libmca_accelerator_ze.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_accelerator_ze_la_SOURCES = $(sources) +mca_accelerator_ze_la_LDFLAGS = -module -avoid-version $(opal_ze_LDFLAGS) +mca_accelerator_ze_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(opal_ze_LIBS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_accelerator_ze_la_SOURCES =$(sources) +libmca_accelerator_ze_la_LDFLAGS = -module -avoid-version $(opal_ze_LDFLAGS) +libmca_accelerator_ze_la_LIBADD = $(opal_ze_LIBS) diff --git a/opal/mca/accelerator/ze/accelerator_ze.h b/opal/mca/accelerator/ze/accelerator_ze.h new file mode 100644 index 00000000000..9bf7a0b4b07 --- /dev/null +++ b/opal/mca/accelerator/ze/accelerator_ze.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights reserved.
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_ACCELERATOR_ZE_H +#define OPAL_ACCELERATOR_ZE_H + +#include "opal_config.h" + +#include "level_zero/ze_api.h" + +#include "opal/mca/accelerator/accelerator.h" +#include "opal/mca/threads/mutex.h" + +typedef struct { + opal_accelerator_base_component_t super; +} opal_accelerator_ze_component_t; + +OPAL_DECLSPEC extern opal_accelerator_ze_component_t mca_accelerator_ze_component; +OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_ze_module; + +struct opal_accelerator_ze_stream_t { + opal_accelerator_stream_t base; + ze_command_queue_handle_t hCommandQueue; + ze_command_list_handle_t hCommandList; + int dev_id; +}; +typedef struct opal_accelerator_ze_stream_t opal_accelerator_ze_stream_t; +OBJ_CLASS_DECLARATION(opal_accelerator_ze_stream_t); + +struct opal_accelerator_ze_event_t { + opal_accelerator_event_t base; +}; +typedef struct opal_accelerator_ze_event_t opal_accelerator_ze_event_t; +OBJ_CLASS_DECLARATION(opal_accelerator_ze_event_t); + +OPAL_DECLSPEC extern uint32_t opal_accelerator_ze_device_count; +OPAL_DECLSPEC extern ze_device_handle_t *opal_accelerator_ze_devices_handle; +OPAL_DECLSPEC extern ze_driver_handle_t opal_accelerator_ze_driver_handle; +OPAL_DECLSPEC extern ze_context_handle_t opal_accelerator_ze_context; +OPAL_DECLSPEC extern ze_event_pool_handle_t opal_accelerator_ze_event_pool; +OPAL_DECLSPEC extern opal_accelerator_stream_t **opal_accelerator_ze_MemcpyStream; + +OPAL_DECLSPEC extern int opal_accelerator_ze_memcpy_async; +OPAL_DECLSPEC extern int opal_accelerator_ze_verbose; + +OPAL_DECLSPEC extern int opal_accelerator_ze_lazy_init(void); + +#endif diff --git a/opal/mca/accelerator/ze/accelerator_ze_component.c b/opal/mca/accelerator/ze/accelerator_ze_component.c new file mode 100644 index 00000000000..d34e2632a71 --- /dev/null +++ b/opal/mca/accelerator/ze/accelerator_ze_component.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2014 
Intel, Inc. All rights reserved. + * Copyright (c) 2014 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include + +#include "opal/mca/dl/base/base.h" +#include "opal/runtime/opal_params.h" +#include "accelerator_ze.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal_config.h" +#include "opal/util/argv.h" +#include "opal/util/printf.h" +#include "opal/util/output.h" + + +int opal_accelerator_ze_memcpy_async = 1; +int opal_accelerator_ze_verbose = 0; +uint32_t opal_accelerator_ze_device_count = 0; +ze_device_handle_t *opal_accelerator_ze_devices_handle = NULL; +ze_driver_handle_t opal_accelerator_ze_driver_handle = NULL; +ze_context_handle_t opal_accelerator_ze_context = NULL; +ze_event_pool_handle_t opal_accelerator_ze_event_pool = NULL; +opal_accelerator_stream_t **opal_accelerator_ze_MemcpyStream = NULL; + +/* Initialization lock for lazy ze initialization */ +static opal_mutex_t accelerator_ze_init_lock; +static bool accelerator_ze_init_complete = false; + +/* + * Public string showing the accelerator ze component version number + */ +const char *opal_accelerator_ze_component_version_string + = "OPAL ze accelerator MCA component version " OPAL_VERSION; + +/* + * Local function + */ +static int accelerator_ze_open(void); +static int accelerator_ze_close(void); +static int accelerator_ze_component_register(void); +static opal_accelerator_base_module_t* accelerator_ze_init(void); +static void accelerator_ze_finalize(opal_accelerator_base_module_t* module); + +/* + * Instantiate 
the public struct with all of our public information + * and pointers to our public functions in it + */ + +opal_accelerator_ze_component_t mca_accelerator_ze_component = {{ + + /* First, the mca_component_t struct containing meta information + * about the component itself */ + + .base_version = + { + /* Indicate that we are an accelerator v1.0.0 component (which also + * implies a specific MCA version) */ + + OPAL_ACCELERATOR_BASE_VERSION_1_0_0, + + /* Component name and version */ + + .mca_component_name = "ze", + MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION), + + /* Component open and close functions */ + + .mca_open_component = accelerator_ze_open, + .mca_close_component = accelerator_ze_close, + .mca_register_component_params = accelerator_ze_component_register, + + }, + /* Next the MCA v1.0.0 component meta data */ + .base_data = + { /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT}, + .accelerator_init = accelerator_ze_init, + .accelerator_finalize = accelerator_ze_finalize, +}}; + +static int accelerator_ze_open(void) +{ + /* construct the component fields */ + + return OPAL_SUCCESS; +} + +static int accelerator_ze_close(void) +{ + return OPAL_SUCCESS; +} + +static int accelerator_ze_component_register(void) +{ + /* Set verbosity in the ze related code.
*/ + opal_accelerator_ze_verbose = 0; + (void) mca_base_var_register("ompi", "mpi", "accelerator_ze", "verbose", + "Set level of ze verbosity", MCA_BASE_VAR_TYPE_INT, NULL, + 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &opal_accelerator_ze_verbose); + + return OPAL_SUCCESS; +} + +/* + * If this method is invoked it means we already + * initialized ZE in the accelerator_ze_init method below + */ + +int opal_accelerator_ze_lazy_init(void) +{ + uint32_t i,d; + int err = OPAL_SUCCESS; + ze_result_t zret; + uint32_t driver_count = 0; + ze_driver_handle_t *all_drivers = NULL; + + /* Double checked locking to avoid having to + * grab locks post lazy-initialization. */ + + opal_atomic_rmb(); + if (true == accelerator_ze_init_complete) { + return OPAL_SUCCESS; + } + OPAL_THREAD_LOCK(&accelerator_ze_init_lock); + + /* If already initialized, just exit */ + if (true == accelerator_ze_init_complete) { + goto fn_fail; + } + + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: starting lazy init"); + + zret = zeDriverGet(&driver_count, NULL); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeDriverGet returned %d\n", zret); + err = OPAL_ERR_NOT_INITIALIZED; + goto fn_fail; + } + + /* + * driver count should not be zero as to get here ZE component + * was successfully init'd. + */ + if (0 == driver_count) { + err = OPAL_ERR_NOT_FOUND; + } + + all_drivers = (ze_driver_handle_t *)malloc(driver_count * sizeof(ze_driver_handle_t)); + if (all_drivers == NULL) { + err = OPAL_ERR_OUT_OF_RESOURCE; + goto fn_fail; + } + + zret = zeDriverGet(&driver_count, all_drivers); + if (ZE_RESULT_SUCCESS != zret) { + err = OPAL_ERR_NOT_FOUND; + goto fn_fail; + } + + /* + * Current design of ZE component assumes we find the first driver with a GPU device. + * we'll create a single ZE context if we do find such a device. 
This may need to + * be revisited at some point but would impact areas of code outside of the + * accelerator framework. + */ + + for (i = 0; i < driver_count; ++i) { + opal_accelerator_ze_device_count = 0; + zret = zeDeviceGet(all_drivers[i], &opal_accelerator_ze_device_count, NULL); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeDeviceGet returned %d\n", zret); + err = OPAL_ERROR; + goto fn_fail; + } + opal_accelerator_ze_devices_handle = + malloc(opal_accelerator_ze_device_count * sizeof(ze_device_handle_t)); + if (NULL == opal_accelerator_ze_devices_handle) { + err = OPAL_ERR_OUT_OF_RESOURCE; + goto fn_fail; + } + zret = zeDeviceGet(all_drivers[i], &opal_accelerator_ze_device_count, opal_accelerator_ze_devices_handle); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeDeviceGet returned %d\n", zret); + err = OPAL_ERROR; + goto fn_fail; + } + /* Check if the driver supports a gpu */ + for (d = 0; d < opal_accelerator_ze_device_count; ++d) { + ze_device_properties_t device_properties; + zret = zeDeviceGetProperties(opal_accelerator_ze_devices_handle[d], &device_properties); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeDeviceGetProperties returned %d\n", zret); + err = OPAL_ERROR; + goto fn_fail; + } + + if (ZE_DEVICE_TYPE_GPU == device_properties.type) { + opal_accelerator_ze_driver_handle = all_drivers[i]; + break; + } + } + + if (NULL != opal_accelerator_ze_driver_handle) { + break; + } else { + free(opal_accelerator_ze_devices_handle); + opal_accelerator_ze_devices_handle = NULL; + } + } + + ze_context_desc_t contextDesc = { + .stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC, + .pNext = NULL, + .flags = 0, + }; + zret = zeContextCreate(opal_accelerator_ze_driver_handle, + &contextDesc, + &opal_accelerator_ze_context); + if (ZE_RESULT_SUCCESS != zret) { + 
opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeContextCreate returned %d\n", zret); + err = OPAL_ERROR; + goto fn_fail; + } + + /* + * allocate synchronous memcpy stream handles, but delay creating streams till needed + */ + + opal_accelerator_ze_MemcpyStream = (opal_accelerator_stream_t **)calloc((size_t)opal_accelerator_ze_device_count, + sizeof(opal_accelerator_stream_t *)); + if (NULL == opal_accelerator_ze_MemcpyStream) { + err = OPAL_ERR_OUT_OF_RESOURCE; + goto fn_fail; + } + + /* + * set up an event pool + */ + + ze_event_pool_desc_t eventPoolDesc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .pNext = NULL, + .flags = 0, + .count = 1000, /* TODO: fix this! */ + }; + + /* + * create an event pool that can be used by all devices associated with this ze context + */ + zret = zeEventPoolCreate(opal_accelerator_ze_context, + &eventPoolDesc, + 0, + NULL, + &opal_accelerator_ze_event_pool); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: zeEventPoolCreate returned %d\n", zret); + err = OPAL_ERROR; + goto fn_fail; + } + + opal_atomic_wmb(); + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "ZE: found %d devices", opal_accelerator_ze_device_count); + accelerator_ze_init_complete = true; + + return OPAL_SUCCESS; + +fn_fail: + if (NULL != opal_accelerator_ze_MemcpyStream) { + free(opal_accelerator_ze_MemcpyStream); + opal_accelerator_ze_MemcpyStream = NULL; + } + + if (NULL != all_drivers) { + free(all_drivers); + } + + if (OPAL_SUCCESS != err) { + free(opal_accelerator_ze_devices_handle); + opal_accelerator_ze_devices_handle = NULL; + } + + OPAL_THREAD_UNLOCK(&accelerator_ze_init_lock); + return err; +} + +static opal_accelerator_base_module_t* accelerator_ze_init(void) +{ + uint32_t driver_count=0; + ze_result_t zret; + ze_init_flag_t flags = ZE_INIT_FLAG_GPU_ONLY; + + OBJ_CONSTRUCT(&accelerator_ze_init_lock, opal_mutex_t); + + 
if (opal_ze_runtime_initialized) {
+        /* In the code visible here the flag is only set at the end of a
+         * successful pass through this function, so taking this branch means
+         * init already ran; the module is not handed out a second time. */
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "ZE: runtime already initialized");
+        return NULL;
+    }
+
+    /*
+     * Initialize ze, this function can be called multiple times
+     */
+
+    zret = zeInit(flags);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "ZE: zeInit returned %d flags = %d\n", zret, flags);
+        return NULL;
+    }
+
+    /*
+     * zeDriverGet can return:
+     * ZE_RESULT_SUCCESS
+     * ZE_RESULT_ERROR_UNINITIALIZED
+     * ZE_RESULT_ERROR_DEVICE_LOST
+     * ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+     * ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+     */
+    zret = zeDriverGet(&driver_count, NULL);
+    if (ZE_RESULT_SUCCESS != zret || 0 == driver_count) {
+        if (ZE_RESULT_SUCCESS != zret) {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "ZE: zeDriverGet returned %d\n", zret);
+        } else {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "ZE: no device drivers found\n");
+        }
+        return NULL;
+    } else {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "ZE: %d device drivers found\n", driver_count);
+    }
+
+    /* Publish the flag only after the driver query succeeded. */
+    opal_atomic_mb();
+    opal_ze_runtime_initialized = true;
+
+    return &opal_accelerator_ze_module;
+}
+
+/*
+ * Tear down what lazy init created: the event pool, the per-device memcpy
+ * streams, and the ZE context, then destroy the init lock.
+ */
+static void accelerator_ze_finalize(opal_accelerator_base_module_t* module)
+{
+    ze_result_t zret;
+
+    opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                        "ZE: finalizing component\n");
+
+    if (NULL != opal_accelerator_ze_event_pool) {
+        zret = zeEventPoolDestroy(opal_accelerator_ze_event_pool);
+        if (ZE_RESULT_SUCCESS != zret) {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "zeEventPoolDestroy returned %d", zret);
+        }
+        opal_accelerator_ze_event_pool = NULL;
+    }
+
+
+    if (NULL != opal_accelerator_ze_MemcpyStream) {
+        for (uint32_t i = 0; i < opal_accelerator_ze_device_count; i++) {
+            if (NULL != opal_accelerator_ze_MemcpyStream[i]) { +
OBJ_RELEASE(opal_accelerator_ze_MemcpyStream[i]); + } + } + free(opal_accelerator_ze_MemcpyStream); + opal_accelerator_ze_MemcpyStream = NULL; + } + + if (NULL != (void *)opal_accelerator_ze_context) { + zret = zeContextDestroy(opal_accelerator_ze_context); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeContextDestroy returned %d", zret); + } + opal_accelerator_ze_context = NULL; + } + + opal_accelerator_ze_device_count = 0; + + OBJ_DESTRUCT(&accelerator_ze_init_lock); + return; +} diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c new file mode 100644 index 00000000000..5696359104e --- /dev/null +++ b/opal/mca/accelerator/ze/accelerator_ze_module.c @@ -0,0 +1,697 @@ +/* + * Copyright (c) 2022 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include + +#include "opal_config.h" + +#include "accelerator_ze.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/util/argv.h" +#include "opal/util/printf.h" +#include "opal/constants.h" +#include "opal/util/output.h" + +/* Accelerator API's */ +static int mca_accelerator_ze_check_addr(const void *addr, int *dev_id, uint64_t *flags); +static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_t **stream); + +static int mca_accelerator_ze_create_event(int dev_id, opal_accelerator_event_t **event); +static int mca_accelerator_ze_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream); +static int mca_accelerator_ze_query_event(int dev_id, opal_accelerator_event_t *event); + +static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); +static int 
mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_transfer_type_t type); +static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size); +static int mca_accelerator_ze_mem_release(int dev_id, void *ptr); +static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, + size_t *size); + +static int mca_accelerator_ze_host_register(int dev_id, void *ptr, size_t size); +static int mca_accelerator_ze_host_unregister(int dev_id, void *ptr); + +static int mca_accelerator_ze_get_device(int *dev_id); +static int mca_accelerator_ze_device_can_access_peer( int *access, int dev1, int dev2); + +static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr); + +static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); + +opal_accelerator_base_module_t opal_accelerator_ze_module = +{ + .check_addr = mca_accelerator_ze_check_addr, + + .create_stream = mca_accelerator_ze_create_stream, + .create_event = mca_accelerator_ze_create_event, + .record_event = mca_accelerator_ze_record_event, + .query_event = mca_accelerator_ze_query_event, + + .mem_copy_async = mca_accelerator_ze_memcpy_async, + .mem_copy = mca_accelerator_ze_memcpy, + .mem_move = mca_accelerator_ze_memmove, + + .mem_alloc = mca_accelerator_ze_mem_alloc, + .mem_release = mca_accelerator_ze_mem_release, + .get_address_range = mca_accelerator_ze_get_address_range, + + .host_register = mca_accelerator_ze_host_register, + .host_unregister = mca_accelerator_ze_host_unregister, + + .get_device= mca_accelerator_ze_get_device, + .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr, + .device_can_access_peer = mca_accelerator_ze_device_can_access_peer, + + 
.get_buffer_id = mca_accelerator_ze_get_buffer_id
+};
+
+/*
+ * Map a ZE device handle to its index in opal_accelerator_ze_devices_handle.
+ * Returns MCA_ACCELERATOR_NO_DEVICE_ID when the handle is not in the table.
+ */
+static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice)
+{
+    int i, ret = MCA_ACCELERATOR_NO_DEVICE_ID;
+
+    for (i = 0; i < (int)opal_accelerator_ze_device_count; i++) {
+        if (opal_accelerator_ze_devices_handle[i] == hDevice) {
+            ret = i;
+            break;
+        }
+    }
+
+    return ret;
+}
+
+/*
+ * Classify addr with zeMemGetAllocProperties.
+ *
+ * Returns 1 and stores the owning device index in *dev_id when the
+ * allocation type is DEVICE or SHARED.  For HOST/UNKNOWN memory, an
+ * unrecognized type, or a failed query it returns the (successful) lazy-init
+ * status with *dev_id left at MCA_ACCELERATOR_NO_DEVICE_ID.  *flags is reset
+ * to 0 whenever both output pointers are valid.
+ * Returns OPAL_ERR_BAD_PARAM on a NULL argument, or the lazy-init error.
+ */
+static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_t *flags)
+{
+    ze_result_t zret;
+    int ret = 0;
+    ze_memory_allocation_properties_t attr;
+    ze_device_handle_t hDevice;
+
+    /* Validate the output pointers before writing through them. */
+    if (NULL == dev_id || NULL == flags) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    *dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+    *flags = 0;
+
+    if (NULL == addr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) {
+        return ret;
+    }
+
+    memset(&attr, 0, sizeof(ze_memory_allocation_properties_t));
+
+    zret = zeMemGetAllocProperties(opal_accelerator_ze_context,
+                                   addr,
+                                   &attr,
+                                   &hDevice);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeMemGetAllocProperties returned %d", zret);
+        /* ret still holds the lazy-init status here, so a failed query is
+         * reported the same way as host memory. */
+        goto fn_fail;
+    }
+    switch (attr.type) {
+        case ZE_MEMORY_TYPE_UNKNOWN:
+        case ZE_MEMORY_TYPE_HOST:
+            break;
+        case ZE_MEMORY_TYPE_DEVICE:
+        case ZE_MEMORY_TYPE_SHARED:
+            ret = 1;
+            *dev_id = accelerator_ze_dev_handle_to_dev_id(hDevice);
+            break;
+        default:
+            goto fn_fail;
+    }
+
+fn_fail:
+
+    return ret;
+}
+
+static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_t **stream)
+{
+    int ret;
+    ze_result_t zret;
+    ze_device_handle_t hDevice;
+    opal_accelerator_ze_stream_t *ze_stream;
+
+    ze_command_queue_desc_t cmdQueueDesc = {
+        .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        .pNext = NULL,
+        .index = 0,
+        .flags = 0,
+        .ordinal = 0,
+        .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+    };
+
+    if (NULL == stream) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) { +
return ret;
+    }
+
+    /* Resolve and validate the target device before allocating anything, so
+     * a bad index cannot read past the device handle table. */
+    if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) {
+        hDevice = opal_accelerator_ze_devices_handle[0];
+    } else if (dev_id < 0 || dev_id >= (int)opal_accelerator_ze_device_count) {
+        return OPAL_ERR_BAD_PARAM;
+    } else {
+        hDevice = opal_accelerator_ze_devices_handle[dev_id];
+    }
+
+    *stream = (opal_accelerator_stream_t*)OBJ_NEW(opal_accelerator_ze_stream_t);
+    if (NULL == *stream) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+    /* The destructor tears down whatever ->stream points at.  Keep it NULL
+     * until the payload is fully built so OBJ_RELEASE on an error path does
+     * not touch an unset or half-constructed payload. */
+    (*stream)->stream = NULL;
+
+    ze_stream = (opal_accelerator_ze_stream_t *)malloc(sizeof(opal_accelerator_ze_stream_t));
+    if (NULL == ze_stream) {
+        OBJ_RELEASE(*stream);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+    ze_stream->dev_id = dev_id;
+
+    zret = zeCommandQueueCreate(opal_accelerator_ze_context,
+                                hDevice,
+                                &cmdQueueDesc,
+                                &ze_stream->hCommandQueue);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandQueueCreate returned %d", zret);
+        free(ze_stream);
+        OBJ_RELEASE(*stream);
+        return OPAL_ERROR;
+    }
+
+    /*
+     * create a command list
+     */
+
+    ze_command_list_desc_t commandListDesc = {
+        .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC,
+        .pNext = NULL,
+        .commandQueueGroupOrdinal = 0,
+        .flags = 0,
+    };
+
+    /* Create the list on the same device as the queue it is submitted to. */
+    zret = zeCommandListCreate(opal_accelerator_ze_context,
+                               hDevice,
+                               &commandListDesc,
+                               &ze_stream->hCommandList);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandListCreate returned %d", zret);
+        (void)zeCommandQueueDestroy(ze_stream->hCommandQueue);
+        free(ze_stream);
+        OBJ_RELEASE(*stream);
+        return OPAL_ERROR;
+    }
+    (*stream)->stream = (void *)ze_stream;
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Destructor for opal_accelerator_ze_stream_t: destroys the command list and
+ * command queue held by the payload stored in base.stream, then frees the
+ * payload.  Does nothing when no payload was attached.
+ */
+static void mca_accelerator_ze_stream_destruct(opal_accelerator_ze_stream_t *stream)
+{
+    ze_result_t zret;
+    opal_accelerator_ze_stream_t *ze_stream;
+
+    if (NULL != stream->base.stream) {
+        ze_stream = (opal_accelerator_ze_stream_t *)stream->base.stream;
+        /* The command list is created alongside the queue in create_stream
+         * and must be released with it. */
+        zret = zeCommandListDestroy(ze_stream->hCommandList);
+        if (ZE_RESULT_SUCCESS != zret) {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "zeCommandListDestroy returned %d", zret);
+        }
+        zret = zeCommandQueueDestroy(ze_stream->hCommandQueue);
+        if (ZE_RESULT_SUCCESS != zret) {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "error while destroying the zeCommandQueue");
+        }
+
+        free(stream->base.stream);
+    }
+}
+
+OBJ_CLASS_INSTANCE( +
opal_accelerator_ze_stream_t,
+    opal_accelerator_stream_t,
+    NULL,
+    mca_accelerator_ze_stream_destruct);
+
+/*
+ * Allocate an event object and create a host-scoped ZE event for it from the
+ * shared event pool.
+ *
+ * On success *event owns a heap-allocated ze_event_handle_t in ->event.
+ * Returns OPAL_ERR_BAD_PARAM for a NULL event pointer, the lazy-init error,
+ * OPAL_ERR_OUT_OF_RESOURCE on allocation failure, or OPAL_ERROR when
+ * zeEventCreate fails.
+ */
+static int mca_accelerator_ze_create_event(int dev_id, opal_accelerator_event_t **event)
+{
+    int ret;
+    ze_result_t zret;
+
+    /* NOTE(review): every event is created at pool index 0 -- confirm that
+     * sharing one index across simultaneously live events is intended. */
+    ze_event_desc_t eventDesc = {
+        .stype = ZE_STRUCTURE_TYPE_EVENT_DESC,
+        .pNext = NULL,
+        .index = 0,
+        .signal = ZE_EVENT_SCOPE_FLAG_HOST,
+        .wait = ZE_EVENT_SCOPE_FLAG_HOST,
+    };
+
+    if (NULL == event) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    /* The event pool used below is created during lazy initialization. */
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) {
+        return ret;
+    }
+
+    *event = (opal_accelerator_event_t*)OBJ_NEW(opal_accelerator_ze_event_t);
+    if (NULL == *event) {
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    (*event)->event = malloc(sizeof(ze_event_handle_t));
+    if (NULL == (*event)->event) {
+        OBJ_RELEASE(*event);
+        return OPAL_ERR_OUT_OF_RESOURCE;
+    }
+
+    zret = zeEventCreate(opal_accelerator_ze_event_pool, &eventDesc, (ze_event_handle_t *)(*event)->event);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error creating event %d", zret);
+        free((*event)->event);
+        /* Clear the pointer so the destructor run by OBJ_RELEASE does not
+         * call zeEventDestroy on, or free again, the buffer just freed. */
+        (*event)->event = NULL;
+        OBJ_RELEASE(*event);
+        return OPAL_ERROR;
+    }
+
+    return OPAL_SUCCESS;
+
+}
+
+/*
+ * Destructor for opal_accelerator_ze_event_t: destroys the ZE event whose
+ * handle is stored in base.event and frees the handle storage.  Does nothing
+ * when base.event is NULL.
+ */
+static void mca_accelerator_ze_event_destruct(opal_accelerator_ze_event_t *event)
+{
+    ze_result_t zret;
+
+    if (NULL != event->base.event) {
+        zret = zeEventDestroy(*(ze_event_handle_t *)event->base.event);
+        if (ZE_RESULT_SUCCESS != zret) {
+            opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                                "error destroying event %d", zret);
+        }
+        free(event->base.event);
+    }
+}
+
+OBJ_CLASS_INSTANCE(
+    opal_accelerator_ze_event_t,
+    opal_accelerator_event_t,
+    NULL,
+    mca_accelerator_ze_event_destruct);
+
+static int mca_accelerator_ze_record_event(int dev_id, opal_accelerator_event_t *event,
+                                           opal_accelerator_stream_t *stream)
+{
+    ze_result_t zret = ZE_RESULT_SUCCESS;
+    opal_accelerator_ze_stream_t *ze_stream;
+
+    if (NULL == event || NULL == event->event){
+        return OPAL_ERR_BAD_PARAM;
+    }
+    if (NULL == stream || NULL == stream->stream){
+        return
OPAL_ERR_BAD_PARAM; + } + + ze_stream = (opal_accelerator_ze_stream_t *)stream->stream; + + zret = zeCommandListAppendSignalEvent(ze_stream->hCommandList, + *(ze_event_handle_t *)event->event); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeCommandListAppendSignalEvent returned %d", zret); + return OPAL_ERROR; + } + + /* + * okay now close the command list and submit + */ + + zret = zeCommandListClose(ze_stream->hCommandList); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeCommandListClose returned %d", zret); + return OPAL_ERROR; + } + + zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue, + 1, + &ze_stream->hCommandList, + NULL); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeCommandQueueExecuteCommandList returned %d", zret); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + +static int mca_accelerator_ze_query_event(int dev_id, opal_accelerator_event_t *event) +{ + ze_result_t zret; + + if (NULL == event || NULL == event->event) { + return OPAL_ERR_BAD_PARAM; + } + + zret = zeEventQueryStatus(*((ze_event_handle_t *)event->event)); + switch (zret) { + case ZE_RESULT_SUCCESS: + return OPAL_SUCCESS; + break; + case ZE_RESULT_NOT_READY: + return OPAL_ERR_RESOURCE_BUSY; + break; + default: + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeEventQueryStatus returned %d", zret); + return OPAL_ERROR; + } + + return OPAL_SUCCESS; +} + +static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + ze_result_t zret; + opal_accelerator_ze_stream_t *ze_stream = NULL; + + if (NULL == stream || NULL == src || + NULL == dest || size <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + ze_stream = 
(opal_accelerator_ze_stream_t *)stream->stream;
+    assert(NULL != ze_stream);
+
+    zret = zeCommandListAppendMemoryCopy(ze_stream->hCommandList,
+                                         dest,
+                                         src,
+                                         size,
+                                         NULL,
+                                         0,
+                                         NULL);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandListAppendMemoryCopy returned %d", zret);
+        return OPAL_ERROR;
+    }
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Synchronous copy of size bytes from src to dest.
+ *
+ * Uses a per-device stream from opal_accelerator_ze_MemcpyStream, created on
+ * first use and selected by src_dev_id (device 0 when src_dev_id is
+ * MCA_ACCELERATOR_NO_DEVICE_ID).  The copy is appended to the stream's
+ * command list, executed, waited for, and the list is reset for reuse.
+ * Returns OPAL_ERR_BAD_PARAM for NULL pointers, a zero size, or a device
+ * index outside the device table; OPAL_ERROR when a ZE call fails.
+ */
+static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest,
+                                     const void *src, size_t size,
+                                     opal_accelerator_transfer_type_t type)
+{
+    int ret, dev_id;
+    ze_result_t zret;
+
+    opal_accelerator_ze_stream_t *ze_stream = NULL;
+
+    if (NULL == src || NULL == dest || 0 == size) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    /* The memcpy stream table is allocated during lazy initialization, so it
+     * must have run before the table is indexed. */
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) {
+        return ret;
+    }
+
+    if (MCA_ACCELERATOR_NO_DEVICE_ID == src_dev_id) {
+        dev_id = 0;
+    } else {
+        dev_id = src_dev_id;
+    }
+
+    /* The table has one slot per discovered device. */
+    if (dev_id < 0 || dev_id >= (int)opal_accelerator_ze_device_count) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    if (NULL == opal_accelerator_ze_MemcpyStream[dev_id]) {
+        ret = mca_accelerator_ze_create_stream(dev_id,
+                                               (opal_accelerator_stream_t **)&opal_accelerator_ze_MemcpyStream[dev_id]);
+        if (OPAL_SUCCESS != ret) {
+            return ret;
+        }
+    }
+
+    ze_stream = (opal_accelerator_ze_stream_t *)opal_accelerator_ze_MemcpyStream[dev_id]->stream;
+    zret = zeCommandListAppendMemoryCopy(ze_stream->hCommandList,
+                                         dest,
+                                         src,
+                                         size,
+                                         NULL,
+                                         0,
+                                         NULL);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandListAppendMemoryCopy returned %d", zret);
+        return OPAL_ERROR;
+    }
+
+    zret = zeCommandListClose(ze_stream->hCommandList);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandListClose returned %d", zret);
+        return OPAL_ERROR;
+    }
+
+    zret = zeCommandQueueExecuteCommandLists(ze_stream->hCommandQueue,
+                                             1,
+                                             &ze_stream->hCommandList,
+                                             NULL);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeCommandQueueExecuteCommandList returned %d", zret); +
return OPAL_ERROR; + } + + zret = zeCommandQueueSynchronize(ze_stream->hCommandQueue, + UINT32_MAX); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeCommandQueueSynchronize returned %d", zret); + return OPAL_ERROR; + } + + zret = zeCommandListReset(ze_stream->hCommandList); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeCommandListReset returned %d", zret); + return OPAL_ERROR; + } + + + return OPAL_SUCCESS; +} + +static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_transfer_type_t type) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) +{ + ze_result_t zret; + size_t mem_alignment; + ze_device_handle_t hDevice; + + ze_device_mem_alloc_desc_t device_desc = { + .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + .pNext = NULL, + .flags = 0, + .ordinal = 0, /* We currently support a single memory type */ + }; + + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { + hDevice = opal_accelerator_ze_devices_handle[0]; + } else { + hDevice = opal_accelerator_ze_devices_handle[dev_id]; + } + + /* Currently ZE ignores this argument and uses an internal alignment + * value. However, this behavior can change in the future. 
*/
+    mem_alignment = 1;
+    zret = zeMemAllocDevice(opal_accelerator_ze_context,
+                            &device_desc,
+                            size,
+                            mem_alignment,
+                            hDevice,
+                            ptr);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeMemAllocDevice returned %d", zret);
+        goto fn_fail;
+    }
+
+    return OPAL_SUCCESS;
+ fn_fail:
+    return OPAL_ERROR;
+}
+
+/*
+ * Free memory with zeMemFree on the component's ZE context.  dev_id is not
+ * used.  Returns OPAL_SUCCESS, or OPAL_ERROR when zeMemFree fails.
+ */
+static int mca_accelerator_ze_mem_release(int dev_id, void *ptr)
+{
+    ze_result_t zret;
+
+    zret = zeMemFree(opal_accelerator_ze_context, ptr);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeMemFree returned %d", zret);
+        goto fn_fail;
+    }
+
+    return OPAL_SUCCESS;
+ fn_fail:
+    return OPAL_ERROR;
+}
+
+/*
+ * Look up the allocation containing ptr with zeMemGetAddressRange and return
+ * its base address in *base and its size in *size.  dev_id is not used.
+ * Returns OPAL_ERR_BAD_PARAM for a NULL argument and OPAL_ERROR when the
+ * query fails; the outputs are written only on success.
+ */
+static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base,
+                                                size_t *size)
+{
+    ze_result_t zret;
+    void *pBase;
+    size_t pSize;
+
+    if (NULL == ptr || NULL == base || NULL == size) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    zret = zeMemGetAddressRange(opal_accelerator_ze_context,
+                                ptr,
+                                &pBase,
+                                &pSize);
+    if (ZE_RESULT_SUCCESS != zret) {
+        /* *size is an output that has not been written yet, so it is
+         * deliberately left out of the message. */
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "couldn't get address range for pointer %p: %d", ptr, zret);
+        return OPAL_ERROR;
+    }
+
+    *size = pSize;
+    *base = (char *) pBase;
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * ZE doesn't have explicit host memory registration functions
+ */
+
+static int mca_accelerator_ze_host_register(int dev_id, void *ptr, size_t size)
+{
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_ze_host_unregister(int dev_id, void *ptr)
+{
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_ze_get_device(int *dev_id)
+{
+    /*
+     * this method does not map to the Level Zero API, just return 0.
+     * This may just work if the runtime uses the ZE_AFFINITY_MASK
+     * environment variable to control the visible PV(s) for a given process.
+     */
+
+    if (NULL == dev_id) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    *dev_id = 0;
+
+    return OPAL_SUCCESS;
+}
+
+/*
+ * Fill pci_attr with the PCI domain/bus/device/function of a device, as
+ * reported by zeDevicePciGetPropertiesExt.  Device 0 is used when dev_id is
+ * MCA_ACCELERATOR_NO_DEVICE_ID.
+ * Returns OPAL_ERR_BAD_PARAM for a NULL pci_attr or an index outside the
+ * device table, the lazy-init error, or OPAL_ERROR when the query fails.
+ */
+static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_pci_attr_t *pci_attr)
+{
+    int ret;
+    ze_result_t zret;
+    ze_device_handle_t hDevice;
+    /* Set stype/pNext before the query so the driver is not handed
+     * indeterminate values in the descriptor header. */
+    ze_pci_ext_properties_t pPciProperties = {
+        .stype = ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES,
+        .pNext = NULL,
+    };
+
+    if (NULL == pci_attr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    /* The device handle table is filled in during lazy initialization. */
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) {
+        return ret;
+    }
+
+    if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) {
+        hDevice = opal_accelerator_ze_devices_handle[0];
+    } else if (dev_id < 0 || dev_id >= (int)opal_accelerator_ze_device_count) {
+        return OPAL_ERR_BAD_PARAM;
+    } else {
+        hDevice = opal_accelerator_ze_devices_handle[dev_id];
+    }
+
+    zret = zeDevicePciGetPropertiesExt(hDevice, &pPciProperties);
+    if(ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeDevicePciGetPropertiesExt returned %d", zret);
+        return OPAL_ERROR;
+    }
+
+    pci_attr->domain_id = (uint16_t)pPciProperties.address.domain;
+    pci_attr->bus_id = (uint8_t) pPciProperties.address.bus;
+    pci_attr->device_id = (uint8_t)pPciProperties.address.device;
+    pci_attr->function_id = (uint8_t)pPciProperties.address.function;
+
+    return OPAL_SUCCESS;
+}
+
+
+/*
+ * could zeDeviceGetP2PProperties be used instead here?
+ *
+ * Ask zeDeviceCanAccessPeer whether device dev1 can access device dev2 and
+ * store 1 or 0 in *access.  Returns OPAL_ERR_BAD_PARAM for a NULL access
+ * pointer or an index outside the device table, the lazy-init error, or
+ * OPAL_ERROR when the query fails.
+ */
+static int mca_accelerator_ze_device_can_access_peer(int *access, int dev1, int dev2)
+{
+    int ret;
+    ze_result_t zret;
+    ze_bool_t value;
+    ze_device_handle_t hDevice;
+    ze_device_handle_t hPeerDevice;
+
+    if (NULL == access || dev1 < 0 || dev2 < 0){
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    /* The device handle table is filled in during lazy initialization. */
+    ret = opal_accelerator_ze_lazy_init();
+    if (OPAL_SUCCESS != ret) {
+        return ret;
+    }
+
+    if (dev1 >= (int)opal_accelerator_ze_device_count ||
+        dev2 >= (int)opal_accelerator_ze_device_count) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    hDevice = opal_accelerator_ze_devices_handle[dev1];
+    hPeerDevice = opal_accelerator_ze_devices_handle[dev2];
+
+    zret = zeDeviceCanAccessPeer(hDevice,
+                                 hPeerDevice,
+                                 &value);
+    if (ZE_RESULT_SUCCESS != zret) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "zeDeviceCanAccessPeer returned %d", zret);
+        return OPAL_ERROR;
+    }
+
+    *access = (value == 1) ?
1 : 0; + + return OPAL_SUCCESS; +} + +static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id) +{ + ze_result_t zret; + ze_memory_allocation_properties_t pMemAllocProperties; + ze_device_handle_t hDevice; + + if (NULL == buf_id) { + return OPAL_ERR_BAD_PARAM; + } + + if (MCA_ACCELERATOR_NO_DEVICE_ID == dev_id) { + hDevice = opal_accelerator_ze_devices_handle[0]; + } else { + hDevice = opal_accelerator_ze_devices_handle[dev_id]; + } + + zret = zeMemGetAllocProperties(opal_accelerator_ze_context, + addr, + &pMemAllocProperties, + &hDevice); + if (ZE_RESULT_SUCCESS != zret) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "zeMemGetAllocProperties returned %d", zret); + return OPAL_ERROR; + } + + *buf_id = pMemAllocProperties.id; + + return OPAL_SUCCESS; +} diff --git a/opal/mca/accelerator/ze/configure.m4 b/opal/mca/accelerator/ze/configure.m4 new file mode 100644 index 00000000000..48b5256eda8 --- /dev/null +++ b/opal/mca/accelerator/ze/configure.m4 @@ -0,0 +1,28 @@ +# -*- shell-script -*- +# +# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +# Copyright (c) 2023 Triad National Security, LLC. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_opal_accelerator_ze_CONFIG([action-if-can-compile], +# [action-if-cant-compile]) +# ------------------------------------------------ +AC_DEFUN([MCA_opal_accelerator_ze_CONFIG],[ + AC_CONFIG_FILES([opal/mca/accelerator/ze/Makefile]) + + OPAL_CHECK_ZE([opal_ze], + [opal_ze_happy="yes"], + [opal_ze_happy="no"]) + OPAL_SUMMARY_ADD([Accelerators], [Intel ZE support], [], [$opal_ze_happy]) + + AS_IF([test "$opal_ze_happy" = "yes"], + [$1], + [$2]) + AC_SUBST([opal_ze_LDFLAGS]) + AC_SUBST([opal_ze_LIBS]) +])dnl diff --git a/opal/runtime/opal_params_core.c b/opal/runtime/opal_params_core.c index 9cbc0ef86c9..48837bfdbdd 100644 --- a/opal/runtime/opal_params_core.c +++ b/opal/runtime/opal_params_core.c @@ -27,6 +27,7 @@ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2022 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -74,6 +75,8 @@ bool opal_warn_on_missing_libcuda = true; bool opal_built_with_rocm_support = OPAL_INT_TO_BOOL(OPAL_ROCM_SUPPORT); bool opal_rocm_runtime_initialized = false; +bool opal_built_with_ze_support = OPAL_INT_TO_BOOL(OPAL_ZE_SUPPORT); +bool opal_ze_runtime_initialized = false; /** * Globals imported from the OMPI layer. diff --git a/opal/runtime/opal_params_core.h b/opal/runtime/opal_params_core.h index bdf0a580a7e..9dd8870effa 100644 --- a/opal/runtime/opal_params_core.h +++ b/opal/runtime/opal_params_core.h @@ -22,6 +22,7 @@ * Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2022 Computer Architecture and VLSI Systems (CARV) * Laboratory, ICS Forth. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -57,6 +58,7 @@ extern bool opal_timing_overhead; OPAL_DECLSPEC extern int opal_initialized; OPAL_DECLSPEC extern bool opal_built_with_cuda_support; OPAL_DECLSPEC extern bool opal_built_with_rocm_support; +OPAL_DECLSPEC extern bool opal_built_with_ze_support; /** * * Whether we want to enable CUDA GPU buffer send and receive support. @@ -73,6 +75,11 @@ OPAL_DECLSPEC extern bool opal_cuda_runtime_initialized; */ OPAL_DECLSPEC extern bool opal_rocm_runtime_initialized; +/** + * Whether ze runtime support is initialized or not. + */ +OPAL_DECLSPEC extern bool opal_ze_runtime_initialized; + /** * * Whether we want to warn the user when libcuda is missing. * */ From e3a63e005b19921e747cba6b0f460b8540958950 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 9 Sep 2023 08:17:09 -0600 Subject: [PATCH 21/73] Update PMIx/PRRTE submodule pointers for CI Signed-off-by: Ralph Castain --- 3rd-party/openpmix | 2 +- 3rd-party/prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 22fe51cb7a9..4c444462d2b 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 22fe51cb7a961b6060fc5c48e659237cbe162566 +Subproject commit 4c444462d2bb0102faa6fda8410ca8e50a365e78 diff --git a/3rd-party/prrte b/3rd-party/prrte index ece4f3c45a0..0347baa1eda 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit ece4f3c45a07a069e5b8f9c5e641613dfcaeffc3 +Subproject commit 0347baa1edaec29c4f0cf1eac7b674ad7ba139c1 From 864caf3c144faa73157fd100e9dce7dd56bb01e7 Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Sat, 9 Sep 2023 11:10:30 -0500 Subject: [PATCH 22/73] Document MCA parameter changes from move from ORTE to PRRTE. 
Addresses Github issue https://github.com/open-mpi/ompi/issues/7668 Co-authored-by: jsquyres@cisco.com Signed-off-by: Quincey Koziol --- docs/mca.rst | 489 +++++++++++++++++++++++++++++++++++++ docs/version-numbering.rst | 6 + 2 files changed, 495 insertions(+) diff --git a/docs/mca.rst b/docs/mca.rst index 9cb6a95c29a..91aadae7d2b 100644 --- a/docs/mca.rst +++ b/docs/mca.rst @@ -655,3 +655,492 @@ presented here so that they can easily be found via internet searches: ``^accelerator,btl/uct``, then Open MPI will only warn about the failure to load DSOs that are neither in the accelerator framework nor are the UCT BTL. + +.. _label-mca-backward-compat: + +MCA Parameter Changes Between Open MPI 4.x and 5.x +-------------------------------------------------- + +When Open MPI :ref:`switched from using ORTE to PRRTE as its run-time +environment, ` some MCA +parameters were renamed to be more consistent and/or allow more +flexible behavior. The deprecated Open MPI MCA parameters listed +below are currently replaced by a corresponding new PRRTE parameter, +but may be removed in future releases. + +.. note:: In all cases listed below, the deprecated MCA parameter is + an Open MPI MCA parameter, meaning that its corresponding + environment variable was prefixed with ``OMPI_MCA_`` (e.g., + ``OMPI_MCA_orte_xml_output``). However, the corresponding + new MCA parameter is a PRRTE MCA parameter, meaning that its + corresponding environment variable is prefixed with + ``PRTE_MCA_`` (e.g., ``PRTE_MCA_output``). + + .. important:: Yes, that's a single ``R`` in the + ``PRTE_MCA_`` environment variable prefix. + `See this explanation + `_ for when one + R or two R's are used in the PRRTE name. + + +.. 
list-table:: + :header-rows: 1 + + * - Behavior + - Deprecated MCA parameter + - Replaced with + + * - Control buffering of stream output + - ``orte_ess_base_stream_buffering`` + + Values: 0 | 1 | 2 + + - ``ompi_stream_buffering`` + + Values: same + + * - Output a brief periodic report on launch progress + - ``orte_report_launch_progress`` + + Values: boolean + - ``state_base_show_launch_progress`` + + Values: same + + * - Provide all output in XML format + - ``orte_xml_output`` + + Values: boolean + - ``output`` + + Value: ``xml`` + + * - Tag all output with [job,rank] + - ``orte_tag_output`` + + Values: boolean + - ``output`` + + Value: ``tag`` + + * - Timestamp all application process output + - ``orte_timestamp_output`` + + Values: boolean + - ``output`` + + Value: ``timestamp`` + + * - Redirect output from application processes into filename / job + / rank / stdout / stderr / stddiag. + - ``orte_output_filename`` + + Value: ```` + - ``output`` + + Value: ``file=`` + + * - Display a detailed process map just before launch + - ``rmaps_base_display_devel_map`` + + Values: boolean + - ``display`` + + Value: ``map-devel`` + + * - Display the topology as part of the process map just before + launch + - ``rmaps_base_display_topo_with_map`` + + Values: ```` + - ``display`` + + Value: ``topo=`` + + * - Whether to report process bindings to stderr + - ``hwloc_base_report_bindings`` + + Values: boolean + - ``display`` + + Value: ``bind`` + + * - Display the process map just before launch + - ``rmaps_base_display_map`` + + Values: boolean + - ``display`` + + Value: ``map`` + + * - Display the allocation being used by this job + - ``orte_display_alloc`` + + Values: boolean + - ``display`` + + Value: ``allocation`` + + * - Do not run any MPI applications on the local node + - ``rmaps_base_no_schedule_local`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``[]:nolocal`` + + * - Nodes are allowed to be oversubscribed, even on a managed + system, and 
overloading of processing elements + - ``rmaps_base_oversubscribe`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``[]:oversubscribe`` + + * - Nodes are not to be oversubscribed, even if the system + supports such operation + - ``rmaps_base_no_oversubscribe`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``[]:nooversubscribe`` + + * - Use hardware threads as independent CPUs + - ``hwloc_base_use_hwthreads_as_cpus`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``[]:hwtcpus`` + + * - Comma-separated list of ranges specifying logical cpus + allocated to this job + - ``hwloc_base_cpu_set`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``pe-list=`` + + * - List of processor IDs to bind processes to + - ``hwloc_base_cpu_list`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``pe-list=`` + + * - Bind processes to cores + - ``hwloc_base_bind_to_core`` + + Values: boolean + - ``hwloc_default_binding_policy`` + + Value: ``core`` + + * - Bind processes to sockets + - ``hwloc_base_bind_to_socket`` + + Values: boolean + - ``hwloc_default_binding_policy`` + + Value: ``package`` + + * - Whether to map and rank processes round-robin by node + - ``rmaps_base_bynode`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``node`` + + * - Whether to map and rank processes round-robin by core + - ``rmaps_base_bycore`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``core`` + + * - Whether to map and rank processes round-robin by slot + - ``rmaps_base_byslot`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``slot`` + + * - Number of cpus to use for each process + - ``rmaps_base_cpus_per_rank`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``[]:pe=`` + + * - Launch n processes per node on all allocated nodes + - ``rmaps_ppr_n_pernode`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``ppr::node`` + 
+ * - Launch one process per available node + - ``rmaps_ppr_pernode`` + + Values: boolean + - ``rmaps_default_mapping_policy`` + + Value: ``ppr:1:node`` + + * - Launch n processes per socket on all allocated nodes + - ``rmaps_ppr_n_persocket`` + + Value: integer ```` + - ``rmaps_default_mapping_policy`` + + Value: ``ppr::package`` + + * - Comma-separated list of number of processes on a given + resource type + - ``rmaps_ppr_pattern`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``ppr:`` + + * - Provide a rankfile file + - ``orte_rankfile`` + + Value: ```` + - ``rmaps_default_mapping_policy`` + + Value: ``rankfile:file=`` + +Examples +^^^^^^^^^^^^^^^^^^^ + +Converting many of the parameters in the table above is straightforward where an +integer or boolean value is involved, but some of the conversions require +substituting a boolean with a value for the new parameter, or even constructing +a more complicated composite value for the new parameter. Examples of all +of these types of conversions are given below. + +Simple values, where only the name of the MCA parameter changed +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. code-block:: ini + + # Old environment variable: (integer value) + export OMPI_MCA_orte_ess_base_stream_buffering=2 + + # New environment variable: (integer value) + export PRTE_MCA_ompi_stream_buffering=2 + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_orte_report_launch_progress=1 + + # New environment variable: (boolean value) + export PRTE_MCA_state_base_show_launch_progress=1 + +Convert from boolean value to parameter for variable +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_orte_xml_output=1 + + # New environment variable: (parameter value) + export PRTE_MCA_output=xml + +..
code-block:: ini + + # Old environment variables: (boolean value) + export OMPI_MCA_orte_xml_output=1 + export OMPI_MCA_orte_timestamp_output=1 + + # New environment variable: (parameter value) + export PRTE_MCA_output=xml,timestamp + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_rmaps_base_display_devel_map=1 + + # New environment variable: (parameter value) + export PRTE_MCA_display=map-devel + +.. code-block:: ini + + # Old environment variables: (boolean value) + export OMPI_MCA_rmaps_base_display_devel_map=1 + export OMPI_MCA_rmaps_base_report_bindings=1 + + # New environment variable: (parameter value) + export PRTE_MCA_display=map-devel,bind + +Convert from string value to parameter for variable +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. code-block:: ini + + # Old environment variable: (string value) + export OMPI_MCA_orte_output_filename=output.txt + + # New environment variable: (parameter value) + export PRTE_MCA_output=file=output.txt + +.. code-block:: ini + + # Old environment variables: (boolean value) + export OMPI_MCA_orte_xml_output=1 + export OMPI_MCA_orte_timestamp_output=1 + + # Old environment variable: (string value) + export OMPI_MCA_orte_output_filename=output.txt + + # New environment variable: (parameter value) + export PRTE_MCA_output=xml,timestamp,file=output.txt + +.. code-block:: ini + + # Old environment variable: (string value) + export OMPI_MCA_rmaps_base_display_topo_with_map=node + + # New environment variable: (parameter value) + export PRTE_MCA_display=topo=node + +.. 
code-block:: ini + + # Old environment variables: (boolean value) + export OMPI_MCA_rmaps_base_display_devel_map=1 + export OMPI_MCA_rmaps_base_report_bindings=1 + + # Old environment variable: (string value) + export OMPI_MCA_rmaps_base_display_topo_with_map=node + + # New environment variable: (parameter value) + export PRTE_MCA_display=map-devel,bind,topo=node + +Converting mapping parameters +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +Mapping parameters were previously prefixed with ``rmaps_base_`` or ``hwloc_base_`` +(and also the ``orte_rankfile`` parameter). These have been updated +to the ``rmaps_default_mapping_policy`` and ``hwloc_default_binding_policy`` +parameters to be more consistent and indicate that they are the *default* +mapping for processes. Some of the old parameters are now values for a +new parameter and some are now suffixes, as shown in the examples below. + +The examples below show conversions from old boolean parameters to new +parameter values: + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_rmaps_base_bycore=1 + + # New environment variable: (parameter value) + export PRTE_MCA_rmaps_default_mapping_policy=core + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_hwloc_base_bind_to_socket=1 + + # New environment variable: (parameter value) + export PRTE_MCA_hwloc_default_binding_policy=package + + +The examples below show conversions from old parameters that have integer or +string values to new parameter values with those same values: + +.. code-block:: ini + + # Old environment variable: (string value) + export OMPI_MCA_hwloc_base_cpu_set=1,3,8 + + # New environment variable: (parameter value) + export PRTE_MCA_rmaps_default_mapping_policy=pe-list=1,3,8 + +.. 
code-block:: ini + + # Old environment variable: (integer value) + export OMPI_MCA_rmaps_ppr_n_persocket=4 + + # New environment variable: (parameter value) + export PRTE_MCA_rmaps_default_mapping_policy=ppr:4:package + +.. code-block:: ini + + # Old environment variable: (string value) + export OMPI_MCA_orte_rankfile=rankfile.txt + + # New environment variable: (parameter value) + export PRTE_MCA_rmaps_default_mapping_policy=rankfile:file=rankfile.txt + +The examples below show conversions from old parameters that map to suffixes +for new parameter values: + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_hwloc_base_use_hwthreads_as_cpus=1 + + # New environment variable: (standalone suffix) + export PRTE_MCA_rmaps_default_mapping_policy=:hwtcpus + +.. code-block:: ini + + # Old environment variable: (boolean value) + export OMPI_MCA_rmaps_base_oversubscribe=1 + + # New environment variable: (standalone suffix) + export PRTE_MCA_rmaps_default_mapping_policy=:oversubscribe + +The examples below show conversions from old parameters that map to suffixes +combined with parameters that have values: + +.. code-block:: ini + + # Old environment variable: (string value) + export OMPI_MCA_hwloc_base_cpu_set=1,3,8 + + # Old environment variable: (boolean value) + export OMPI_MCA_rmaps_base_oversubscribe=1 + + # New environment variable: (suffix on value) + export PRTE_MCA_rmaps_default_mapping_policy=pe-list=1,3,8:oversubscribe + +.. code-block:: ini + + # Old environment variable: (integer value) + export OMPI_MCA_rmaps_ppr_n_persocket=4 + + # Old environment variable: (boolean value) + export OMPI_MCA_hwloc_base_use_hwthreads_as_cpus=1 + + # New environment variable: (suffix on value) + export PRTE_MCA_rmaps_default_mapping_policy=ppr:4:package:hwtcpus + +Multiple suffixes may be appended to a mapping value: + +.. 
code-block:: ini + + # Old environment variable: (integer value) + export OMPI_MCA_rmaps_ppr_n_persocket=4 + + # Old environment variables: (boolean value) + export OMPI_MCA_hwloc_base_use_hwthreads_as_cpus=1 + export OMPI_MCA_rmaps_base_oversubscribe=1 + + # New environment variable: (suffix on value) + export PRTE_MCA_rmaps_default_mapping_policy=ppr:4:package:hwtcpus:oversubscribe + diff --git a/docs/version-numbering.rst b/docs/version-numbering.rst index 302bcd08d3f..b96da23de91 100644 --- a/docs/version-numbering.rst +++ b/docs/version-numbering.rst @@ -50,6 +50,12 @@ Similarly, if using a container technology that internally bundles all the libraries from Open MPI vX, attempting to launch that container with ``mpirun`` / ``oshrun`` from Open MPI vY is not guaranteed to work. +Open MPI |ompi_series| MCA parameter compatibility +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Several MCA parameters have been deprecated in Open MPI |ompi_series|; please +see this :ref:`table ` for the full list. + Software Version Number ----------------------- From 8337012800a33f89bdb28ba10888f5e0e8bd7df8 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 12 Sep 2023 08:13:06 +0000 Subject: [PATCH 23/73] OFI: add Intel Level Zero (aka ZE) to OFI MTL/BTL This commit adds support for registering Intel accelerator memory backed buffers with an OFI provider. Signed-off-by: Howard Pritchard --- config/opal_check_ofi.m4 | 13 ++++++++++++- ompi/mca/mtl/ofi/mtl_ofi.h | 7 ++++++- opal/mca/btl/ofi/btl_ofi_module.c | 5 +++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/config/opal_check_ofi.m4 b/config/opal_check_ofi.m4 index 051830821f4..ce575e0554e 100644 --- a/config/opal_check_ofi.m4 +++ b/config/opal_check_ofi.m4 @@ -4,6 +4,8 @@ dnl Copyright (c) 2015-2020 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights dnl reserved. dnl Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates.
All Rights reserved. +dnl Copyright (c) 2023 Triad National Security, LLC. All rights +dnl reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -155,7 +157,16 @@ AC_DEFUN([OPAL_CHECK_OFI],[ AC_DEFINE_UNQUOTED([OPAL_OFI_HAVE_FI_HMEM_ROCR], [${opal_check_fi_hmem_rocr}], - [check if FI_HMEM_ROCR avaiable in fi_hmem_iface])]) + [check if FI_HMEM_ROCR avaiable in fi_hmem_iface]) + + AC_CHECK_DECL([FI_HMEM_ZE], + [opal_check_fi_hmem_ze=1], + [opal_check_fi_hmem_ze=0], + [#include ]) + + AC_DEFINE_UNQUOTED([OPAL_OFI_HAVE_FI_HMEM_ZE], + [${opal_check_fi_hmem_ze}], + [check if FI_HMEM_ZE avaiable in fi_hmem_iface])]) CPPFLAGS=${opal_check_ofi_save_CPPFLAGS} LDFLAGS=${opal_check_ofi_save_LDFLAGS} diff --git a/ompi/mca/mtl/ofi/mtl_ofi.h b/ompi/mca/mtl/ofi/mtl_ofi.h index 9db062cd96a..8c008d12662 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi.h +++ b/ompi/mca/mtl/ofi/mtl_ofi.h @@ -2,7 +2,7 @@ * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * Copyright (c) 2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2019-2022 Triad National Security, LLC. All rights + * Copyright (c) 2019-2023 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved. * reserved. 
@@ -327,6 +327,11 @@ int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor, } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) { attr.iface = FI_HMEM_ROCR; opal_accelerator.get_device(&attr.device.cuda); +#endif +#if OPAL_OFI_HAVE_FI_HMEM_ZE + } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) { + attr.iface = FI_HMEM_ZE; + opal_accelerator.get_device(&attr.device.ze); #endif } else { return OPAL_ERROR; diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index 784d87d6ea2..578ac8d019b 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -267,6 +267,11 @@ int mca_btl_ofi_reg_mem(void *reg_data, void *base, size_t size, } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) { attr.iface = FI_HMEM_ROCR; opal_accelerator.get_device(&attr.device.cuda); +#endif +#if OPAL_OFI_HAVE_FI_HMEM_ZE + } else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) { + attr.iface = FI_HMEM_ZE; + opal_accelerator.get_device(&attr.device.ze); #endif } else { return OPAL_ERROR; From baf882ac13b2cd3fde6f7aabca98ef206dddbf33 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Tue, 12 Sep 2023 08:42:00 +0000 Subject: [PATCH 24/73] ofi - add MCA parameters to not use FI_HMEM This commit adds two MCA parameters: mtl_ofi_disable_hmem btl_ofi_disable_hmem to allow for disabling use of FI_HMEM in cases where the provider may advertise support for HMEM but in fact may not, and does not observe the OFI libfabric FI_HMEM_DISABLE_P2P environment variable. This is actually the situation as of the writing of this commit on certain systems owing to limitations in kernel support for registration of accelerator memory. 
The OFI provider on such systems unfortunately still advertises support for FI_HMEM with ZE but fails when trying to register memory. These MCA parameters allow for turning off use of FI_HMEM in such cases. Related to https://github.com/ofiwg/libfabric/issues/9315 Signed-off-by: Howard Pritchard --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 30 ++++++++++++++++++++++------ ompi/mca/mtl/ofi/mtl_ofi_types.h | 5 +++-- opal/mca/btl/ofi/btl_ofi.h | 2 ++ opal/mca/btl/ofi/btl_ofi_component.c | 16 +++++++++++++-- 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index e575cec02b2..e4ac687edd5 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -251,6 +251,15 @@ ompi_mtl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_ofi.num_ofi_contexts); + ompi_mtl_ofi.disable_hmem = false; + mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version, + "disable_hmem", + "Disable HMEM usage", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, + &ompi_mtl_ofi.disable_hmem); + return opal_common_ofi_mca_register(&mca_mtl_ofi_component.super.mtl_version); } @@ -626,8 +635,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, /* Request device transfer capabilities */ #if defined(FI_HMEM) - hints->caps |= FI_HMEM; - hints->domain_attr->mr_mode |= FI_MR_HMEM | FI_MR_ALLOCATED; + if (false == ompi_mtl_ofi.disable_hmem) { + hints->caps |= FI_HMEM; + hints->domain_attr->mr_mode |= FI_MR_HMEM | FI_MR_ALLOCATED; + } #endif no_hmem: @@ -791,10 +802,17 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, *accelerator_support = false; #if defined(FI_HMEM) - if (!(prov->caps & FI_HMEM)) { - opal_output_verbose(50, opal_common_ofi.output, - "%s:%d: Libfabric provider does not support device buffers.
Continuing with device to host copies.\n", - __FILE__, __LINE__); + if (!(prov->caps & FI_HMEM) || (true == ompi_mtl_ofi.disable_hmem)) { + if (!(prov->caps & FI_HMEM) && (false == ompi_mtl_ofi.disable_hmem)) { + opal_output_verbose(50, opal_common_ofi.output, + "%s:%d: Libfabric provider does not support device buffers. Continuing with device to host copies.\n", + __FILE__, __LINE__); + } + if (true == ompi_mtl_ofi.disable_hmem) { + opal_output_verbose(50, opal_common_ofi.output, + "%s:%d: Support for device buffers disabled by MCA parameter. Continuing with device to host copies.\n", + __FILE__, __LINE__); + } } else { *accelerator_support = true; ompi_mtl_ofi.hmem_needs_reg = true; diff --git a/ompi/mca/mtl/ofi/mtl_ofi_types.h b/ompi/mca/mtl/ofi/mtl_ofi_types.h index 836870f8ca7..a925f0ec28e 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_types.h +++ b/ompi/mca/mtl/ofi/mtl_ofi_types.h @@ -2,8 +2,8 @@ * Copyright (c) 2013-2018 Intel, Inc. All rights reserved * * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2022 Triad National Security, LLC. All rights - * reserved. + * Copyright (c) 2022-2023 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -59,6 +59,7 @@ typedef struct mca_mtl_ofi_module_t { int enable_sep; /* MCA to enable/disable SEP feature */ int thread_grouping; /* MCA for thread grouping feature */ int num_ofi_contexts; /* MCA for number of contexts to use */ + bool disable_hmem; /* MCA to enable/disable request for FI_HMEM support from provider */ /** Endpoint name length */ size_t epnamelen; diff --git a/opal/mca/btl/ofi/btl_ofi.h b/opal/mca/btl/ofi/btl_ofi.h index 590e9b34c2b..0019065ecfe 100644 --- a/opal/mca/btl/ofi/btl_ofi.h +++ b/opal/mca/btl/ofi/btl_ofi.h @@ -169,6 +169,8 @@ struct mca_btl_ofi_component_t { size_t max_inject_size; bool disable_inject; + bool disable_hmem; + /** All BTL OFI modules (1 per tl) */ mca_btl_ofi_module_t *modules[MCA_BTL_OFI_MAX_MODULES]; }; diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index fd52e56d848..a9de2620ac4 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -200,6 +200,16 @@ static int mca_btl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_ofi_component.disable_inject); + mca_btl_ofi_component.disable_hmem = false; + mca_base_component_var_register(&mca_btl_ofi_component.super.btl_version, + "disable_hmem", + "Disable HMEM usage", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_ofi_component.disable_hmem); + + /* for now we want this component to lose to the MTL. 
*/ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; @@ -345,8 +355,10 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, #if defined(FI_HMEM) /* Request device transfer capabilities, separate from required_caps */ - hints.caps |= FI_HMEM; - hints.domain_attr->mr_mode |= FI_MR_HMEM; + if (false == mca_btl_ofi_component.disable_hmem) { + hints.caps |= FI_HMEM; + hints.domain_attr->mr_mode |= FI_MR_HMEM; + } no_hmem: #endif From b192a785b2624cc19d7668bc0b8331046eb92a36 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Thu, 17 Aug 2023 09:58:57 +0300 Subject: [PATCH 25/73] SHMEM/MCA/SSHMEM/UCX: Fixing DEVICE_NIC_MEM support to use RDMA memory type Signed-off-by: Roie Danino Added a fallback for rdma allocation failure - allocating host memory instead Signed-off-by: Roie Danino --- config/ompi_check_ucx.m4 | 3 +- oshmem/mca/sshmem/ucx/configure.m4 | 30 +----- oshmem/mca/sshmem/ucx/sshmem_ucx.h | 1 - oshmem/mca/sshmem/ucx/sshmem_ucx_module.c | 117 ++++++---------------- 4 files changed, 33 insertions(+), 118 deletions(-) diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 75aeb93e26e..fbea98cd7b3 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -107,7 +107,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_ATOMIC_FETCH_OP_FXOR, UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, - UCP_OP_ATTR_FLAG_MULTI_SEND], + UCP_OP_ATTR_FLAG_MULTI_SEND, + UCS_MEMORY_TYPE_RDMA], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], diff --git a/oshmem/mca/sshmem/ucx/configure.m4 b/oshmem/mca/sshmem/ucx/configure.m4 index 4991c7557c0..7bb9038c5d0 100644 --- a/oshmem/mca/sshmem/ucx/configure.m4 +++ b/oshmem/mca/sshmem/ucx/configure.m4 @@ -28,34 +28,9 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ save_LIBS="$LIBS" save_CPPFLAGS="$CPPFLAGS" - alloc_dm_LDFLAGS=" -L$ompi_check_ucx_libdir/ucx" - alloc_dm_LIBS=" -luct_ib" CPPFLAGS+=" $sshmem_ucx_CPPFLAGS" - LDFLAGS+=" 
$sshmem_ucx_LDFLAGS $alloc_dm_LDFLAGS" - LIBS+=" $sshmem_ucx_LIBS $alloc_dm_LIBS" - - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[ - #include - #include - ]], - [[ - uct_md_h md = ucp_context_find_tl_md((ucp_context_h)NULL, ""); - (void)uct_ib_md_alloc_device_mem(md, NULL, NULL, 0, "", NULL); - uct_ib_md_release_device_mem(NULL); - ]])], - [ - AC_MSG_NOTICE([UCX device memory allocation is supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [1], [Support for device memory allocation]) - sshmem_ucx_LIBS+=" $alloc_dm_LIBS" - sshmem_ucx_LDFLAGS+=" $alloc_dm_LDFLAGS" - ], - [ - AC_MSG_NOTICE([UCX device memory allocation is not supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [0], [Support for device memory allocation]) - ]) - AC_LANG_POP([C]) + LDFLAGS+=" $sshmem_ucx_LDFLAGS" + LIBS+=" $sshmem_ucx_LIBS" CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" @@ -66,4 +41,3 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ AC_SUBST([sshmem_ucx_LDFLAGS]) AC_SUBST([sshmem_ucx_LIBS]) ])dnl - diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx.h b/oshmem/mca/sshmem/ucx/sshmem_ucx.h index b6085374caa..90d41ac002c 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx.h +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx.h @@ -35,7 +35,6 @@ OSHMEM_DECLSPEC extern mca_sshmem_ucx_component_t mca_sshmem_ucx_component; typedef struct mca_sshmem_ucx_segment_context { - void *dev_mem; sshmem_ucx_shadow_allocator_t *shadow_allocator; ucp_mem_h ucp_memh; } mca_sshmem_ucx_segment_context_t; diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c index fa38d0693a0..262bef5ffe6 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c @@ -26,13 +26,6 @@ #include "sshmem_ucx.h" -//#include - -#if HAVE_UCX_DEVICE_MEM -#include -#include -#endif - #define ALLOC_ELEM_SIZE sizeof(uint64_t) #define min(a,b) ((a) < (b) ? (a) : (b)) #define max(a,b) ((a) > (b) ? 
(a) : (b)) @@ -104,7 +97,7 @@ static segment_allocator_t sshmem_ucx_allocator = { static int segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, - unsigned flags, long hint, void *dev_mem) + unsigned flags, ucs_memory_type_t mem_type, int err_level) { mca_sshmem_ucx_segment_context_t *ctx; int rc = OSHMEM_SUCCESS; @@ -120,15 +113,19 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | UCP_MEM_MAP_PARAM_FIELD_LENGTH | - UCP_MEM_MAP_PARAM_FIELD_FLAGS; + UCP_MEM_MAP_PARAM_FIELD_FLAGS | + UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; - mem_map_params.address = address; - mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.address = address; + mem_map_params.length = size; + mem_map_params.flags = flags; + mem_map_params.memory_type = mem_type; status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { - SSHMEM_ERROR("ucp_mem_map() failed: %s\n", ucs_status_string(status)); + SSHMEM_VERBOSE(err_level, "ucp_mem_map(memory_type=%s) failed: %s\n", + ucs_memory_type_names[mem_type], + ucs_status_string(status)); rc = OSHMEM_ERROR; goto out; } @@ -161,12 +158,7 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); ds_buf->context = ctx; ds_buf->type = MAP_SEGMENT_ALLOC_UCX; - ds_buf->alloc_hints = hint; ctx->ucp_memh = mem_h; - ctx->dev_mem = dev_mem; - if (hint) { - ds_buf->allocator = &sshmem_ucx_allocator; - } out: OPAL_OUTPUT_VERBOSE( @@ -181,82 +173,37 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, return rc; } -#if HAVE_UCX_DEVICE_MEM -static uct_ib_device_mem_h alloc_device_mem(mca_spml_ucx_t *spml, size_t size, - void **address_p) -{ - uct_ib_device_mem_h dev_mem = NULL; - ucs_status_t status; - uct_md_h uct_md; - void *address; - size_t length; - - uct_md = 
ucp_context_find_tl_md(spml->ucp_context, "mlx5"); - if (uct_md == NULL) { - SSHMEM_VERBOSE(1, "ucp_context_find_tl_md() returned NULL\n"); - return NULL; - } - - /* If found a matching memory domain, allocate device memory on it */ - length = size; - address = NULL; - status = uct_ib_md_alloc_device_mem(uct_md, &length, &address, - UCT_MD_MEM_ACCESS_ALL, "sshmem_seg", - &dev_mem); - if (status != UCS_OK) { - /* If could not allocate device memory - fallback to mmap (since some - * PEs in the job may succeed and while others failed */ - SSHMEM_VERBOSE(1, "uct_ib_md_alloc_dm() failed: %s\n", - ucs_status_string(status)); - return NULL; - } - - SSHMEM_VERBOSE(3, "uct_ib_md_alloc_dm() returned address %p\n", address); - *address_p = address; - return dev_mem; -} -#endif - static int segment_create(map_segment_t *ds_buf, const char *file_name, size_t size, long hint) { mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self; - unsigned flags; + unsigned flags = UCP_MEM_MAP_ALLOCATE; + int status; -#if HAVE_UCX_DEVICE_MEM - int ret = OSHMEM_ERROR; if (hint & SHMEM_HINT_DEVICE_NIC_MEM) { - if (size > UINT_MAX) { - return OSHMEM_ERR_BAD_PARAM; +#if HAVE_DECL_UCS_MEMORY_TYPE_RDMA + status = segment_create_internal(ds_buf, NULL, size, flags, + UCS_MEMORY_TYPE_RDMA, 3); + if (status == OSHMEM_SUCCESS) { + ds_buf->alloc_hints = hint; + ds_buf->allocator = &sshmem_ucx_allocator; + return OSHMEM_SUCCESS; } - - void *dev_mem_address; - uct_ib_device_mem_h dev_mem = alloc_device_mem(spml, size, - &dev_mem_address); - if (dev_mem != NULL) { - int ret; - ret = segment_create_internal(ds_buf, dev_mem_address, size, 0, - hint, dev_mem); - if (ret == OSHMEM_SUCCESS) { - return OSHMEM_SUCCESS; - } else if (dev_mem != NULL) { - uct_ib_md_release_device_mem(dev_mem); - /* fallback to regular allocation */ - } - } - } +#else + SSHMEM_VERBOSE(3, "DEVICE_NIC_MEM hint ignored since UCX does not " + "support MEMORY_TYPE_RDMA"); #endif + return OSHMEM_ERR_NOT_IMPLEMENTED; + } - flags = 
UCP_MEM_MAP_ALLOCATE | (spml->heap_reg_nb ? UCP_MEM_MAP_NONBLOCK : 0); - if (hint) { - return segment_create_internal(ds_buf, NULL, size, flags, hint, NULL); - } else { - return segment_create_internal(ds_buf, mca_sshmem_base_start_address, - size, flags | UCP_MEM_MAP_FIXED, hint, - NULL); + flags |= UCP_MEM_MAP_FIXED; + if (spml->heap_reg_nb) { + flags |= UCP_MEM_MAP_NONBLOCK; } + return segment_create_internal(ds_buf, mca_sshmem_base_start_address, size, + flags, UCS_MEMORY_TYPE_HOST, 0); } static void * @@ -303,12 +250,6 @@ segment_unlink(map_segment_t *ds_buf) ucp_mem_unmap(spml->ucp_context, ctx->ucp_memh); -#if HAVE_UCX_DEVICE_MEM - if (ctx->dev_mem) { - uct_ib_md_release_device_mem(ctx->dev_mem); - } -#endif - ds_buf->context = NULL; free(ctx); From fb3b68f4b8dbc34958650db072fc2462c77024a8 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 15 Sep 2023 09:19:28 -0500 Subject: [PATCH 26/73] fbtl/posix: fix data-sieving calculations as part of introducing atomicity support for ompi v5.0, we also tried to improve the robustness in some file I/O routines. Unfortunately, this also introduced a bug since ret_code returned by a function does not necessarily contain the number of bytes read or written, but could contain the last value (e.g. 0). The value was however used in a subsequent calculation and we ended not copying data out of the temporary buffer used in the data sieving at all. This commit also simplifies some of the logic in the while loop, no need to retry to read past the end of the file multiple times. Fixes issue #11917 Code was tested with the reproducer provided as part of the issue, our internal testsuite, and the hdf5-1.4.2 testsuite, all tests pass. 
Signed-off-by: Edgar Gabriel --- ompi/mca/fbtl/posix/fbtl_posix_preadv.c | 25 +++++------------------- ompi/mca/fbtl/posix/fbtl_posix_pwritev.c | 11 +---------- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c index ea15adaf5fe..7f32c8e227b 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_preadv.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_preadv.c @@ -33,7 +33,6 @@ static ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh, struct flock static ssize_t mca_fbtl_posix_preadv_generic (ompio_file_t *fh, struct flock *lock, int *lock_counter); static ssize_t mca_fbtl_posix_preadv_single (ompio_file_t *fh, struct flock *lock, int *lock_counter); -#define MAX_RETRIES 10 ssize_t mca_fbtl_posix_preadv (ompio_file_t *fh ) { @@ -108,7 +107,6 @@ ssize_t mca_fbtl_posix_preadv_single (ompio_file_t *fh, struct flock *lock, int return OMPI_ERROR; } - int retries = 0; size_t len = fh->f_io_array[0].length; while ( total_bytes < len ) { ret_code = pread(fh->fd, (char*)fh->f_io_array[0].memory_address+total_bytes, @@ -121,13 +119,7 @@ ssize_t mca_fbtl_posix_preadv_single (ompio_file_t *fh, struct flock *lock, int } if ( ret_code == 0 ) { // end of file - retries++; - if ( retries == MAX_RETRIES ) { - break; - } - else { - continue; - } + break; } total_bytes += ret_code; } @@ -206,7 +198,6 @@ ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh, struct flock *lock, return OMPI_ERROR; } size_t total_bytes = 0; - int retries = 0; while ( total_bytes < len ) { ret_code = pread (fh->fd, temp_buf+total_bytes, len-total_bytes, start+total_bytes); @@ -218,13 +209,7 @@ ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh, struct flock *lock, } if ( ret_code == 0 ) { // end of file - retries++; - if ( retries == MAX_RETRIES ) { - break; - } - else { - continue; - } + break; } total_bytes += ret_code; } @@ -236,12 +221,12 @@ ssize_t mca_fbtl_posix_preadv_datasieving (ompio_file_t *fh, 
struct flock *lock, size_t start_offset = (size_t) fh->f_io_array[startindex].offset; for ( i = startindex ; i < endindex ; i++) { pos = (size_t) fh->f_io_array[i].offset - start_offset; - if ( (ssize_t) pos > ret_code ) { + if ( (ssize_t) pos > total_bytes ) { break; } num_bytes = fh->f_io_array[i].length; - if ( ((ssize_t) pos + (ssize_t)num_bytes) > ret_code ) { - num_bytes = ret_code - (ssize_t)pos; + if ( ((ssize_t) pos + (ssize_t)num_bytes) > total_bytes ) { + num_bytes = total_bytes - (ssize_t)pos; } memcpy (fh->f_io_array[i].memory_address, temp_buf + pos, num_bytes); diff --git a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c index b96bddcb894..9b99f968f7c 100644 --- a/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c +++ b/ompi/mca/fbtl/posix/fbtl_posix_pwritev.c @@ -34,8 +34,6 @@ static ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh, struct floc static ssize_t mca_fbtl_posix_pwritev_generic (ompio_file_t *fh, struct flock *lock, int *lock_counter ); static ssize_t mca_fbtl_posix_pwritev_single (ompio_file_t *fh, struct flock *lock, int *lock_counter ); -#define MAX_RETRIES 10 - ssize_t mca_fbtl_posix_pwritev(ompio_file_t *fh ) { ssize_t bytes_written=0; @@ -192,7 +190,6 @@ ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh, struct flock *lock return OMPI_ERROR; } - int retries=0; while ( total_bytes < len ) { ret_code = pread (fh->fd, temp_buf, len, start); if ( ret_code == -1 ) { @@ -203,13 +200,7 @@ ssize_t mca_fbtl_posix_pwritev_datasieving (ompio_file_t *fh, struct flock *lock } if ( ret_code == 0 ) { // end of file - retries++; - if ( retries == MAX_RETRIES ) { - break; - } - else { - continue; - } + break; } total_bytes += ret_code; } From 7074e59ea93e8eb5edde53655bea35d658c286f9 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 Sep 2023 15:06:09 -0400 Subject: [PATCH 27/73] Replace use of writev by sendmsg This allows the TCP BTL to avoid raising SIGPIPE on OSes that do not support 
SO_NOSIGPIPE. Correctly use the unsigned type of the vpid when using it as a starting position for finding the process rank in a group. Signed-off-by: George Bosilca --- ompi/group/group.h | 12 +++++------- opal/mca/btl/tcp/btl_tcp_frag.c | 22 +++++++++++++++++----- opal/win32/opal_uio.c | 6 +++--- opal/win32/opal_uio.h | 14 +++++++------- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/ompi/group/group.h b/ompi/group/group.h index 58251892015..c188e98f02f 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2020 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -459,19 +459,17 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t */ static inline int ompi_group_proc_lookup_rank (ompi_group_t* group, ompi_proc_t* proc) { - int i, np, v; + int i, np, rank; + opal_vpid_t v; assert( NULL != proc ); assert( !ompi_proc_is_sentinel(proc) ); np = ompi_group_size(group); if( 0 == np ) return MPI_PROC_NULL; /* heuristic: On comm_world, start the lookup from v=vpid, so that - * when working on comm_world, the search is O(1); - * Otherwise, wild guess: start from a proportional position - * compared to comm_world position. */ + * when working on comm_world, on average, the search remains O(1). 
*/ v = proc->super.proc_name.vpid; - v = (viov_ptr; + msg.msg_iovlen = frag->iov_cnt; + msg.msg_control = NULL; + msg.msg_controllen = 0; + + /* non-blocking write, continue if interrupted */ do { - cnt = writev(sd, frag->iov_ptr, frag->iov_cnt); + /* Use sendmsg to avoid issues with SIGPIPE as described in + * https://blog.erratasec.com/2018/10/tcpip-sockets-and-sigpipe.html# + */ + cnt = sendmsg(sd, &msg, msg_flags); if (cnt < 0) { switch (opal_socket_errno) { case EINTR: @@ -116,7 +128,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) case EWOULDBLOCK: return false; case EFAULT: - BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n", + BTL_ERROR(("mca_btl_tcp_frag_send: sendmsg error (%p, %lu)\n\t%s(%lu)\n", frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len, strerror(opal_socket_errno), (unsigned long) frag->iov_cnt)); /* send_lock held by caller */ @@ -125,7 +137,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) return false; default: BTL_PEER_ERROR(frag->endpoint->endpoint_proc->proc_opal, - ("mca_btl_tcp_frag_send: writev failed: %s (%d)", + ("mca_btl_tcp_frag_send: sendmsg failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); /* send_lock held by caller */ frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; diff --git a/opal/win32/opal_uio.c b/opal/win32/opal_uio.c index 0270e0f4f7b..3c4bfe7550b 100644 --- a/opal/win32/opal_uio.c +++ b/opal/win32/opal_uio.c @@ -2,7 +2,7 @@ Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. - Copyright (c) 2004-2005 The University of Tennessee and The University + Copyright (c) 2004-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. 
Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -26,12 +26,12 @@ of code to handle the windows error flags */ -int writev(int fd, struct iovec *iov, int cnt) +ssize_t sendmsg(int fd, const struct msghdr *message, int flags) { int err; DWORD sendlen; - err = WSASend((SOCKET) fd, &(iov->data), cnt, &sendlen, 0, NULL, NULL); + err = WSASendMsg((SOCKET) fd, message, flags, &sendlen, NULL, NULL); if (err < 0) { return err; diff --git a/opal/win32/opal_uio.h b/opal/win32/opal_uio.h index 2691b0bd3d4..642beda1128 100644 --- a/opal/win32/opal_uio.h +++ b/opal/win32/opal_uio.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -33,14 +33,14 @@ struct iovec { #define iov_len data.len BEGIN_C_DECLS + /* - * writev: - writev writes data to file descriptor fd, and from the buffers - described by iov. The number of buffers is specified by cnt. The - buffers are used in the order specified. Operates just like write - except that data is taken from iov instead of a contiguous buffer. + * sendmsg: + * writes data to a file descriptor. This is a convenience function to allow + * the TCP BTL to support Windows. Overall it should behave similarly to the + * POSIX sendmsg function.
*/ -OPAL_DECLSPEC int writev(int fd, struct iovec *iov, int cnt); +OPAL_DECLSPEC ssize_t sendmsg(int socket, const struct msghdr *message, int flags); /* readv reads data from file descriptor fd, and puts the result in the From cef772b4fbf3c5a9b9f6492e92cfabb3277fd5d3 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 22 Sep 2023 15:24:27 -0400 Subject: [PATCH 28/73] Use aggregate initialization to ensure all fields are set Removes complaints from coverity about msg.msg_flags not being set. For more information about this read the discussion on #11915. Signed-off-by: George Bosilca --- opal/mca/btl/tcp/btl_tcp_frag.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index b70ad9e34eb..36c01537895 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -105,16 +105,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) { ssize_t cnt; size_t i, num_vecs; - struct msghdr msg; + struct msghdr msg = { + .msg_iov = frag->iov_ptr, + .msg_iovlen = frag->iov_cnt }; int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iov = frag->iov_ptr; - msg.msg_iovlen = frag->iov_cnt; - msg.msg_control = NULL; - msg.msg_controllen = 0; - /* non-blocking write, continue if interrupted */ do { /* Use sendmsg to avoid issues with SIGPIPE as described in From da9206a4c1df7744d4cc05358336bf1aaff596f4 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 11 Jul 2023 12:02:54 -0400 Subject: [PATCH 29/73] mailmap: Add alternate email address for Jeff Squyres Signed-off-by: Jeff Squyres --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index e8516075720..b463497a038 100644 --- a/.mailmap +++ b/.mailmap @@ -32,6 +32,7 @@ Jeff Squyres Jeff Squyres --quiet <--quiet> Jeff Squyres +Jeff Squyres George Bosilca From ab7013787261ac93d0ea55ea94af00e07cd215a2 Mon Sep 17 00:00:00 2001 
From: Jeff Squyres Date: Sun, 24 Sep 2023 10:21:33 -0400 Subject: [PATCH 30/73] Update prrte submodule to include new RST functionality Signed-off-by: Jeff Squyres --- 3rd-party/prrte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd-party/prrte b/3rd-party/prrte index 0347baa1eda..9015ca02cce 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 0347baa1edaec29c4f0cf1eac7b674ad7ba139c1 +Subproject commit 9015ca02cce72acc03f86d399f939843c42b3dc8 From 1fd09447f4234f15d2d8c14dd4efba7601b67b8d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 11 Jul 2023 12:02:04 -0400 Subject: [PATCH 31/73] docs: slurp PRTE's RST files into mpirun.1 This commit introduces a fundamentally new concept: have configure search PRRTE for RST files to include in Open MPI's documentation (regardless of whether we're using the internal/bundled PRRTE or an external PRRTE). If we're building against an external PRRTE that is old enough that it doesn't have any RST files installed, we'll make up some dummy RST files that basically say "you don't get help/content here because your PRRTE is too old." To simplify the configury for this scheme, this commit also makes another change: the pre-built HTML docs and nroff man pages included in distribution tarballs are now located at docs/html/ and docs/man/, respectively (vs. the location where we'll build them: docs/_build/html/ and docs/_build/man/, respectively). There are two cases here: 1. If the user has Sphinx available, we'll build the docs under docs/_build/, and install those (effectively ignoring the pre-built docs). 2. If the user does not have Sphinx available, we'll just install the pre-built docs. This simplified things like "make clean" and "make distcheck". Including RST content from PRTE required another major change: when we build the RST docs in a VPATH scenario, we copy the entire docs/ source tree to the build tree.
This allows us to modify the RST sources a bit (e.g., to include the PRRTE RST files or generate dummy PRRTE RST files). mpirun.1.rst is updated to include the RST content from PRRTE about CLI options. More work needs to be done here to remove old, now-redundant content. Finally, we also amend the advice to implementors to have Sphinx installed when building their package so that Open MPI's build system can properly slurp in their PRRTE's RST docs. Signed-off-by: Jeff Squyres --- .gitignore | 9 + Makefile.ompi-rules | 9 + config/ompi_setup_prrte.m4 | 59 +++- configure.ac | 3 +- docs/Makefile.am | 270 ++++++++++++--- docs/conf.py | 20 +- docs/index.rst | 8 +- docs/installing-open-mpi/packagers.rst | 42 +++ .../required-support-libraries.rst | 5 +- docs/man-openmpi/man1/mpirun.1.rst | 307 +++++++++++------- docs/news/news-v5.0.x.rst | 7 +- docs/no-prrte-content.rst.txt | 24 ++ 12 files changed, 590 insertions(+), 173 deletions(-) create mode 100644 docs/no-prrte-content.rst.txt diff --git a/.gitignore b/.gitignore index d15a1bc8f88..c1bfe01444a 100644 --- a/.gitignore +++ b/.gitignore @@ -534,3 +534,12 @@ docs/_templates # Common Python virtual environment directory names venv py?? + +# Copies of PRRTE RST files (i.e., not source controlled in this tree) +docs/prrte-rst-content +docs/schizo-ompi-rst-content + +# Copies of the built HTML docs and man pages (for distribution +# tarballs) +docs/html +docs/man diff --git a/Makefile.ompi-rules b/Makefile.ompi-rules index 567bcfd99f3..d18d49c4978 100644 --- a/Makefile.ompi-rules +++ b/Makefile.ompi-rules @@ -2,6 +2,7 @@ # Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2020 Intel, Inc. All rights reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -26,6 +27,14 @@ OMPI_V_GEN = $(ompi__v_GEN_$V) ompi__v_GEN_ = $(ompi__v_GEN_$AM_DEFAULT_VERBOSITY) ompi__v_GEN_0 = @echo " GENERATE" $@; +OMPI_V_COPYALL = $(ompi__v_COPYALL_$V) +ompi__v_COPYALL_ = $(ompi__v_COPYALL_$AM_DEFAULT_VERBOSITY) +ompi__v_COPYALL_0 = @echo " COPY tree $@"; + +OMPI_V_SPHINX_COPYRST = $(ompi__v_SPHINX_COPYRST_$V) +ompi__v_SPHINX_COPYRST_ = $(ompi__v_SPHINX_COPYRST_$AM_DEFAULT_VERBOSITY) +ompi__v_SPHINX_COPYRST_0 = @echo " COPY RST source files"; + OMPI_V_SPHINX_HTML = $(ompi__v_SPHINX_HTML_$V) ompi__v_SPHINX_HTML_ = $(ompi__v_SPHINX_HTML_$AM_DEFAULT_VERBOSITY) ompi__v_SPHINX_HTML_0 = @echo " GENERATE HTML docs"; diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 4dffa6ceb2a..97eba7a1bd2 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -19,6 +19,7 @@ dnl Copyright (c) 2019-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. dnl Copyright (c) 2021 Nanook Consulting. All rights reserved. dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved. +dnl Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -35,10 +36,25 @@ dnl dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the dnl results of the build. AC_DEFUN([OMPI_SETUP_PRRTE],[ - OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy]) + AC_REQUIRE([AC_PROG_LN_S]) + +OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir]) opal_show_subtitle "Configuring PRRTE" + # We *must* have setup Sphinx before invoking this macro (i.e., it + # is a programming error -- not a run-time error -- if Sphinx was + # not previously setup). 
+ OAC_ASSERT_BEFORE([OAC_SETUP_SPHINX], [OMPI_SETUP_PRRTE]) + + # These are sym links to folders with PRRTE's RST files that we'll + # slurp into mpirun.1.rst. We'll remove these links (or even + # accidental full copies) now and replace them with new links to + # the PRRTE that we find, below. + target_rst_dir="$OMPI_TOP_BUILDDIR/docs" + rm -rf "$target_rst_dir/prrte-rst-content" + rm -rf "$target_rst_dir/schizo-ompi-rst-content" + OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1]) AC_ARG_WITH([prrte-bindir], @@ -101,12 +117,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ [$OMPI_USING_INTERNAL_PRRTE], [Whether or not we are using the internal PRRTE]) - OPAL_SUMMARY_ADD([Miscellaneous], [prrte], [], [$opal_prrte_mode]) + AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR) + AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR) + AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1]) + + OPAL_SUMMARY_ADD([Miscellaneous], [PRRTE], [], [$opal_prrte_mode]) OPAL_VAR_SCOPE_POP ]) - dnl _OMPI_SETUP_PRRTE_INTERNAL([action-if-success], [action-if-not-success]) dnl dnl Attempt to configure the built-in PRRTE. @@ -220,7 +239,15 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL], [ [AC_MSG_ERROR([PRRTE configuration failed. 
Cannot continue.])]) AS_IF([test "$internal_prrte_happy" = "yes"], - [$1], [$2]) + [AC_MSG_CHECKING([for internal PRRTE RST files]) + AS_IF([test -n "$SPHINX_BUILD"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/docs/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/mca/schizo/ompi" + AC_MSG_RESULT([found])], + [AC_MSG_RESULT([not found])]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) @@ -284,9 +311,27 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ [AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])]) AS_IF([test "$setup_prrte_external_happy" = "yes"], - [$1], [$2]) + [ # Determine if this external PRRTE has installed the RST + # directories that we care about + + AC_MSG_CHECKING([for external PRRTE RST files]) + prrte_install_dir=${with_prrte}/share/prte/rst + AS_IF([test -n "$SPHINX_BUILD"], + [AS_IF([test -d "$prrte_install_dir/prrte-rst-content" && \ + test -d "$prrte_install_dir/schizo-ompi-rst-content"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$prrte_install_dir/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$prrte_install_dir/schizo-ompi-rst-content" + AC_MSG_RESULT([found]) + ], + [ # This version of PRRTE doesn't have installed RST + # files. + AC_MSG_RESULT([not found]) + OMPI_HAVE_PRRTE_RST=0 + ]) + ]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) - - diff --git a/configure.ac b/configure.ac index 7c3c3936c3b..f03bdaf268c 100644 --- a/configure.ac +++ b/configure.ac @@ -28,6 +28,7 @@ # Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. # Copyright (c) 2019 Triad National Security, LLC. All rights # reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -1072,7 +1073,7 @@ AS_IF([test -z "$LEX" || \ dnl Note that we have to double escape the URL below dnl so that the # it contains doesn't confuse the Autotools -OAC_SETUP_SPHINX([$srcdir/docs/_build/man/MPI_T.3], +OAC_SETUP_SPHINX([$srcdir/docs/man/MPI_T.3], [[https://docs.open-mpi.org/en/main/developers/prerequisites.html#sphinx-and-therefore-python]]) # diff --git a/docs/Makefile.am b/docs/Makefile.am index 3aa2b3b960f..dc9a085e99e 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -1,6 +1,7 @@ # # Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. # +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -26,7 +27,7 @@ .NOTPARALLEL: OUTDIR = _build -SPHINX_CONFIG = conf.py +SPHINX_CONFIG = $(srcdir)/conf.py SPHINX_OPTS ?= -W --keep-going -j auto # Note: it is significantly more convenient to list all the source @@ -58,6 +59,9 @@ RST_SOURCE_FILES = \ EXTRA_DIST = \ requirements.txt \ + no-prrte-content.rst.txt \ + html \ + man \ $(SPHINX_CONFIG) \ $(TEXT_SOURCE_FILES) \ $(IMAGE_SOURCE_FILES) \ @@ -784,27 +788,48 @@ OSHMEM_MAN3 = \ MAN_OUTDIR = $(OUTDIR)/man +# If we're building the docs, then we install from the just-built +# docs. Otherwise, we install from the pre-built docs (i.e., the docs +# included in the tarball). +# +# NOTE: If we're in a git clone with a) no pre-built docs and b) +# Sphinx is not found, then both OPAL_BUILD_DOCS and OPAL_INSTALL_DOCS +# will be false, and the value of MAN_INSTALL_FROM will not be used.
+if OPAL_BUILD_DOCS +MAN_INSTALL_FROM = $(MAN_OUTDIR) +HTML_INSTALL_FROM = $(OUTDIR)/html +else +MAN_INSTALL_FROM = man +HTML_INSTALL_FROM = html +endif + +# For each of the man page macros below: +# +# *_RST: the .rst source files +# *_BUILT: the files in the _build/man directory +# *_INSTALL_FROM: the files in either the _build/man/ directory (if we +# are building the Sphinx docs) or the man/ directory (if we are not +# building the Sphinx docs, and are using the pre-built docs that +# are included in the tarball). OMPI_MAN1_RST = $(OMPI_MAN1:%.1=man-openmpi/man1/%.1.rst) OMPI_MAN1_BUILT = $(OMPI_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OMPI_MAN1_INSTALL_FROM = $(OMPI_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OMPI_MAN3_RST = $(OMPI_MAN3:%.3=man-openmpi/man3/%.3.rst) OMPI_MAN3_BUILT = $(OMPI_MAN3:%.3=$(MAN_OUTDIR)/%.3) +OMPI_MAN3_INSTALL_FROM = $(OMPI_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) OMPI_MAN7_RST = $(OMPI_MAN7:%.7=man-openmpi/man7/%.7.rst) OMPI_MAN7_BUILT = $(OMPI_MAN7:%.7=$(MAN_OUTDIR)/%.7) +OMPI_MAN7_INSTALL_FROM = $(OMPI_MAN7:%.7=$(MAN_INSTALL_FROM)/%.7) OSHMEM_MAN1_RST = $(OSHMEM_MAN1:%.1=man-oshmem/man1/%.1.rst) OSHMEM_MAN1_BUILT = $(OSHMEM_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OSHMEM_MAN1_INSTALL_FROM = $(OSHMEM_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OSHMEM_MAN3_RST = $(OSHMEM_MAN3:%.3=man-oshmem/man3/%.3.rst) OSHMEM_MAN3_BUILT = $(OSHMEM_MAN3:%.3=$(MAN_OUTDIR)/%.3) - -EXTRA_DIST += \ - $(OMPI_MAN1_BUILT) \ - $(OMPI_MAN3_BUILT) \ - $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) \ - $(OSHMEM_MAN3_BUILT) +OSHMEM_MAN3_INSTALL_FROM = $(OSHMEM_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) ########################################################################### @@ -845,49 +870,202 @@ EXTRA_DIST += \ $(OSHMEM_MAN1_CXX_REDIRECTS) \ $(OSHMEM_MAN1_FORTRAN_REDIRECTS) + +########################################################################### + +ALL_MAN_BUILT = \ + $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ + $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN_3_BUILT) + +# These 2 targets are used in
EXTRA_DIST: we make a full copy of the +# built HTML and man docs into a separate location that is included in +# the tarball. This gives users a full copy of the docs included in +# distribution tarballs. +html: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf html; cp -rp $(OUTDIR)/html . + +man: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf man; cp -rp $(OUTDIR)/man . + +# Remove the copies of the built HTML and man pages to get back to a +# clean git clone. +maintainer-clean-local: + rm -rf html man + +# If we're doing a VPATH build, we may have "html" and "man" +# directories in the build tree (e.g., if we did "make dist"). Remove +# these copies so that we can pass distcheck (of course: we never +# remove these directories from the source tree). +distclean-local: + if test "$(srcdir)" != "$(builddir)"; then \ + rm -rf html man; \ + fi + ########################################################################### if OPAL_BUILD_DOCS include $(top_srcdir)/Makefile.ompi-rules -# Have to not list these targets in EXTRA_DIST outside of the -# OPAL_BUILD_DOCS conditional because "make dist" will fail due to -# these missing targets (and therefore not run the "dist-hook" target -# in the top-level Makefile, which prints a pretty message about why -# "make dist" failed). +# Copy over the PRRTE RST files to this build tree. # -# We list the entire directory trees (html and man) to grab all -# generated files in them. -EXTRA_DIST += \ - $(OUTDIR)/html \ - $(OUTDIR)/man +# 1. If we're building with PRRTE support: +# +# 1a. If we're building the internal/bundled PRRTE, then we'll copy +# the internal/bundled PRRTE's RST files to the build tree. +# 1b. If we're building against an external PRRTE installation that +# has RST files in its install tree, then we'll copy that +# external PRRTE's RST files to the build tree. +# 1c.
If we're building against an external PRRTE installation that +# does NOT have RST files in its install tree, then we'll +# create some dummy RST files instead. +# +# 2. If we're building without PRRTE support, we'll create some dummy +# RST files instead. +# +# NOTE: We specifically list $(builddir) in the target name, just to +# ensure that "make" doesn't accidentally find this directory in the +# VPATH srcdir, and therefore not execute this rule (because Sphinx +# does not understand VPATH, and will ignore this directory in the +# VPATH srcdir). We can have this directory in the srcdir by doing a +# VPATH build of an official distribution tarball. -ALL_MAN_BUILT = \ - $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN_3_BUILT) +# Make the 2 directories that we need: schizo-ompi-rst-content and +# prrte-rst-content. +$(builddir)/schizo-ompi-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi +$(builddir)/prrte-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi + +# Get the schizo-ompi-rst-cli.rst file that we need. CAVEAT: we name +# it ".in" so that Sphinx doesn't slurp it in via two different +# locations in the RST docroot (i.e., via +# /schizo-ompi-rst-content/schizo-ompi-cli.rstxt and via +# /man-openmpi/man1/mpirun.1.rst). Sphinx *shouldn't* do this -- it +# should see the ".. include...." directive in mpirun.1.rst and *only* +# include the file once. But somehow it's also seeing it a 2nd time. +# So -- fine. We'll name it something other than .rst so that Sphinx +# doesn't do that. +# +# Regardless, either copy this file from the PRRTE install tree or +# make a bogus one (if we don't have one in the PRRTE install tree). +# +# Also, note: the rule to make the $(builddir)/schizo-ompi-rst-content +# directory must be in the AM_CONDITIONAL here, otherwise Automake +# complains. Meaning: we have to have same dependency listed in both +# the "if" and the "else" blocks. Grumble. 
+if OMPI_HAVE_PRRTE_RST +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* + $(OMPI_V_SPHINX_COPYRST) \ + dir=`dirname $@`; \ + cp -rpf $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* "$$dir" +else +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(srcdir)/no-prrte-content.rst.txt + if test ! -d "$$dir"; then mkdir "$$dir"; fi + $(OMPI_V_SPHINX_COPYRST) \ + dir=`dirname $@`; \ + cp -pf $(srcdir)/no-prrte-content.rst.txt "$$dir" +endif + +$(ALL_MAN_BUILT): $(builddir)/prrte-rst-content +$(ALL_MAN_BUILT): $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt $(ALL_MAN_BUILT): $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(ALL_MAN_BUILT): $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG) +# Render the RST source into both 1) full HTML docs and 2) nroff man +# pages. +# # List both commands (HTML and man) in a single rule because they # really need to be run in serial. Specifically, if they were two # different rules and someone ran "make -j", then both of them could # be writing to $(OUTDIR)/doctrees simultaneously, which would be Bad. # Use one of the man pages as a sentinel file to indicate whether all # the HTML docs and man pages have been built. +# +# It's therefore a little bit of a lie to have the target named +# $(ALL_MAN_BUILT) *also* generate all the HTML content, but... so be +# it. +# +# Also note that Open MPI's RST includes some conditional RST (from +# PRRTE -- i.e., whether we get the source RST from the internal +# PRRTE, an external PRRTE, or whether we create RST files from +# scratch). These conditionals mean that we have to make some changes +# to the input Sphinx RST tree before building it. But -- by Automake +# convention -- we can't modify the source tree. 
Hence, we have to +# copy over all the source RST files -- including its internal +# directory structure -- to the build tree, and then make our desired +# changes here in the build tree. This is a bit ugly, but we could +# not think of anything better to do. +# +# NOTE: This is a little gross in that for a VPATH build, we *always* +# copy from the source tree to the dest tree (if the target does not +# exist or any of the sources in the source tree -- thanks to +# make/VPATH handling -- have changed compared to the target). +# However, we're using "cp -p", so even though we're copying *all the +# sources* from the source tree to the build tree, the timestamp will +# reflect what is in the source tree. Hence, if the source file has +# not changed, then it won't look like the file in the build tree has +# changed. We're going to overwrite any local changes in the build +# tree, but you shouldn't be editing the build tree, anyway. So -- +# good enough. +# +# Finally, one added wrinkle: only copy the RST source files in +# prrte-rst-content that are referenced by ".. include::" in the +# schizo-ompi-cli.rstxt file. We do this because Sphinx complains if +# there are .rst files that are not referenced. :-( $(ALL_MAN_BUILT): - $(OMPI_V_SPHINX_HTML) $(SPHINX_BUILD) -M html "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) - $(OMPI_V_SPHINX_MAN) $(SPHINX_BUILD) -M man "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_COPYRST) if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + dir=.; \ + fi; \ + if test ! -d "$$dir"; then \ + mkdir -p "$$dir"; \ + fi; \ + cp -p "$$file" "$$dir"; \ + done; \ + fi; \ + for file in `fgrep '.. 
include::' $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt | awk '{ print $$3 }'`; do \ + filename=`basename $$file`; \ + cp -pf $(OMPI_PRRTE_RST_CONTENT_DIR)/$$filename "$(builddir)/prrte-rst-content"; \ + done + $(OMPI_V_SPHINX_HTML) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_MAN) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) # A useful rule to invoke manually to ensure that all of the external # HTML links we have are valid. Running this rule requires # connectivity to the general internet. linkcheck: - $(SPHINX_BUILD) -M linkcheck "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(SPHINX_BUILD) -M linkcheck "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) .PHONY: linkcheck -maintainer-clean-local: - $(SPHINX_BUILD) -M clean "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) +# Since we are building the docs, we built $(OUTDIR). Hence, we need +# to delete it during "make clean". Note that we can't add +# directories to CLEANFILES, because Automake only (effectively) does +# "rm -f $(CLEANFILES)" (not "rm -rf ..."). So we have to delete +# directories ourselves. +# +# Also, if this is a VPATH build, then we made a copy of a bunch of +# RST source files to the build tree. So delete all of those, too. +clean-local: + rm -rf $(OUTDIR) + rm -rf prrte-rst-content schizo-ompi-rst-content + if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + rm -rf `basename $$file`; \ + fi; \ + if test -n "$$dir" && test -d "$$dir"; then \ + rm -rf "$$dir"; \ + fi; \ + done; \ + fi # List all the built man pages here in the Automake BUILT_SOURCES # macro. 
This hooks into the normal Automake build mechanisms, and @@ -901,7 +1079,7 @@ endif OPAL_BUILD_DOCS if OPAL_INSTALL_DOCS man1_MANS = \ - $(OMPI_MAN1_BUILT) \ + $(OMPI_MAN1_INSTALL_FROM) \ $(OMPI_MAN1_C_REDIRECTS) if OMPI_HAVE_CXX_COMPILER man1_MANS += $(OMPI_MAN1_CXX_REDIRECTS) @@ -913,12 +1091,12 @@ if OMPI_WANT_JAVA_BINDINGS man1_MANS += $(OMPI_MAN1_JAVA_REDIRECTS) endif -man3_MANS = $(OMPI_MAN3_BUILT) -man7_MANS = $(OMPI_MAN7_BUILT) +man3_MANS = $(OMPI_MAN3_INSTALL_FROM) +man7_MANS = $(OMPI_MAN7_INSTALL_FROM) if PROJECT_OSHMEM man1_MANS += \ - $(OSHMEM_MAN1_BUILT) \ + $(OSHMEM_MAN1_INSTALL_FROM) \ $(OSHMEM_MAN1_C_REDIRECTS) # There is no OSHMEM equivalent of this conditional; just use the OMPI # conditional. @@ -929,7 +1107,7 @@ if OSHMEM_BUILD_FORTRAN_BINDINGS man1_MANS += $(OSHMEM_MAN1_FORTRAN_REDIRECTS) endif -man3_MANS += $(OSHMEM_MAN3_BUILT) +man3_MANS += $(OSHMEM_MAN3_INSTALL_FROM) endif # We do not know the names of all the generated HTML files: we only @@ -945,19 +1123,29 @@ endif # Automake-provided install macros to set desirable permissions on the # target directories and files. # -# Since this might be a VPATH build, first check to see if _build/html -# exists in the source tree. If not, do the find+install from the -# build tree. +# Check to see if we actually built the docs. If we did, copy from +# the _build/html tree in the builddir. In all other cases, see if +# there's a _build/html in the source tree (e.g., if this is a build +# from a tarball that included a _build/html); if that exists, copy +# from that. +# +# NOTE: We can't use the AM_CONDITIONAL OPAL_BUILD_DOCS in the middle +# of a block that uses the shell continuation character at the end of +# each line. Instead, we check if $(SPHINX_BUILD) is non-empty, which +# is the test used to construct OPAL_BUILD_DOCS. 
install-data-hook: $(MKDIR_P) $(DESTDIR)$(docdir) - if test -d $(srcdir)/_build/html; then \ - topdir=$(srcdir)/_build; \ - else \ - topdir=_build; \ + topdir= ; \ + if test -n "$(SPHINX_BUILD)" && test -d $(builddir)/$(HTML_INSTALL_FROM); then \ + topdir="$(builddir)/$(HTML_INSTALL_FROM)"; \ + elif test -d $(srcdir)/$(HTML_INSTALL_FROM); then \ + topdir="$(srcdir)/$(HTML_INSTALL_FROM)"; \ fi; \ - cd $$topdir; \ - find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ - find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; + if test -n "$$topdir"; then \ + cd $$topdir/..; \ + find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ + find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; ; \ + fi uninstall-hook: rm -rf $(DESTDIR)$(docdir) diff --git a/docs/conf.py b/docs/conf.py index bf192f5356b..b8b7e8c4690 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,9 +10,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os # -- Project information ----------------------------------------------------- @@ -24,8 +22,20 @@ author = 'The Open MPI Community' # The full version, including alpha/beta/rc tags -# Read the Open MPI version from the VERSION file -with open("../VERSION") as fp: +# Read the Open MPI version from the VERSION file in the source tree +# The docs/Makefile.am will set the env var OMPI_VERSION_FILE, because +# we might be doing a VPATH build. 
+filename = None +if 'OMPI_VERSION_FILE' in os.environ: + filename = os.environ['OMPI_VERSION_FILE'] +elif os.path.exists("../VERSION"): + filename = '../VERSION' + +if filename is None: + print("ERROR: Could not find Open MPI source tree VERSION file") + exit(1) + +with open(filename) as fp: ompi_lines = fp.readlines() ompi_data = dict() diff --git a/docs/index.rst b/docs/index.rst index a1f7d0b6d2f..c339c213622 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,9 +28,13 @@ Documentation for Open MPI can be found in the following locations: * - v5.0.0 and later - Web: https://docs.open-mpi.org/ - Tarball: ``docs/_build/html/index.html`` + Included in tarball: ``docs/html/index.html`` - Installed: ``$prefix/share/doc/openmpi/html/index.html`` + Built in source tree (if Sphinx available): ``docs/_build/html/index.html`` + + Installed: ``$docdir/html/index.html`` + + (which defaults to: ``$prefix/share/doc/openmpi/html/index.html``) * - v4.1.x and earlier - See the `legacy Open MPI FAQ `_ diff --git a/docs/installing-open-mpi/packagers.rst b/docs/installing-open-mpi/packagers.rst index 6435abded08..e43d52b101a 100644 --- a/docs/installing-open-mpi/packagers.rst +++ b/docs/installing-open-mpi/packagers.rst @@ -1,3 +1,5 @@ +.. _label-install-packagers: + Advice for packagers ==================== @@ -20,9 +22,26 @@ the following: .. code-block:: sh + # Install Sphinx so that Open MPI can re-build its docs with the + # installed PRRTE's docs + + virtualenv venv + . ./venv/bin/activate + pip install -r docs/requirements.txt + ./configure --with-libevent=external --with-hwloc=external \ --with-pmix=external --with-prrte=external ... +.. important:: Note the installation of the Sphinx tool so that Open + MPI can re-build its documentation with the external + PRRTE's documentation.
+ + Failure to do this will mean Open MPI's documentation + will be correct for the version of PRRTE that is + bundled in the Open MPI distribution, but may not be + entirely correct for the version of PRRTE that you are + building against. + The ``external`` keywords will force Open MPI's ``configure`` to ignore all the bundled libraries and only look for external versions of these support libraries. This also has the benefit of causing @@ -36,6 +55,29 @@ independently-built and installed versions. information about the required support library ``--with-FOO`` command line options. +Have Sphinx installed +--------------------- + +Since you should be (will be) installing Open MPI against an external +PRRTE and PMIx, you should have `Sphinx +`_ installed before running Open MPI's +``configure`` script. + +This will allow Open MPI to (re-)build its documentation according to +the PMIx and PRRTE that you are building against. + +To be clear: the Open MPI distribution tarball comes with pre-built +documentation |mdash| rendered in HTML and nroff |mdash| that is +suitable for the versions of PRRTE and PMIx that are bundled in that +tarball. + +However, if you are building Open MPI against not-bundled versions of +PRRTE / PMIx (as all packagers should be), Open MPI needs to re-build +its documentation with specific information from those external PRRTE +/ PMIx installs. For that, you need to have Sphinx installed before +running Open MPI's ``configure`` script. + + .. _label-install-packagers-dso-or-not: Components ("plugins"): DSO or no? diff --git a/docs/installing-open-mpi/required-support-libraries.rst b/docs/installing-open-mpi/required-support-libraries.rst index 9e02297998b..b411e1a02f5 100644 --- a/docs/installing-open-mpi/required-support-libraries.rst +++ b/docs/installing-open-mpi/required-support-libraries.rst @@ -399,6 +399,5 @@ Open MPI package should not include Hwloc, Libevent, PMIx, or PRRTE. 
Instead, it should depend on external, independently-built versions of these packages. -See the :ref:`Advice for packagers -` section for more -details. +See the :ref:`Advice for packagers ` section +for more details. diff --git a/docs/man-openmpi/man1/mpirun.1.rst b/docs/man-openmpi/man1/mpirun.1.rst index 66a0e75c269..c9168b60076 100644 --- a/docs/man-openmpi/man1/mpirun.1.rst +++ b/docs/man-openmpi/man1/mpirun.1.rst @@ -60,15 +60,17 @@ probably want to use a command line of the following form: This will run ``X`` copies of ```` in your current run-time environment (if running under a supported resource manager, Open MPI's -mpirun will usually automatically use the corresponding resource -manager process starter, as opposed to, for example, ``rsh`` or ``ssh``, which -require the use of a hostfile, or will default to running all ``X`` copies -on the localhost), scheduling (by default) in a round-robin fashion by -CPU slot. See the rest of this page for more details. - -Please note that mpirun automatically binds processes as of the start -of the v1.8 series. Three binding patterns are used in the absence of -any further directives (See :ref:`map/rank/bind defaults ` for more details): +``mpirun`` will usually automatically use the corresponding resource +manager process starter, as opposed to ``ssh`` (for example), which +require the use of a hostfile, or will default to running all ``X`` +copies on the localhost), scheduling (by default) in a round-robin +fashion by CPU slot. See the rest of this documentation for more +details. + +Please note that ``mpirun`` automatically binds processes to hardware +resources. 
Three binding patterns are used in the absence of any +further directives (See :ref:`map/rank/bind defaults +` for more details): * **Bind to core**: when the number of processes is <= 2 * **Bind to package**: when the number of processes is > 2 @@ -79,103 +81,43 @@ that you are either not bound at all (by specifying ``--bind-to none``), or bound to multiple cores using an appropriate binding level or specific number of processing elements per application process. -.. _man1-mpirun-definition-of-slot: - -DEFINITION OF 'SLOT' --------------------- - -The term "slot" is used extensively in the rest of this manual page. -A slot is an allocation unit for a process. The number of slots on a -node indicate how many processes can potentially execute on that node. -By default, Open MPI will allow one process per slot. - -If Open MPI is not explicitly told how many slots are available on a -node (e.g., if a hostfile is used and the number of slots is not -specified for a given node), it will determine a maximum number of -slots for that node in one of two ways: - -#. Default behavior: By default, Open MPI will attempt to discover the - number of processor cores on the node, and use that as the number - of slots available. - -#. When ``--use-hwthread-cpus`` is used: If ``--use-hwthread-cpus`` is - specified on the ``mpirun`` command line, then Open MPI will attempt to - discover the number of hardware threads on the node, and use that - as the number of slots available. - -This default behavior also occurs when specifying the ``--host`` -option with a single host. Thus, the command: - -.. code:: sh - - shell$ mpirun --host node1 ./a.out - -launches a number of processes equal to the number of cores on node -``node1``, whereas: - -.. code:: sh - - shell$ mpirun --host node1 --use-hwthread-cpus ./a.out - -launches a number of processes equal to the number of hardware -threads on ``node1``. 
- -When Open MPI applications are invoked in an environment managed by a -resource manager (e.g., inside of a Slurm job), and Open MPI was built -with appropriate support for that resource manager, then Open MPI will -be informed of the number of slots for each node by the resource -manager. For example: - -.. code:: sh - - shell$ mpirun ./a.out - -launches one process for every slot (on every node) as dictated by -the resource manager job specification. - -Also note that the one-process-per-slot restriction can be overridden -in unmanaged environments (e.g., when using hostfiles without a -resource manager) if oversubscription is enabled (by default, it is -disabled). Most MPI applications and HPC environments do not -oversubscribe; for simplicity, the majority of this documentation -assumes that oversubscription is not enabled. - -Slots are not hardware resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OPEN MPI'S USE OF PRRTE +----------------------- -Slots are frequently incorrectly conflated with hardware resources. -It is important to realize that slots are an entirely different metric -than the number (and type) of hardware resources available. +Open MPI uses the PMIx Reference Runtime Environment (PRRTE) as the +main engine for launching, monitoring, and terminating MPI processes. -Here are some examples that may help illustrate the difference: +Much of the documentation below is directly imported from PRRTE. As +such, it frequently refers to PRRTE concepts and command line options. +Except where noted, these concepts and command line arguments are all +applicable to Open MPI as well. Open MPI extends the available PRRTE +command line options, and also slightly modifies PRRTE's default +behaviors in a few cases. These will be specifically described in the +documentation below. -#. More processor cores than slots: Consider a resource manager job - environment that tells Open MPI that there is a single node with 20 - processor cores and 2 slots available.
By default, Open MPI will - only let you run up to 2 processes. - - Meaning: you run out of slots long before you run out of processor - cores. +COMMAND LINE OPTIONS +-------------------- -#. More slots than processor cores: Consider a hostfile with a single - node listed with a ``slots=50`` qualification. The node has 20 - processor cores. By default, Open MPI will let you run up to 50 - processes. +The core of Open MPI's ``mpirun`` processing is performed via the +`PRRTE <https://docs.prrte.org/>`_. Specifically: ``mpirun`` is +effectively a wrapper around ``prterun``, but ``mpirun``'s CLI options +are slightly different than PRRTE's CLI commands. - Meaning: you can run many more processes than you have processor - cores. +.. include:: /schizo-ompi-rst-content/schizo-ompi-cli.rstxt -.. _man1-mpirun-definition-of-processor-element: +OPTIONS (OLD / HARD-CODED CONTENT -- TO BE AUDITED) +--------------------------------------------------- -DEFINITION OF 'PROCESSOR ELEMENT' ---------------------------------- +.. admonition:: This is old content + :class: error -By default, Open MPI defines that a "processing element" is a -processor core. However, if ``--use-hwthread-cpus`` is specified on the -mpirun command line, then a "processing element" is a hardware thread. + This is the old section of manually hard-coded content. It should + probably be read / audited to see what we want to keep and what we + want to discard. -OPTIONS -------- + Feel free to refer to https://docs.prrte.org/ rather than + replicating content here (e.g., for the definition of a slot and + other things). mpirun will send the name of the directory where it was invoked on the local node to each of the remote nodes, and attempt to change to that @@ -251,10 +193,11 @@ processes will be bound to the package. context.
If no value is provided for the number of copies to execute (i.e., neither the ``-n`` nor its synonyms are provided on the command line), Open MPI will automatically execute a copy of the - program on each process slot (see :ref:`defintion of slot ` for description of a - "process slot"). This feature, however, can only be used in the SPMD - model and will return an error (without beginning execution of the - application) otherwise. + program on each process slot (see PRRTE's `defintion of "slot" + `_ + for description of a "process slot"). This feature, however, can + only be used in the SPMD model and will return an error (without + beginning execution of the application) otherwise. .. note:: The ``-n`` option is the preferred option to be used to specify the number of copies of the program to be executed, but the alternate @@ -280,7 +223,7 @@ To map processes: * ``--map-by ``: Map to the specified object, defaults to ``package``. Supported options include ``slot``, ``hwthread``, ``core``, ``L1cache``, ``L2cache``, ``L3cache``, ``package``, ``numa``, - ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. + ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. Any object can include modifiers by adding a ``:`` and any combination of the following: @@ -561,13 +504,17 @@ There are also other options: Note that if a number of slots is not provided to Open MPI (e.g., via the ``slots`` keyword in a hostfile or from a resource manager such as Slurm), the use of this option changes the default - calculation of number of slots on a node. See the :ref:`DEFINITION - OF 'SLOT' ` section. + calculation of number of slots on a node. See the PRRTE's + `defintion of "slot" + `_ + for more details. Also note that the use of this option changes the Open MPI's definition of a "processor element" from a processor core to a - hardware thread. See the :ref:`DEFINITION OF 'PROCESSOR ELEMENT' - ` section. + hardware thread. 
See + PRRTE's `defintion of a "processor element" + `_ + for more details. The following options are useful for developers; they are not generally useful to most Open MPI users: @@ -601,11 +548,23 @@ There may be other options listed with ``mpirun --help``. Environment Variables ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + * ``MPIEXEC_TIMEOUT``: Synonym for the ``--timeout`` command line option. DESCRIPTION ----------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + One invocation of ``mpirun`` starts an MPI application running under Open MPI. If the application is single process multiple data (SPMD), the application can be specified on the ``mpirun`` command line. @@ -630,6 +589,12 @@ while others are specific to a single program (e.g., ``-n``). Specifying Host Nodes ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Host nodes can be identified on the ``mpirun`` command line with the ``--host`` option or in a hostfile. @@ -679,6 +644,12 @@ from the resource manager. Specifying Number of Processes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + As we have just seen, the number of processes to run can be set using the hostfile. Other mechanisms exist. @@ -733,6 +704,12 @@ the ``-n`` option indicated that only 6 processes should be launched. Mapping Processes to Nodes: Using Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? 
Should it be updated and + retained, or removed? + The examples above illustrate the default mapping of process processes to nodes. This mapping can also be controlled with various ``mpirun`` options that describe mapping policies. @@ -845,6 +822,12 @@ and 2 each running uptime on nodes ``bb`` and ``cc``, respectively. Mapping, Ranking, and Binding: Oh My! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI employs a three-phase procedure for assigning process locations and ranks: @@ -934,7 +917,7 @@ Alternatively, processes can be mapped and bound to specified cores using the ``--map-by pe-list=`` option. For example, ``--map-by pe-list=0,2,5`` will map three processes all three of which will be bound to logical cores ``0,2,5``. If you intend to bind each of the three processes to different -cores then the ``:ordered`` qualifier can be used like +cores then the ``:ordered`` qualifier can be used like ``--map-by pe-list=0,2,5:ordered``. In this example, the first process on a node will be bound to CPU 0, the second process on the node will be bound to CPU 2, and the third process on the node will be bound to @@ -992,7 +975,7 @@ in ranking when the ``span`` qualifier is used instead of the default. In the above case, the output shows us that 2 cores have been bound per process. Specifically, the mapping by ``slot`` with the ``PE=2`` qualifier indicated that each slot (i.e., process) should consume two processor -elements. By default, Open MPI defines "processor element" as "core", +elements. By default, Open MPI defines "processor element" as "core", and therefore the ``--bind-to core`` caused each process to be bound to both of the cores to which it was mapped. 
@@ -1030,16 +1013,16 @@ MCA parameters can be set not only on the mpirun command line, but alternatively in a system or user ``mca-params.conf`` file or as environment variables, as described in the :ref:`Setting MCA Parameters `. These are MCA parameters for -the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to +the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to pass the MCA parameter key/value pair. Alternatively, the MCA parameter key/ -value pair may be specific on the command line by prefixing the key with +value pair may be specific on the command line by prefixing the key with ``PRRTE_MCA_``. Some examples include: .. list-table:: :header-rows: 1 * - Option - - PRRTE MCA parameter key + - PRRTE MCA parameter key - Value * - ``--map-by core`` @@ -1071,6 +1054,12 @@ value pair may be specific on the command line by prefixing the key with Defaults for Mapping, Ranking, and Binding ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If the user does not specify each of ``--map-by``, ``--rank-by``, and ``--bind-to`` option then the default values are as follows: * If no options are specified then @@ -1167,6 +1156,12 @@ The mapping pattern might be better seen if we change the default ``--rank-by`` Rankfiles ^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Rankfiles are text files that specify detailed information about how individual processes should be mapped to nodes, and to which processor(s) they should be bound. Each line of a rankfile specifies @@ -1226,6 +1221,12 @@ indexes of package and cores. Application Context or Executable Program? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + To distinguish the two different forms, mpirun looks on the command line for ``--app`` option. If it is specified, then the file named on the command line is assumed to be an application context. If it is @@ -1234,6 +1235,12 @@ not specified, then the file is assumed to be an executable program. Locating Files ^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If no relative or absolute path is specified for a file, Open MPI will first look for files by searching the directories specified by the ``--path`` option. If there is no ``--path`` option set or if the @@ -1252,6 +1259,12 @@ current working directory from the invocation of ``mpirun``. Current Working Directory ^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--wdir`` ``mpirun`` option (and its synonym, ``--wd``) allows the user to change to an arbitrary directory before the program is invoked. It can also be used in application context files to specify @@ -1279,6 +1292,12 @@ does not wait until :ref:`MPI_INIT(3) ` is called. Standard I/O ^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI directs UNIX standard input to ``/dev/null`` on all processes except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process inherits standard input from ``mpirun``. @@ -1309,6 +1328,12 @@ will be collected into the ``my_output`` file. Signal Propagation ^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + When ``mpirun`` receives a SIGTERM and SIGINT, it will attempt to kill the entire job by sending all processes in the job a SIGTERM, waiting a small number of seconds, then sending all processes in the job a @@ -1326,6 +1351,12 @@ Other signals are not currently propagated by ``mpirun``. Process Termination / Signal Handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + During the run of an MPI application, if any process dies abnormally (either exiting before invoking :ref:`MPI_FINALIZE(3) `, or dying as the result of a signal), ``mpirun`` will print out an @@ -1346,6 +1377,12 @@ safest) for the user to only clean up non-MPI state. Process Environment ^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Processes in the MPI application inherit their environment from the PRRTE daemon upon the node on which they are running. The environment is typically inherited from the user's shell. On remote @@ -1365,6 +1402,12 @@ for more details. Remote Execution ^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI requires that the ``PATH`` environment variable be set to find executables on remote nodes (this is typically only necessary in rsh- or ssh-based environments |mdash| batch/scheduled environments @@ -1431,6 +1474,12 @@ is equivalent to Exported Environment Variables ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + All environment variables that are named in the form ``OMPI_*`` will automatically be exported to new processes on the local and remote nodes. Environmental parameters can also be set/forwarded to the new @@ -1448,6 +1497,12 @@ them. Setting MCA Parameters ^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--mca`` switch allows the passing of parameters to various MCA (Modular Component Architecture) modules. MCA modules have direct impact on MPI programs because they allow tunable parameters to be set @@ -1508,6 +1563,12 @@ page for detailed information on this command. Setting MCA parameters and environment variables from file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--tune`` command line option and its synonym ``--mca`` ``mca_base_envar_file_prefix`` allows a user to set MCA parameters and environment variables with the syntax described below. This option @@ -1532,6 +1593,12 @@ have higher precedence than variables specified in the file. Running as root ^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + .. warning:: The Open MPI team **strongly** advises against executing ``mpirun`` as the root user. MPI applications should be run as regular (non-root) users. @@ -1558,6 +1625,12 @@ against this behavior. Exit status ^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? 
Should it be updated and + retained, or removed? + There is no standard definition for what ``mpirun`` should return as an exit status. After considerable discussion, we settled on the following method for assigning the ``mpirun`` exit status (note: in @@ -1599,6 +1672,12 @@ bullet points above). EXAMPLES -------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Be sure also to see the examples throughout the sections above. .. code:: sh @@ -1613,6 +1692,12 @@ messages. RETURN VALUE ------------ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + ``mpirun`` returns 0 if all processes started by mpirun exit after calling :ref:`MPI_FINALIZE(3) `. A non-zero value is returned if an internal error occurred in mpirun, or one or more diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index bd1bc89ad57..749dfa1c7bf 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -154,9 +154,10 @@ Open MPI version 5.0.0rc12 - Many MPI one-sided and RDMA emulation fixes for the ``tcp`` BTL. - - This patch series fixs many issues when running with ``--mca - osc rdma --mca btl tcp``, i.e., TCP support for one sided - MPI calls. + This patch series fixes many issues when running with ``--mca + osc rdma --mca btl tcp``, i.e., TCP support for one-sided + MPI calls. + - Many MPI one-sided fixes for the ``uct`` BTL. - Added support for ``acc_single_intrinsic`` to the one-sided ``ucx`` component. diff --git a/docs/no-prrte-content.rst.txt b/docs/no-prrte-content.rst.txt new file mode 100644 index 00000000000..ea034952d31 --- /dev/null +++ b/docs/no-prrte-content.rst.txt @@ -0,0 +1,24 @@ +.. This file is only used in certain cases.
Hence, the original file + in the Open MPI "docs" source tree ends in ".txt", so that Sphinx + will not complain if it is not used. If it *is* used, it is copied + to another file (that ends in ".rst") so that it can be properly + found / used by Sphinx. + +No content +^^^^^^^^^^ + +There is no meaningful content in this file because Open MPI was either: + +* Built without PRRTE support. + +* Built with a PRRTE that was too old to include machine-readable + documentation that could be incorporated into Open MPI's + documentation. + +If you build Open MPI with a newer version of PRRTE (and have the +Sphinx tool available when you run Open MPI's ``configure`` command), +you should get more meaningful documentation here. + +Hence, there is no documentation for this section. + +Sorry! From c3569811ec7cb518bb202b7d2fbeb32b238ca3c2 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sun, 10 Sep 2023 20:14:37 -0400 Subject: [PATCH 32/73] ReadTheDocs CI builds updates Since RTD doesn't run autogen, configure, or make, we now have to manually copy a few RST files from the embedded PRRTE to the docs/ tree before RTD invokes Sphinx. Signed-off-by: Jeff Squyres --- .readthedocs-pre-create-environment.sh | 36 ++++++++++++++++++++++++++ .readthedocs.yaml | 8 ++++++ 2 files changed, 44 insertions(+) create mode 100755 .readthedocs-pre-create-environment.sh diff --git a/.readthedocs-pre-create-environment.sh b/.readthedocs-pre-create-environment.sh new file mode 100755 index 00000000000..2709b822b80 --- /dev/null +++ b/.readthedocs-pre-create-environment.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -euxo pipefail + +# The ReadTheDocs build process does not run autogen/configure/make. +# Hence, we have to copy the PRRTE RST files (from the 3rd-party/prrte +# tree) to our docs/ tree manually. 
+ +# Ensure that we're in the RTD CI environment + +if [[ "${READTHEDOCS:-no}" == "no" ]]; then + echo "This script is only intended to be run in the ReadTheDocs CI environment" + exit 1 +fi + +SCHIZO_SRC_DIR=3rd-party/prrte/src/mca/schizo/ompi +SCHIZO_TARGET_DIR=docs/schizo-ompi-rst-content + +PRRTE_RST_SRC_DIR=3rd-party/prrte/src/docs/prrte-rst-content +PRRTE_RST_TARGET_DIR=docs/prrte-rst-content + +# Copy the OMPI schizo file from PRRTE + +cp -rp $SCHIZO_SRC_DIR $SCHIZO_TARGET_DIR + +# Only copy the PRRTE RST source files in prrte-rst-content that are +# referenced by ".. include::" in the schizo-ompi-cli.rst file. We do +# this because Sphinx complains if there are .rst files that are not +# referenced. :-( + +mkdir -p $PRRTE_RST_TARGET_DIR +files=`fgrep '.. include::' $SCHIZO_TARGET_DIR/schizo-ompi-cli.rstxt | awk '{ print $3 }'` +for file in $files; do + filename=`basename $file` + cp -pf $PRRTE_RST_SRC_DIR/$filename $PRRTE_RST_TARGET_DIR +done diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 44e0bbac5a7..2ba1fc07842 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,6 +12,11 @@ build: os: ubuntu-22.04 tools: python: "3.10" + jobs: + # RTD doesn't run configure or make. So we have to manually copy + # in the PRRTE RST files to docs/. + pre_create_environment: + - ./.readthedocs-pre-create-environment.sh python: install: @@ -21,3 +26,6 @@ python: sphinx: configuration: docs/conf.py fail_on_warning: true + +submodules: + include: all From e1f2eafd846349d26eab457ecd60f19590400ba9 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Mon, 11 Sep 2023 23:19:41 +0000 Subject: [PATCH 33/73] opal: deprecate dead code Remove unused code. The logic has been moved to openpmix/prrte. 
Signed-off-by: Wenduo Wang --- contrib/ompi_cplusplus.txt | 1 - opal/util/Makefile.am | 2 - opal/util/opal_pty.c | 256 ------------------------------------- opal/util/opal_pty.h | 53 -------- 4 files changed, 312 deletions(-) delete mode 100644 opal/util/opal_pty.c delete mode 100644 opal/util/opal_pty.h diff --git a/contrib/ompi_cplusplus.txt b/contrib/ompi_cplusplus.txt index a61994b0e69..35f2c95e36a 100644 --- a/contrib/ompi_cplusplus.txt +++ b/contrib/ompi_cplusplus.txt @@ -132,7 +132,6 @@ ./opal/util/few.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/keyval_parse.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/malloc.h: defined(c_plusplus) defined(__cplusplus) -./opal/util/opal_pty.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/os_path.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/qsort.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/show_help_lex.h: defined(c_plusplus) defined(__cplusplus) diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 646f44412b2..23f6b0ccd67 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -63,7 +63,6 @@ headers = \ numtostr.h \ opal_environ.h \ opal_getcwd.h \ - opal_pty.h \ os_dirpath.h \ os_path.h \ output.h \ @@ -108,7 +107,6 @@ libopalutil_core_la_SOURCES = \ numtostr.c \ opal_environ.c \ opal_getcwd.c \ - opal_pty.c \ os_dirpath.c \ os_path.c \ output.c \ diff --git a/opal/util/opal_pty.c b/opal/util/opal_pty.c deleted file mode 100644 index adbbc8570bb..00000000000 --- a/opal/util/opal_pty.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/*- - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "opal_config.h" - -#ifdef HAVE_SYS_CDEFS_H -# include -#endif -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#include -#ifdef HAVE_SYS_IOCTL_H -# include -#endif -#ifdef HAVE_FCNTL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif -#include -#ifdef HAVE_UNISTD_H -# include -#endif -#include -#include -#ifdef HAVE_GRP_H -# include -#endif -#ifdef HAVE_PTY_H -# include -#endif -#ifdef HAVE_UTMP_H -# include -#endif - -#ifdef HAVE_PTSNAME -# include -# ifdef HAVE_STROPTS_H -# include -# endif -#endif - -#ifdef HAVE_UTIL_H -# include -#endif - -#include "opal/util/opal_pty.h" - -/* The only public interface is openpty - all others are to support - openpty() */ - -#if OPAL_ENABLE_PTY_SUPPORT == 0 - -int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp) -{ - return -1; -} - -#elif defined(HAVE_OPENPTY) - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - return openpty(amaster, aslave, name, termp, winp); -} - -#else - -/* implement openpty in terms of ptym_open and ptys_open */ - -static int ptym_open(char *pts_name); -static int ptys_open(int fdm, char *pts_name); - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - char line[20]; - *amaster = ptym_open(line); - if (*amaster < 0) { - return -1; - } - *aslave = ptys_open(*amaster, line); - if (*aslave < 0) { - close(*amaster); - return -1; - } - if (name) { - // We don't know the max length of name, but we do know the - // max length of the source, so at least use that. 
- opal_string_copy(name, line, sizeof(line)); - } -# ifndef TCSAFLUSH -# define TCSAFLUSH TCSETAF -# endif - if (termp) { - (void) tcsetattr(*aslave, TCSAFLUSH, termp); - } -# ifdef TIOCSWINSZ - if (winp) { - (void) ioctl(*aslave, TIOCSWINSZ, (char *) winp); - } -# endif - return 0; -} - -static int ptym_open(char *pts_name) -{ - int fdm; -# ifdef HAVE_PTSNAME - char *ptr; - -# ifdef _AIX - strcpy(pts_name, "/dev/ptc"); -# else - strcpy(pts_name, "/dev/ptmx"); -# endif - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - return -1; - } - if (grantpt(fdm) < 0) { /* grant access to slave */ - close(fdm); - return -2; - } - if (unlockpt(fdm) < 0) { /* clear slave's lock flag */ - close(fdm); - return -3; - } - ptr = ptsname(fdm); - if (ptr == NULL) { /* get slave's name */ - close(fdm); - return -4; - } - strcpy(pts_name, ptr); /* return name of slave */ - return fdm; /* return fd of master */ -# else - char *ptr1, *ptr2; - - strcpy(pts_name, "/dev/ptyXY"); - /* array index: 012345689 (for references in following code) */ - for (ptr1 = "pqrstuvwxyzPQRST"; *ptr1 != 0; ptr1++) { - pts_name[8] = *ptr1; - for (ptr2 = "0123456789abcdef"; *ptr2 != 0; ptr2++) { - pts_name[9] = *ptr2; - /* try to open master */ - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - if (errno == ENOENT) { /* different from EIO */ - return -1; /* out of pty devices */ - } else { - continue; /* try next pty device */ - } - } - pts_name[5] = 't'; /* change "pty" to "tty" */ - return fdm; /* got it, return fd of master */ - } - } - return -1; /* out of pty devices */ -# endif -} - -static int ptys_open(int fdm, char *pts_name) -{ - int fds; -# ifdef HAVE_PTSNAME - /* following should allocate controlling terminal */ - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -5; - } -# if defined(__SVR4) && defined(__sun) - if (ioctl(fds, I_PUSH, "ptem") < 0) { - close(fdm); - close(fds); - return -6; - } - if (ioctl(fds, I_PUSH, "ldterm") < 0) { - close(fdm); - close(fds); - return -7; - } 
-# endif - - return fds; -# else - int gid; - struct group *grptr; - - grptr = getgrnam("tty"); - if (grptr != NULL) { - gid = grptr->gr_gid; - } else { - gid = -1; /* group tty is not in the group file */ - } - /* following two functions don't work unless we're root */ - chown(pts_name, getuid(), gid); - chmod(pts_name, S_IRUSR | S_IWUSR | S_IWGRP); - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -1; - } - return fds; -# endif -} - -#endif /* #ifdef HAVE_OPENPTY */ diff --git a/opal/util/opal_pty.h b/opal/util/opal_pty.h deleted file mode 100644 index f30cd97d5ec..00000000000 --- a/opal/util/opal_pty.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_UTIL_PTY_H -#define OPAL_UTIL_PTY_H - -#include "opal_config.h" - -#ifdef HAVE_UTIL_H -# include -#endif -#ifdef HAVE_LIBUTIL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif - -BEGIN_C_DECLS - -#if OPAL_ENABLE_PTY_SUPPORT - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, - struct winsize *winp); - -#else - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp); - -#endif - -END_C_DECLS - -#endif /* OPAL_UTIL_PTY_H */ From fff842684005f556343209c0f4c2e88133f19b51 Mon Sep 17 00:00:00 2001 From: Evgeny Baskakov Date: Mon, 25 Sep 2023 12:25:40 -0700 Subject: [PATCH 34/73] Bugfix in OMPI_ARRAY_FINT_2_INT_ALLOC and OMPI_ARRAY_LOGICAL_2_INT_ALLOC macros for incorrect storage size calculation. Signed-off-by: Evgeny Baskakov --- ompi/mpi/fortran/base/fint_2_int.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ompi/mpi/fortran/base/fint_2_int.h b/ompi/mpi/fortran/base/fint_2_int.h index d3c71454386..ec2ba43fa1b 100644 --- a/ompi/mpi/fortran/base/fint_2_int.h +++ b/ompi/mpi/fortran/base/fint_2_int.h @@ -60,7 +60,7 @@ /* This is for OUT parameters. Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) /* This is for IN/IN-OUT parameters. Does alloc and assignment */ #define OMPI_ARRAY_FINT_2_INT(in, n) \ @@ -117,7 +117,7 @@ /* This is for OUT parameters. 
Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) #define OMPI_ARRAY_FINT_2_INT(in, n) \ do { \ @@ -204,7 +204,7 @@ # define OMPI_LOGICAL_ARRAY_NAME_DECL(in) int * c_##in # define OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) c_##in # define OMPI_ARRAY_LOGICAL_2_INT_ALLOC(in,n) \ - OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) # define OMPI_ARRAY_LOGICAL_2_INT_CLEANUP(in) \ free(OMPI_LOGICAL_ARRAY_NAME_CONVERT(in)) From 62d19b01534fd3f781e33f2762ef5371a9dc95f2 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 21 Sep 2023 13:38:37 +0000 Subject: [PATCH 35/73] pr-checks: update compile-rocm workflow - use rocm-hip-runtime instead of rocm-hip-sdk macropackage to reduce the size of the installed packages - add a clean-up step to the rocm-compile script to help potentially with the memory-consumption of the github actions environment. 
Signed-off-by: Edgar Gabriel --- .github/workflows/compile-rocm.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/compile-rocm.yaml b/.github/workflows/compile-rocm.yaml index 7c98e1a5916..cf4ad932032 100644 --- a/.github/workflows/compile-rocm.yaml +++ b/.github/workflows/compile-rocm.yaml @@ -17,7 +17,7 @@ jobs: curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg echo 'deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/debian focal main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update - sudo apt-get install -y rocm-hip-sdk + sudo apt-get install -y rocm-hip-runtime - uses: actions/checkout@v3 with: submodules: recursive @@ -26,3 +26,9 @@ jobs: ./autogen.pl ./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran make -j + - name: Clean up + run: | + ls -la ./ + rm -rf ./* + rm -rf ./.??* + ls -la ./ \ No newline at end of file From 2a38fe4390d3ec6ba95996272ec1360dc32c7f4c Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Fri, 29 Sep 2023 15:10:45 -0400 Subject: [PATCH 36/73] Update news in preparation for v5.0.0rc13. Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 749dfa1c7bf..54445d8b743 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc12 +Open MPI version 5.0.0rc13 -------------------------- -:Date: 19 May 2023 +:Date: 29 September 2023 .. 
admonition:: The MPIR API has been removed :class: warning @@ -66,30 +66,23 @@ Open MPI version 5.0.0rc12 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc11: - - - ``accelerator/rocm``: add SYNC_MEMOPS support. - - Update PMIx, PRRTe, and OAC submodule pointers. - - Fix ``mca_btl_ofi_flush()`` in multithreaded environments.. - - ``smcuda``: fixed an edge case when building MCA components as - dynamic shared objects. - - Fix ``MPI_Session_init()`` bug if all previous sessions are - finalized. - - Fix `mpi4py `_ hang in - ``MPI_Intercomm_create_from_groups()``. - - Fix finalization segfault with OSHMEM 4.1.5. - - Improve AVX detection. Fixes ``op/avx`` link failure with the - ``nvhpc`` compiler. - - Fix incorrect results with ``pml/ucx`` using Intel compiler. - - Fix segfault when broadcasting large MPI structs. - - Add platform files for Google Cloud HPC. - - UCC/HCOLL: Fix ``MPI_Waitall()`` for non blokcing collectives. - - Fix pre-built docs check. +- Changes since rc12: + + - Update PMIx to the ``v4.2.6`` release tag. Hash: ``f20e0d5``. + - Update PRRTE to the ``v3.0.1`` release tag. Hash: ``63370ca``. + - Lots of documentation updates. + - Fixed parameter name in ``MPI_Intercomm_merge``. Thanks to Yan Wu for the report. + - ``OFI``: Update NIC selection to determine optimal interfaces from the current process. + - Fix reordering of received data in ``MPI_Gather``. + - Disable builds with ``HWLOC`` versions >= 3.0.0. This is currently not supported. + - Fix re-ordering of ranks in ``MPI_Dist_graph_create``. + - ``coll/HAN``: Fix bug when using ``MPI_IN_PLACE`` with ``MPI_Reduce``. + - Fix ``MPI_Type_Dup`` to propagate errors from inner calls. + - Fix the compilation of the monitoring infrastructure. + - Various other bug fixes. - All other notable updates for v5.0.0: - - Update PMIx to the ``v4.2`` branch - current hash: ``f34a7ce2``. - - Update PRRTE to the ``v3.0`` branch - current hash: ``c4925aa5cc``. 
- New Features: - ULFM Fault Tolerance support has been added. See :ref:`the ULFM From fea96e1b2debdcbc896123b23cf642fb42dd6b3e Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 24 Aug 2023 12:04:11 -0500 Subject: [PATCH 37/73] docs: Better explain required versions of support libraries Get rid of the table that listed the required support library versions -- the table format was too limiting. Instead, add a bunch more verbiage about the versions that are included in the Open MPI distribution tarball and the minimum required versions of each of hwloc, libevent, PMIx, and PRRTE. Pay special attention to the corner cases of building PMIx (internal and external), with and without PRRTE. Also set the minimum required versions for PMIx and PRRTE in VERSIONS: * Testing shows that Open MPI v5.0.x requires at least PMIx v4.2.0 * The minimum required version for PRRTE is v3.0.0, but we recommend in the docs that users use >=v3.0.1 so that they get a full mpirun(1) man page Finally, also show in the docs the versions of the embedded packages (hwloc, libevent, PMIx, and PRRTE). This required adding a little Python in docs/conf.py to read VERSION files and extract version numbers from tarball filenames. Signed-off-by: Jeff Squyres --- VERSION | 4 +- docs/Makefile.am | 4 +- docs/conf.py | 108 ++++++++++---- .../required-support-libraries.rst | 141 ++++++++++++------ 4 files changed, 178 insertions(+), 79 deletions(-) diff --git a/VERSION b/VERSION index 2178439c11a..b2ba2da0c1f 100644 --- a/VERSION +++ b/VERSION @@ -25,8 +25,8 @@ mpi_standard_subversion=1 # OMPI required dependency versions. # List in x.y.z format. 
-pmix_min_version=4.1.2 -prte_min_version=2.0.2 +pmix_min_version=4.2.0 +prte_min_version=3.0.0 hwloc_min_version=1.11.0 event_min_version=2.0.21 automake_min_version=1.13.4 diff --git a/docs/Makefile.am b/docs/Makefile.am index dc9a085e99e..2023ece395b 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -1032,8 +1032,8 @@ $(ALL_MAN_BUILT): filename=`basename $$file`; \ cp -pf $(OMPI_PRRTE_RST_CONTENT_DIR)/$$filename "$(builddir)/prrte-rst-content"; \ done - $(OMPI_V_SPHINX_HTML) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) - $(OMPI_V_SPHINX_MAN) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_HTML) OMPI_TOP_SRCDIR=$(top_srcdir) $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_MAN) OMPI_TOP_SRCDIR=$(top_srcdir) $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) # A useful rule to invoke manually to ensure that all of the external # HTML links we have are valid. Running this rule requires diff --git a/docs/conf.py b/docs/conf.py index b8b7e8c4690..f0d0e283092 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,46 +14,85 @@ # -- Project information ----------------------------------------------------- +import os +import re import datetime + year = datetime.datetime.now().year project = 'Open MPI' copyright = f'2003-{year}, The Open MPI Community' author = 'The Open MPI Community' -# The full version, including alpha/beta/rc tags -# Read the Open MPI version from the VERSION file in the source tree -# The docs/Makefile.am will set the env var OMPI_VERSION_FILE, because -# we might be doing a VPATH build. 
-filename = None -if 'OMPI_VERSION_FILE' in os.environ: - filename = os.environ['OMPI_VERSION_FILE'] -elif os.path.exists("../VERSION"): - filename = '../VERSION' - -if filename is None: - print("ERROR: Could not find Open MPI source tree VERSION file") - exit(1) - -with open(filename) as fp: - ompi_lines = fp.readlines() - -ompi_data = dict() -for ompi_line in ompi_lines: - if '#' in ompi_line: - parts = ompi_line.split("#") - ompi_line = parts[0] - ompi_line = ompi_line.strip() - - if '=' not in ompi_line: - continue +# --------------------------- - ompi_key, ompi_val = ompi_line.split("=") - ompi_data[ompi_key.strip()] = ompi_val.strip() +# The docs/Makefile.am will set the env var OMPI_TOP_SRCDIR, because +# we might be doing a VPATH build. +ompi_top_srcdir = '..' +if 'OMPI_TOP_SRCDIR' in os.environ: + ompi_top_srcdir = os.environ['OMPI_TOP_SRCDIR'] + +# Read an Open MPI-style VERSION file +def read_version_file(path): + if not os.path.exists(path): + print(f"ERROR: Unable to find file {path}") + exit(1) + + with open(path) as fp: + version_lines = fp.readlines() + + data = dict() + for line in version_lines: + if '#' in line: + parts = line.split("#") + line = parts[0] + line = line.strip() + + if '=' not in line: + continue + + key, val = line.split("=") + data[key.strip()] = val.strip() + + return data + +# Look for a version string via a regular expression of a filename in a +# given directory +def get_tarball_version(path, expr): + if not os.path.exists(path): + print(f"ERROR: Unable to find path {path}") + exit(1) + + for file in os.listdir(path): + m = re.match(expr, file) + if not m: + continue + return m.group(1) + + return "" + +# Read all the various versions from the source tree + +ompi_data = read_version_file(f"{ompi_top_srcdir}/VERSION") +pmix_data = read_version_file(f"{ompi_top_srcdir}/3rd-party/openpmix/VERSION") +prte_data = read_version_file(f"{ompi_top_srcdir}/3rd-party/prrte/VERSION") + +hwloc_embedded_version =
get_tarball_version(f"{ompi_top_srcdir}/3rd-party/", + r"hwloc-(.*).tar") +event_embedded_version = get_tarball_version(f"{ompi_top_srcdir}/3rd-party/", + r"libevent-(.*)-stable.tar") + +# --------------------------- + +# Assemble several different combinations of version strings ompi_series = f"v{ompi_data['major']}.{ompi_data['minor']}.x" ompi_ver = f"v{ompi_data['major']}.{ompi_data['minor']}.{ompi_data['release']}{ompi_data['greek']}" +pmix_embedded_version = f"v{pmix_data['major']}.{pmix_data['minor']}.{pmix_data['release']}{pmix_data['greek']}" +prte_embedded_version = f"v{prte_data['major']}.{prte_data['minor']}.{prte_data['release']}{prte_data['greek']}" +prte_embedded_series = f"v{prte_data['major']}.{prte_data['minor']}" + pmix_min_version = f"{ompi_data['pmix_min_version']}" prte_min_version = f"{ompi_data['prte_min_version']}" hwloc_min_version = f"{ompi_data['hwloc_min_version']}" @@ -86,7 +125,6 @@ # If we're building in an RTD environment for a tag or external (i.e., # PR), use the RTD version -- not what we just read from the VERSIONS # file. -import os key = 'READTHEDOCS' if key in os.environ and os.environ[key] == 'True': print("OMPI: found ReadTheDocs build environment") @@ -172,9 +210,6 @@ # -- Options for MAN output ------------------------------------------------- -import os -import re - # Dynamically find all the man pages and build the appropriate list of # tuples so that we don't have to manually maintain it. @@ -222,9 +257,14 @@ def _doit(topdir): .. |ompi_ver| replace:: {ompi_ver} .. |ompi_series| replace:: {ompi_series} .. |pmix_min_version| replace:: {pmix_min_version} +.. |pmix_embedded_version| replace:: {pmix_embedded_version} .. |prte_min_version| replace:: {prte_min_version} +.. |prte_embedded_version| replace:: {prte_embedded_version} +.. |prte_embedded_series| replace:: {prte_embedded_series} .. |hwloc_min_version| replace:: {hwloc_min_version} +.. |hwloc_embedded_version| replace:: {hwloc_embedded_version} .. 
|event_min_version| replace:: {event_min_version} +.. |event_embedded_version| replace:: {event_embedded_version} .. |automake_min_version| replace:: {automake_min_version} .. |autoconf_min_version| replace:: {autoconf_min_version} .. |libtool_min_version| replace:: {libtool_min_version} @@ -234,6 +274,10 @@ def _doit(topdir): .. |mpi_standard_minor_version| replace:: {mpi_standard_minor_version} .. |deprecated_favor| replace:: this routine is deprecated in favor of +.. |br| raw:: html + +
+ """ # The sphinx_rtd_theme does not properly handle wrapping long lines in diff --git a/docs/installing-open-mpi/required-support-libraries.rst b/docs/installing-open-mpi/required-support-libraries.rst index b411e1a02f5..f88a7987fc1 100644 --- a/docs/installing-open-mpi/required-support-libraries.rst +++ b/docs/installing-open-mpi/required-support-libraries.rst @@ -3,50 +3,28 @@ Required support libraries ========================== -Open MPI requires the following support libraries with the minimum listed versions: - -.. list-table:: - :header-rows: 1 - - * - Library - - Minimum version - - Notes - * - `Hardware Locality `_ - - |hwloc_min_version| - - This library is required; Open MPI will not build without it. - * - `Libevent `_ - - |event_min_version| - - This library is required; Open MPI will not build without it. - * - `PMIx `_ - - |pmix_min_version| - - This library is required; Open MPI will not build without it. - * - `PRRTE `_ - - |prte_min_version| - - This library is optional in some environments. PRRTE provides - Open MPI's full-featured ``mpirun`` / ``mpiexec`` MPI - application launchers (the two are identical; they are symbolic - links to the same executable). - - * If your environment uses another MPI application launcher - (e.g., Slurm users can use the ``srun`` launcher to "direct - launch" Open MPI applications), then the use of PRRTE is - optional. - * If your environment has no other MPI application launcher, then - you need to install PRRTE and build Open MPI with PRRTE - support. - * Open MPI can use the copy of PRRTE embedded in its source - code tree, or compile/link against an external PRRTE - installation. :ref:`See this section for details about how - to specify each method - `. 
- -Since these support libraries are fundamental to Open MPI's operation -and not universally available in all environments, they are directly + +While Open MPI can be built with support for a wide variety of +systems, a small set of support libraries are *required* in order to +build Open MPI in *any* environment. Several of these packages are +both fundamental to Open MPI's operation and not universally available +in all environments. As such, these "fundamental" packages are both +embedded in Open MPI's distribution tarballs and also directly incorporated into Open MPI's configure, build, and installation -process. More on this below. +process. + +:ref:`See below +` for a +description of how Open MPI chooses whether to use the embedded +versions of these packages or versions already installed on your +system. + +* `Hardware Locality `_ - .. note:: The versions listed in this table are the *minimum* versions needed. In general, the Open MPI community recommends using more recent versions of both the :ref:`required support libraries ` and any other optional support libraries. This is because more recent versions typically tend to include bug fixes, sometimes affecting Open MPI functionality. As a specific example, there is a known issue with `Hardware Locality `_ releases older than v2.8.0 on systems with Intel Ponte Vecchio accelerators. If you run Open MPI on such systems, you need to use Hwloc v2.8.0 or newer, or you will experience undefined behavior. - This effect is not unique to the Hardware Locality library; this is why the Open MPI community recommends using as recent as possible versions of all support libraries. + * This library is required; Open MPI will not build without it. + * **Minimum version required:** |hwloc_min_version| + * **Version embedded in Open MPI distribution:** + |hwloc_embedded_version| .. danger:: As of |ompi_ver|, Open MPI does not yet support the Hwloc v3.x series (which may not even be available at @@ -77,6 +55,81 @@ process. 
More on this below. uses Hwloc, it uses the *same* Hwloc with which Open MPI was compiled. +* `Libevent `_ + + * This library is required; Open MPI will not build without it. + * **Minimum version required:** |event_min_version| + * **Version embedded in Open MPI distribution:** + |event_embedded_version| + +* `PMIx `_ + + * This library is required; Open MPI will not build without it. + * **Minimum version required when building without PRRTE:** + |pmix_min_version| + * **Minimum version required when building with PRRTE:** `See the + PRRTE project documentation `_. + * **Version embedded in Open MPI distribution:** + |pmix_embedded_version| + +* `PRRTE `_ + + * This library is optional in some environments. See below. + * **Minimum version required:** |prte_min_version| + + .. note:: While building Open MPI with PRRTE |prte_min_version| + *works*, you will not get a fully-populated + ``mpirun(1)`` man page. The Open MPI community + recommends that you use PRRTE version 3.0.1 or higher. + + * **Version embedded in Open MPI distribution:** + |prte_embedded_version| + + PRRTE provides Open MPI's full-featured ``mpirun`` / ``mpiexec`` MPI + application launchers (the two commands are identical; they are + symbolic links to the same executable). + + .. warning:: If you are building the PRRTE that is embedded in the + Open MPI |ompi_ver| distribution: + + * If you are also building the PMIx that is embedded in + the Open MPI |ompi_ver| distribution, that + combination of packages is supported. + + * If you are building against an external PMIx + installation (i.e., a version of PMIx that is not + embedded in the Open MPI |ompi_ver| distribution), + you should check `the PRRTE project documentation + `_ to see what minimum + version of PMIx is required. + + * If your environment uses another MPI application launcher (e.g., + Slurm users can use the ``srun`` launcher to "direct launch" Open + MPI applications), then the use of PRRTE is optional. 
+ * If your environment has no other MPI application launcher, then + you need to install PRRTE and build Open MPI with PRRTE support. + * Open MPI can use the copy of PRRTE embedded in its source code + tree, or compile/link against an external PRRTE installation. + :ref:`See this section for details about how to specify each + method + `. + +.. note:: In general, the Open MPI community recommends using the most + recent versions of both the :ref:`required support libraries + ` and any other + optional support libraries. This is because more recent + versions typically tend to include bug fixes, sometimes + affecting Open MPI functionality. As a specific example, + there is a known issue with `Hardware Locality + `_ releases older + than v2.8.0 on systems with Intel Ponte Vecchio + accelerators. If you run Open MPI on such systems, you need + to use Hwloc v2.8.0 or newer, or you will experience + undefined behavior. This effect is not unique to the + Hardware Locality library; this is why the Open MPI + community recommends using as recent as possible versions of + *all* support libraries. + Library dependencies -------------------- @@ -145,6 +198,8 @@ example |mdash| only Libevent and Hwloc, that somewhat simplifies the final Open MPI configuration, and therefore avoids some potentially erroneous configurations. +.. _required-support-libraries-configure-discovery-label: + How ``configure`` finds the required libraries ---------------------------------------------- @@ -264,7 +319,7 @@ on Mac OS because: tarballs). #. In MacOS, it is common for `Homebrew `_ or `MacPorts `_ to install: - + * `Hardware Locality `_ * `Libevent `_ From 5adb240f6509e86407d642214251b5640c2344f8 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 2 Oct 2023 20:31:42 -0700 Subject: [PATCH 38/73] docs: Fix build case with --disable-prrte Fix a small issue in properly setting filename when building the empty schizo rst file. 
Signed-off-by: Brian Barrett --- docs/Makefile.am | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/Makefile.am b/docs/Makefile.am index dc9a085e99e..eacf2baf9b8 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -962,10 +962,9 @@ $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(OMPI_SCHIZO_OMPI_RS else $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(srcdir)/no-prrte-content.rst.txt - if test ! -d "$$dir"; then mkdir "$$dir"; fi + dir=`dirname $@`; if test ! -d "$$dir"; then mkdir "$$dir"; fi $(OMPI_V_SPHINX_COPYRST) \ - dir=`dirname $@`; \ - cp -pf $(srcdir)/no-prrte-content.rst.txt "$$dir" + cp -pf $(srcdir)/no-prrte-content.rst.txt "$@" endif $(ALL_MAN_BUILT): $(builddir)/prrte-rst-content From 3ef5dc9a0c901322a6aa190f63f8dbc6af75626d Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Tue, 3 Oct 2023 08:26:41 -0400 Subject: [PATCH 39/73] Patch the prrte.spec file. This is already fixed in prrte but for v5.0.x and main we'll want this fix applied for any rpm generation. This can safely be removed once main and v5.0.x advance. On v5.0.x this will be the next prrte release. For main, the next submodule update is fine to remove this. Signed-off-by: Austen Lauria --- autogen.pl | 4 ++++ config/prrte.spec.diff | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 config/prrte.spec.diff diff --git a/autogen.pl b/autogen.pl index 5af4704f2a1..3cb79025dbf 100755 --- a/autogen.pl +++ b/autogen.pl @@ -1643,6 +1643,10 @@ sub replace_config_sub_guess { if (! 
-f "3rd-party/prrte/configure.ac") { my_die("Could not find pmix files\n"); } + + verbose "Patching prrte.spec file\n"; + system("$patch_prog -N -p0 < ./config/prrte.spec.diff > /dev/null 2>&1"); + push(@subdirs, "3rd-party/prrte/"); $m4 .= "m4_define([package_prrte], [1])\n"; diff --git a/config/prrte.spec.diff b/config/prrte.spec.diff new file mode 100644 index 00000000000..4e8b1a86eb1 --- /dev/null +++ b/config/prrte.spec.diff @@ -0,0 +1,20 @@ +--- 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:43.842625000 -0400 ++++ 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:27.849686000 -0400 +@@ -612,7 +612,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + + %else + +@@ -656,7 +656,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + %{_pkgdatadir} + + %files devel -f devel.files From 776e8babd6868b968d1724161a6999861723b08a Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 19 Sep 2023 09:47:42 +0300 Subject: [PATCH 40/73] oshmem: Add symmetric remote key handling code At very high scale, having each rank storing each other rank's remote keys for each segment can lead to high memory consumption. We activate symmetric remote key option to generate remote keys that will be deduplicated and then used interchangeably. 
Signed-off-by: Thomas Vegas --- config/ompi_check_ucx.m4 | 6 +- oshmem/mca/spml/ucx/spml_ucx.c | 183 +++++++++++++++++++++- oshmem/mca/spml/ucx/spml_ucx.h | 41 +++-- oshmem/mca/spml/ucx/spml_ucx_component.c | 39 +++-- oshmem/mca/sshmem/ucx/sshmem_ucx_module.c | 3 +- 5 files changed, 241 insertions(+), 31 deletions(-) diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index fbea98cd7b3..01e39aaf968 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -108,7 +108,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, UCP_OP_ATTR_FLAG_MULTI_SEND, - UCS_MEMORY_TYPE_RDMA], + UCS_MEMORY_TYPE_RDMA, + UCP_MEM_MAP_SYMMETRIC_RKEY], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], @@ -124,7 +125,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ [#include ]) AC_CHECK_DECLS([ucp_tag_send_nbx, ucp_tag_send_sync_nbx, - ucp_tag_recv_nbx], + ucp_tag_recv_nbx, + ucp_rkey_compare], [], [], [#include ]) AC_CHECK_TYPES([ucp_request_param_t], diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 570b4d25a7a..5493d78e661 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -22,6 +22,7 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/common/ucx/common_ucx.h" #include "opal/util/opal_environ.h" +#include "opal/util/minmax.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/mca/pml/pml.h" @@ -126,6 +127,171 @@ static ucp_request_param_t mca_spml_ucx_request_param_b = { }; #endif +unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx) +{ +#if HAVE_DECL_UCP_MEM_MAP_SYMMETRIC_RKEY + if (spml_ucx->symmetric_rkey_max_count > 0) { + return UCP_MEM_MAP_SYMMETRIC_RKEY; + } +#endif + + return 0; +} + +void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store) +{ + store->array = NULL; + store->count = 0; + store->size = 0; +} + +void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t 
*store) +{ + int i; + + for (i = 0; i < store->count; i++) { + if (store->array[i].refcnt != 0) { + SPML_UCX_ERROR("rkey store destroy: %d/%d has refcnt %d > 0", + i, store->count, store->array[i].refcnt); + } + + ucp_rkey_destroy(store->array[i].rkey); + } + + free(store->array); +} + +/** + * Find position in sorted array for existing or future entry + * + * @param[in] store Store of the entries + * @param[in] worker Common worker for rkeys used + * @param[in] rkey Remote key to search for + * @param[out] index Index of entry + * + * @return + * OSHMEM_ERR_NOT_FOUND: index contains the position where future element + * should be inserted to keep array sorted + * OSHMEM_SUCCESS : index contains the position of the element + * Other error : index is not valid + */ +static int mca_spml_ucx_rkey_store_find(const mca_spml_ucx_rkey_store_t *store, + const ucp_worker_h worker, + const ucp_rkey_h rkey, + int *index) +{ +#if HAVE_DECL_UCP_RKEY_COMPARE + ucp_rkey_compare_params_t params; + int i, result, m, end; + ucs_status_t status; + + for (i = 0, end = store->count; i < end;) { + m = (i + end) / 2; + + params.field_mask = 0; + status = ucp_rkey_compare(worker, store->array[m].rkey, + rkey, &params, &result); + if (status != UCS_OK) { + return OSHMEM_ERROR; + } else if (result == 0) { + *index = m; + return OSHMEM_SUCCESS; + } else if (result > 0) { + end = m; + } else { + i = m + 1; + } + } + + *index = i; + return OSHMEM_ERR_NOT_FOUND; +#else + return OSHMEM_ERROR; +#endif +} + +static void mca_spml_ucx_rkey_store_insert(mca_spml_ucx_rkey_store_t *store, + int i, ucp_rkey_h rkey) +{ + int size; + mca_spml_ucx_rkey_t *tmp; + + if (store->count >= mca_spml_ucx.symmetric_rkey_max_count) { + return; + } + + if (store->count >= store->size) { + size = opal_min(opal_max(store->size, 8) * 2, + mca_spml_ucx.symmetric_rkey_max_count); + tmp = realloc(store->array, size * sizeof(*store->array)); + if (tmp == NULL) { + return; + } + + store->array = tmp; + store->size = size; + } + 
+ memmove(&store->array[i + 1], &store->array[i], + (store->count - i) * sizeof(*store->array)); + store->array[i].rkey = rkey; + store->array[i].refcnt = 1; + store->count++; + return; +} + +/* Takes ownership of input ucp remote key */ +static ucp_rkey_h mca_spml_ucx_rkey_store_get(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + int ret, i; + + if (mca_spml_ucx.symmetric_rkey_max_count == 0) { + return rkey; + } + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret == OSHMEM_SUCCESS) { + ucp_rkey_destroy(rkey); + store->array[i].refcnt++; + return store->array[i].rkey; + } + + if (ret == OSHMEM_ERR_NOT_FOUND) { + mca_spml_ucx_rkey_store_insert(store, i, rkey); + } + + return rkey; +} + +static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + mca_spml_ucx_rkey_t *entry; + int ret, i; + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret != OSHMEM_SUCCESS) { + goto out; + } + + entry = &store->array[i]; + assert(entry->rkey == rkey); + if (--entry->refcnt > 0) { + return; + } + + memmove(&store->array[i], &store->array[i + 1], + (store->count - (i + 1)) * sizeof(*store->array)); + store->count--; + +out: + ucp_rkey_destroy(rkey); +} + int mca_spml_ucx_enable(bool enable) { SPML_UCX_VERBOSE(50, "*** ucx ENABLED ****"); @@ -240,6 +406,7 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn { int rc; ucs_status_t err; + ucp_rkey_h rkey; rc = mca_spml_ucx_ctx_mkey_new(ucx_ctx, pe, segno, ucx_mkey); if (OSHMEM_SUCCESS != rc) { @@ -248,11 +415,18 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn } if (mkey->u.data) { - err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &((*ucx_mkey)->rkey)); + err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &rkey); if (UCS_OK != err) { SPML_UCX_ERROR("failed to unpack rkey: %s", 
ucs_status_string(err)); return OSHMEM_ERROR; } + + if (!oshmem_proc_on_local_node(pe)) { + rkey = mca_spml_ucx_rkey_store_get(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], rkey); + } + + (*ucx_mkey)->rkey = rkey; + rc = mca_spml_ucx_ctx_mkey_cache(ucx_ctx, mkey, segno, pe); if (OSHMEM_SUCCESS != rc) { SPML_UCX_ERROR("mca_spml_ucx_ctx_mkey_cache failed"); @@ -267,7 +441,7 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn ucp_peer_t *ucp_peer; int rc; ucp_peer = &(ucx_ctx->ucp_peers[pe]); - ucp_rkey_destroy(ucx_mkey->rkey); + mca_spml_ucx_rkey_store_put(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], ucx_mkey->rkey); ucx_mkey->rkey = NULL; rc = mca_spml_ucx_peer_mkey_cache_del(ucp_peer, segno); if(OSHMEM_SUCCESS != rc){ @@ -725,7 +899,8 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, UCP_MEM_MAP_PARAM_FIELD_FLAGS; mem_map_params.address = addr; mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(&mca_spml_ucx); status = ucp_mem_map(mca_spml_ucx.ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { @@ -917,6 +1092,8 @@ static int mca_spml_ucx_ctx_create_common(long options, mca_spml_ucx_ctx_t **ucx } } + mca_spml_ucx_rkey_store_init(&ucx_ctx->rkey_store); + *ucx_ctx_p = ucx_ctx; return OSHMEM_SUCCESS; diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index a93ff3756a3..2fec131ad2d 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ b/oshmem/mca/spml/ucx/spml_ucx.h @@ -76,18 +76,31 @@ struct ucp_peer { size_t mkeys_cnt; }; typedef struct ucp_peer ucp_peer_t; - + +/* An rkey_store entry */ +typedef struct mca_spml_ucx_rkey { + ucp_rkey_h rkey; + int refcnt; +} mca_spml_ucx_rkey_t; + +typedef struct mca_spml_ucx_rkey_store { + mca_spml_ucx_rkey_t *array; + int size; + int count; +} mca_spml_ucx_rkey_store_t; + struct mca_spml_ucx_ctx { - ucp_worker_h *ucp_worker; - ucp_peer_t *ucp_peers; - long options; - 
opal_bitmap_t put_op_bitmap; - unsigned long nb_progress_cnt; - unsigned int ucp_workers; - int *put_proc_indexes; - unsigned put_proc_count; - bool synchronized_quiet; - int strong_sync; + ucp_worker_h *ucp_worker; + ucp_peer_t *ucp_peers; + long options; + opal_bitmap_t put_op_bitmap; + unsigned long nb_progress_cnt; + unsigned int ucp_workers; + int *put_proc_indexes; + unsigned put_proc_count; + bool synchronized_quiet; + int strong_sync; + mca_spml_ucx_rkey_store_t rkey_store; }; typedef struct mca_spml_ucx_ctx mca_spml_ucx_ctx_t; @@ -128,6 +141,7 @@ struct mca_spml_ucx { unsigned long nb_ucp_worker_progress; unsigned int ucp_workers; unsigned int ucp_worker_cnt; + int symmetric_rkey_max_count; }; typedef struct mca_spml_ucx mca_spml_ucx_t; @@ -280,6 +294,11 @@ extern int mca_spml_ucx_team_fcollect(shmem_team_t team, void extern int mca_spml_ucx_team_reduce(shmem_team_t team, void *dest, const void *source, size_t nreduce, int operation, int datatype); +extern unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx); + +extern void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store); +extern void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t *store); static inline int mca_spml_ucx_peer_mkey_get(ucp_peer_t *ucp_peer, int index, spml_ucx_cached_mkey_t **out_rmkey) diff --git a/oshmem/mca/spml/ucx/spml_ucx_component.c b/oshmem/mca/spml/ucx/spml_ucx_component.c index 1ab00ac1786..e44a800a8be 100644 --- a/oshmem/mca/spml/ucx/spml_ucx_component.c +++ b/oshmem/mca/spml/ucx/spml_ucx_component.c @@ -153,6 +153,10 @@ static int mca_spml_ucx_component_register(void) "Enable asynchronous progress thread", &mca_spml_ucx.async_progress); + mca_spml_ucx_param_register_int("symmetric_rkey_max_count", 0, + "Size of the symmetric key store. 
Non-zero to enable, typical use 5000", + &mca_spml_ucx.symmetric_rkey_max_count); + mca_spml_ucx_param_register_int("async_tick_usec", 3000, "Asynchronous progress tick granularity (in usec)", &mca_spml_ucx.async_tick); @@ -332,6 +336,8 @@ static int spml_ucx_init(void) mca_spml_ucx_ctx_default.ucp_workers++; } + mca_spml_ucx_rkey_store_init(&mca_spml_ucx_ctx_default.rkey_store); + wrk_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; err = ucp_worker_query(mca_spml_ucx_ctx_default.ucp_worker[0], &wrk_attr); @@ -436,10 +442,25 @@ static void _ctx_cleanup(mca_spml_ucx_ctx_t *ctx) free(ctx->ucp_peers); } +static void mca_spml_ucx_ctx_fini(mca_spml_ucx_ctx_t *ctx) +{ + unsigned int i; + + mca_spml_ucx_rkey_store_cleanup(&ctx->rkey_store); + for (i = 0; i < ctx->ucp_workers; i++) { + ucp_worker_destroy(ctx->ucp_worker[i]); + } + free(ctx->ucp_worker); + if (ctx != &mca_spml_ucx_ctx_default) { + free(ctx); + } +} + static int mca_spml_ucx_component_fini(void) { int fenced = 0, i; int ret = OSHMEM_SUCCESS; + mca_spml_ucx_ctx_t *ctx; opal_progress_unregister(spml_ucx_default_progress); if (mca_spml_ucx.active_array.ctxs_count) { @@ -492,36 +513,26 @@ static int mca_spml_ucx_component_fini(void) } } - /* delete all workers */ for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.active_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.active_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.active_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.active_array.ctxs[i]); } for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.idle_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.idle_array.ctxs[i]); } if (mca_spml_ucx_ctx_default.ucp_worker) { - for (i = 0; i < (signed int)mca_spml_ucx.ucp_workers; i++) { - ucp_worker_destroy(mca_spml_ucx_ctx_default.ucp_worker[i]); - } - 
free(mca_spml_ucx_ctx_default.ucp_worker); + mca_spml_ucx_ctx_fini(&mca_spml_ucx_ctx_default); } if (mca_spml_ucx.aux_ctx != NULL) { - ucp_worker_destroy(mca_spml_ucx.aux_ctx->ucp_worker[0]); - free(mca_spml_ucx.aux_ctx->ucp_worker); + mca_spml_ucx_ctx_fini(mca_spml_ucx.aux_ctx); } mca_spml_ucx.enabled = false; /* not anymore */ free(mca_spml_ucx.active_array.ctxs); free(mca_spml_ucx.idle_array.ctxs); - free(mca_spml_ucx.aux_ctx); SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex); pthread_mutex_destroy(&mca_spml_ucx.ctx_create_mutex); diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c index 262bef5ffe6..688bfce6f19 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c @@ -118,7 +118,8 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, mem_map_params.address = address; mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(spml); mem_map_params.memory_type = mem_type; status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h); From d1d9ad631aec398534032563e7ca6f2615f6162f Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 11 Oct 2023 16:43:43 -0400 Subject: [PATCH 41/73] Add support for MPI_ERR_VALUE_TOO_LARGE Signed-off-by: George Bosilca --- ompi/errhandler/errcode.c | 3 +++ ompi/include/mpi.h.in | 1 + ompi/include/mpif-values.pl | 1 + 3 files changed, 5 insertions(+) diff --git a/ompi/errhandler/errcode.c b/ompi/errhandler/errcode.c index c774a2748a2..63b55be9fea 100644 --- a/ompi/errhandler/errcode.c +++ b/ompi/errhandler/errcode.c @@ -126,6 +126,7 @@ static ompi_mpi_errcode_t ompi_err_proc_fail_pending; static ompi_mpi_errcode_t ompi_err_revoked; #endif static ompi_mpi_errcode_t ompi_err_session; +static ompi_mpi_errcode_t ompi_err_value_too_large; static void ompi_mpi_errcode_construct(ompi_mpi_errcode_t* errcode); static void 
ompi_mpi_errcode_destruct(ompi_mpi_errcode_t* errcode); @@ -243,6 +244,7 @@ int ompi_mpi_errcode_init (void) CONSTRUCT_ERRCODE( ompi_err_revoked, MPI_ERR_REVOKED, "MPI_ERR_REVOKED: Communication Object Revoked" ); #endif CONSTRUCT_ERRCODE( ompi_err_session, MPI_ERR_SESSION, "MPI_ERR_SESSION: Invalid session handle" ); + CONSTRUCT_ERRCODE( ompi_err_value_too_large, MPI_ERR_VALUE_TOO_LARGE, "MPI_ERR_VALUE_TOO_LARGE: Value is too large to store" ); /* Per MPI-3 p353:27-32, MPI_LASTUSEDCODE must be >= MPI_ERR_LASTCODE. So just start it as == MPI_ERR_LASTCODE. */ @@ -359,6 +361,7 @@ int ompi_mpi_errcode_finalize (void) OBJ_DESTRUCT(&ompi_err_revoked); #endif OBJ_DESTRUCT(&ompi_err_session); + OBJ_DESTRUCT(&ompi_err_value_too_large); OBJ_DESTRUCT(&ompi_mpi_errcodes); ompi_mpi_errcode_lastpredefined = 0; opal_mutex_unlock(&errcode_lock); diff --git a/ompi/include/mpi.h.in b/ompi/include/mpi.h.in index 0b370840ad5..0c26fa08d8f 100644 --- a/ompi/include/mpi.h.in +++ b/ompi/include/mpi.h.in @@ -752,6 +752,7 @@ enum { #define MPI_ERR_PROC_FAILED_PENDING 76 #define MPI_ERR_REVOKED 77 #define MPI_ERR_SESSION 78 +#define MPI_ERR_VALUE_TOO_LARGE 79 /* Per MPI-3 p349 47, MPI_ERR_LASTCODE must be >= the last predefined MPI_ERR_ code. 
Set the last code to allow some room for adding diff --git a/ompi/include/mpif-values.pl b/ompi/include/mpif-values.pl index 13d599ec982..21f69530cde 100755 --- a/ompi/include/mpif-values.pl +++ b/ompi/include/mpif-values.pl @@ -381,6 +381,7 @@ sub read_value_from_file { $constants->{MPI_ERR_RMA_SHARED} = 71; $constants->{MPI_T_ERR_INVALID} = 72; $constants->{MPI_ERR_SESSION} = 78; +$constants->{MPI_ERR_VALUE_TOO_LARGE} = 79; $constants->{MPI_ERR_LASTCODE} = 92; $constants->{MPI_IDENT} = 0; From e1f64e1b1a4b50d182022df3cdb3070d6741a014 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Wed, 11 Oct 2023 12:34:33 +0300 Subject: [PATCH 42/73] OPAL/MCA/COMMON/UCX: added a NULL check for tls and devices in opal_common_ucx_support_level Signed-off-by: Roie Danino --- opal/mca/common/ucx/common_ucx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 7dc916f42db..208b93c3ce4 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -245,6 +245,10 @@ OPAL_DECLSPEC opal_common_ucx_support_level_t opal_common_ucx_support_level(ucp_ int ret; #endif + if ((*opal_common_ucx.tls == NULL) || (*opal_common_ucx.devices == NULL)) { + opal_common_ucx_mca_var_register(NULL); + } + is_any_tl = !strcmp(*opal_common_ucx.tls, "any"); is_any_device = !strcmp(*opal_common_ucx.devices, "any"); From e1134affa5587e028907b0359f85a8837f60e809 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Thu, 12 Oct 2023 11:37:26 -0600 Subject: [PATCH 43/73] pcomm:fix fortran interface for precv/psend The Fortran interfaces were using INTEGER for count argument. The MPI-4 standard states that this argument should be INTEGER(KIND=MPI_COUNT_KIND). 
related to #11982 Signed-off-by: Howard Pritchard --- ompi/mpi/fortran/mpif-h/precv_init_f.c | 10 ++++++---- ompi/mpi/fortran/mpif-h/prototypes_mpi.h | 6 +++--- ompi/mpi/fortran/mpif-h/psend_init_f.c | 10 ++++++---- .../use-mpi-f08/bindings/mpi-f-interfaces-bind.h | 8 ++++++-- .../fortran/use-mpi-f08/mod/mpi-f08-interfaces.h.in | 12 +++++++----- ompi/mpi/fortran/use-mpi-f08/precv_init_f08.F90 | 7 +++++-- ompi/mpi/fortran/use-mpi-f08/psend_init_f08.F90 | 7 +++++-- .../mpi-ignore-tkr-interfaces.h.in | 8 +++++--- 8 files changed, 43 insertions(+), 25 deletions(-) diff --git a/ompi/mpi/fortran/mpif-h/precv_init_f.c b/ompi/mpi/fortran/mpif-h/precv_init_f.c index b411e06a460..2fe64d55c55 100644 --- a/ompi/mpi/fortran/mpif-h/precv_init_f.c +++ b/ompi/mpi/fortran/mpif-h/precv_init_f.c @@ -14,6 +14,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Sandia National Laboratories. All rights reserved. * Copyright (c) 2021 Bull S.A.S. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -41,7 +43,7 @@ OMPI_GENERATE_F77_BINDINGS (PMPI_PRECV_INIT, pmpi_precv_init_, pmpi_precv_init__, pompi_precv_init_f, - (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), + (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), (buf, partitions, count, datatype, dest, tag, comm, info, request, ierr) ) #endif #endif @@ -61,7 +63,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_PRECV_INIT, mpi_precv_init_, mpi_precv_init__, ompi_precv_init_f, - (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), + (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), (buf, partitions, count, datatype, dest, tag, comm, info, request, ierr) ) #else #define ompi_precv_init_f pompi_precv_init_f @@ -69,7 +71,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_PRECV_INIT, #endif -void ompi_precv_init_f(char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr) +void ompi_precv_init_f(char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr) { int c_ierr; MPI_Info c_info; @@ -82,7 +84,7 @@ void ompi_precv_init_f(char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fin c_ierr = PMPI_Precv_init(OMPI_F2C_BOTTOM(buf), OMPI_FINT_2_INT(*partitions), - OMPI_FINT_2_INT(*count), + *count, c_type, OMPI_FINT_2_INT(*dest), OMPI_FINT_2_INT(*tag), c_comm, c_info, &c_req); diff --git 
a/ompi/mpi/fortran/mpif-h/prototypes_mpi.h b/ompi/mpi/fortran/mpif-h/prototypes_mpi.h index 5d089fc57ef..b1eb065c1a7 100644 --- a/ompi/mpi/fortran/mpif-h/prototypes_mpi.h +++ b/ompi/mpi/fortran/mpif-h/prototypes_mpi.h @@ -16,7 +16,7 @@ * reserved. * Copyright (c) 2016-2023 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2019-2022 Triad National Security, LLC. All rights + * Copyright (c) 2019-2023 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Bull S.A.S. All rights reserved. * $COPYRIGHT$ @@ -355,8 +355,8 @@ PN2(void, MPI_Pcontrol, mpi_pcontrol, MPI_PCONTROL, (MPI_Fint *level)); PN2(void, MPI_Pready, mpi_pready, MPI_PREADY, (MPI_Fint *partition, MPI_Fint *request, MPI_Fint *ierr)); PN2(void, MPI_Pready_list, mpi_pready_list, MPI_PREADY_LIST, (MPI_Fint *length, MPI_Fint *partition, MPI_Fint *request, MPI_Fint *ierr)); PN2(void, MPI_Pready_range, mpi_pready_range, MPI_PREADY_RANGE, (MPI_Fint *partition_low, MPI_Fint *partition_high, MPI_Fint *request, MPI_Fint *ierr)); -PN2(void, MPI_Precv_init, mpi_precv_init, MPI_PRECV_INIT, (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *src, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr)); -PN2(void, MPI_Psend_init, mpi_psend_init, MPI_PSEND_INIT, (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr)); +PN2(void, MPI_Precv_init, mpi_precv_init, MPI_PRECV_INIT, (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *src, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr)); +PN2(void, MPI_Psend_init, mpi_psend_init, MPI_PSEND_INIT, (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr)); 
PN2(void, MPI_Probe, mpi_probe, MPI_PROBE, (MPI_Fint *source, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *status, MPI_Fint *ierr)); PN2(void, MPI_Publish_name, mpi_publish_name, MPI_PUBLISH_NAME, (char *service_name, MPI_Fint *info, char *port_name, MPI_Fint *ierr, int service_name_len, int port_name_len)); PN2(void, MPI_Put, mpi_put, MPI_PUT, (char *origin_addr, MPI_Fint *origin_count, MPI_Fint *origin_datatype, MPI_Fint *target_rank, MPI_Aint *target_disp, MPI_Fint *target_count, MPI_Fint *target_datatype, MPI_Fint *win, MPI_Fint *ierr)); diff --git a/ompi/mpi/fortran/mpif-h/psend_init_f.c b/ompi/mpi/fortran/mpif-h/psend_init_f.c index 655ce8d7945..5325b9eb7d0 100644 --- a/ompi/mpi/fortran/mpif-h/psend_init_f.c +++ b/ompi/mpi/fortran/mpif-h/psend_init_f.c @@ -14,6 +14,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Sandia National Laboratories. All rights reserved. * Copyright (c) 2021 Bull S.A.S. All rights reserved. + * Copyright (c) 2023 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -41,7 +43,7 @@ OMPI_GENERATE_F77_BINDINGS (PMPI_PSEND_INIT, pmpi_psend_init_, pmpi_psend_init__, pompi_psend_init_f, - (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), + (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), (buf, partitions, count, datatype, dest, tag, comm, info, request, ierr) ) #endif #endif @@ -61,7 +63,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_PSEND_INIT, mpi_psend_init_, mpi_psend_init__, ompi_psend_init_f, - (char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), + (char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr), (buf, partitions, count, datatype, dest, tag, comm, info, request, ierr) ) #else #define ompi_psend_init_f pompi_psend_init_f @@ -69,7 +71,7 @@ OMPI_GENERATE_F77_BINDINGS (MPI_PSEND_INIT, #endif -void ompi_psend_init_f(char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr) +void ompi_psend_init_f(char *buf, MPI_Fint *partitions, MPI_Count *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *tag, MPI_Fint *comm, MPI_Fint *info, MPI_Fint *request, MPI_Fint *ierr) { int c_ierr; MPI_Info c_info; @@ -82,7 +84,7 @@ void ompi_psend_init_f(char *buf, MPI_Fint *partitions, MPI_Fint *count, MPI_Fin c_ierr = PMPI_Psend_init(OMPI_F2C_BOTTOM(buf), OMPI_FINT_2_INT(*partitions), - OMPI_FINT_2_INT(*count), + *count, c_type, OMPI_FINT_2_INT(*dest), OMPI_FINT_2_INT(*tag), c_comm, c_info, &c_req); diff --git 
a/ompi/mpi/fortran/use-mpi-f08/bindings/mpi-f-interfaces-bind.h b/ompi/mpi/fortran/use-mpi-f08/bindings/mpi-f-interfaces-bind.h index 54efadf95c1..3324393b976 100644 --- a/ompi/mpi/fortran/use-mpi-f08/bindings/mpi-f-interfaces-bind.h +++ b/ompi/mpi/fortran/use-mpi-f08/bindings/mpi-f-interfaces-bind.h @@ -310,9 +310,11 @@ end subroutine ompi_issend_f subroutine ompi_psend_init_f(buf,partitions,count,datatype,dest,tag,comm,info,request,ierror) & BIND(C, name="ompi_psend_init_f") + use :: mpi_f08_types, only : MPI_COUNT_KIND implicit none OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions, count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(KIND=MPI_COUNT_KIND), INTENT(IN):: count INTEGER, INTENT(IN) :: datatype INTEGER, INTENT(IN) :: comm INTEGER, INTENT(IN) :: info @@ -322,9 +324,11 @@ end subroutine ompi_psend_init_f subroutine ompi_precv_init_f(buf,partitions,count,datatype,dest,tag,comm,info,request,ierror) & BIND(C, name="ompi_precv_init_f") + use :: mpi_f08_types, only : MPI_COUNT_KIND implicit none OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions, count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(KIND=MPI_COUNT_KIND), INTENT(IN):: count INTEGER, INTENT(IN) :: datatype INTEGER, INTENT(IN) :: comm INTEGER, INTENT(IN) :: info diff --git a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces.h.in b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces.h.in index c08b211c95c..c66f92d1332 100644 --- a/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces.h.in +++ b/ompi/mpi/fortran/use-mpi-f08/mod/mpi-f08-interfaces.h.in @@ -10,7 +10,7 @@ ! Copyright (c) 2015-2020 Research Organization for Information Science ! and Technology (RIST). All rights reserved. ! Copyright (c) 2017-2018 FUJITSU LIMITED. All rights reserved. -! Copyright (c) 2021-2022 Triad National Security, LLC. All rights +! Copyright (c) 2021-2023 Triad National Security, LLC. All rights ! 
reserved. ! $COPYRIGHT$ ! @@ -201,7 +201,7 @@ end interface MPI_Issend interface MPI_Precv_init subroutine MPI_Precv_init_f08(buf,partitions,count,datatype,dest,tag,comm,request,ierror) - use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Request + use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Request, MPI_COUNT_KIND implicit none !DEC$ ATTRIBUTES NO_ARG_CHECK :: buf !GCC$ ATTRIBUTES NO_ARG_CHECK :: buf @@ -209,7 +209,8 @@ subroutine MPI_Precv_init_f08(buf,partitions,count,datatype,dest,tag,comm,reques !DIR$ IGNORE_TKR buf !IBM* IGNORE_TKR buf OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions, count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(MPI_COUNT_KIND), INTENT(IN) :: count TYPE(MPI_Datatype), INTENT(IN) :: datatype TYPE(MPI_Comm), INTENT(IN) :: comm TYPE(MPI_Request), INTENT(OUT) :: request @@ -219,7 +220,7 @@ end interface MPI_Precv_init interface MPI_Psend_init subroutine MPI_Psend_init_f08(buf,partitions,count,datatype,dest,tag,comm,request,ierror) - use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Request + use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Request, MPI_COUNT_KIND implicit none !DEC$ ATTRIBUTES NO_ARG_CHECK :: buf !GCC$ ATTRIBUTES NO_ARG_CHECK :: buf @@ -227,7 +228,8 @@ subroutine MPI_Psend_init_f08(buf,partitions,count,datatype,dest,tag,comm,reques !DIR$ IGNORE_TKR buf !IBM* IGNORE_TKR buf OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions, count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(MPI_COUNT_KIND), INTENT(IN) :: count TYPE(MPI_Datatype), INTENT(IN) :: datatype TYPE(MPI_Comm), INTENT(IN) :: comm TYPE(MPI_Request), INTENT(OUT) :: request diff --git a/ompi/mpi/fortran/use-mpi-f08/precv_init_f08.F90 b/ompi/mpi/fortran/use-mpi-f08/precv_init_f08.F90 index 9e5c8e417f6..94fae6fb892 100644 --- a/ompi/mpi/fortran/use-mpi-f08/precv_init_f08.F90 +++ 
b/ompi/mpi/fortran/use-mpi-f08/precv_init_f08.F90 @@ -7,6 +7,8 @@ ! and Technology (RIST). All rights reserved. ! Copyright (c) 2020 Sandia National Laboratories. All rights reserved. ! Copyright (c) 2021 Bull S.A.S. All rights reserved. +! Copyright (c) 2023 Triad National Security, LLC. All rights +! reserved. ! $COPYRIGHT$ #include "ompi/mpi/fortran/configure-fortran-output.h" @@ -14,11 +16,12 @@ #include "mpi-f08-rename.h" subroutine MPI_Precv_init_f08(buf,partitions,count,datatype,dest,tag,comm,info,request,ierror) - use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Info, MPI_Request + use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Info, MPI_Request, MPI_COUNT_KIND use :: ompi_mpifh_bindings, only : ompi_precv_init_f implicit none OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions,count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(KIND=MPI_COUNT_KIND), INTENT(IN) :: count TYPE(MPI_Datatype), INTENT(IN) :: datatype TYPE(MPI_Comm), INTENT(IN) :: comm TYPE(MPI_Info), INTENT(IN) :: info diff --git a/ompi/mpi/fortran/use-mpi-f08/psend_init_f08.F90 b/ompi/mpi/fortran/use-mpi-f08/psend_init_f08.F90 index 3c68b2c1cae..2d117074cc6 100644 --- a/ompi/mpi/fortran/use-mpi-f08/psend_init_f08.F90 +++ b/ompi/mpi/fortran/use-mpi-f08/psend_init_f08.F90 @@ -7,16 +7,19 @@ ! and Technology (RIST). All rights reserved. ! Copyright (c) 2020 Sandia National Laboratories. All rights reserved. ! Copyright (c) 2021 Bull S.A.S. All rights reserved. +! Copyright (c) 2023 Triad National Security, LLC. All rights +! reserved. ! 
$COPYRIGHT$ #include "ompi/mpi/fortran/configure-fortran-output.h" subroutine MPI_Psend_init_f08(buf,partitions,count,datatype,dest,tag,comm,info,request,ierror) - use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Info, MPI_Request + use :: mpi_f08_types, only : MPI_Datatype, MPI_Comm, MPI_Info, MPI_Request, MPI_COUNT_KIND use :: ompi_mpifh_bindings, only : ompi_psend_init_f implicit none OMPI_FORTRAN_IGNORE_TKR_TYPE, INTENT(IN) :: buf - INTEGER, INTENT(IN) :: partitions,count, dest, tag + INTEGER, INTENT(IN) :: partitions, dest, tag + INTEGER(KIND=MPI_COUNT_KIND), INTENT(IN) :: count TYPE(MPI_Datatype), INTENT(IN) :: datatype TYPE(MPI_Comm), INTENT(IN) :: comm TYPE(MPI_Info), INTENT(IN) :: info diff --git a/ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces.h.in b/ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces.h.in index 976fad2d578..6253d378bcc 100644 --- a/ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces.h.in +++ b/ompi/mpi/fortran/use-mpi-ignore-tkr/mpi-ignore-tkr-interfaces.h.in @@ -11,7 +11,7 @@ ! reserved. ! Copyright (c) 2015-2023 Research Organization for Information Science ! and Technology (RIST). All rights reserved. -! Copyright (c) 2019-2022 Triad National Security, LLC. All rights +! Copyright (c) 2019-2023 Triad National Security, LLC. All rights ! reserved. ! Copyright (c) 2021 Bull S.A.S. All rights reserved. ! Copyright (c) 2021 IBM Corporation. All rights reserved. 
@@ -2619,10 +2619,11 @@ interface subroutine MPI_Psend_init(buf, partitions, count, datatype, dest, tag, & comm, info, request, ierror) + include 'mpif-config.h' @OMPI_FORTRAN_IGNORE_TKR_PREDECL@ buf @OMPI_FORTRAN_IGNORE_TKR_TYPE@, intent(in) :: buf integer, intent(in) :: partitions - integer, intent(in) :: count + integer(KIND=MPI_COUNT_KIND), intent(in) :: count integer, intent(in) :: datatype integer, intent(in) :: dest integer, intent(in) :: tag @@ -2639,10 +2640,11 @@ interface subroutine MPI_Precv_init(buf, partitions, count, datatype, dest, tag, & comm, info, request, ierror) + include 'mpif-config.h' @OMPI_FORTRAN_IGNORE_TKR_PREDECL@ buf @OMPI_FORTRAN_IGNORE_TKR_TYPE@, intent(in) :: buf integer, intent(in) :: partitions - integer, intent(in) :: count + integer(KIND=MPI_COUNT_KIND), intent(in) :: count integer, intent(in) :: datatype integer, intent(in) :: dest integer, intent(in) :: tag From c1e49ca678aa9fe0279c8b397420fc5260c64459 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Sat, 14 Oct 2023 00:20:58 +0000 Subject: [PATCH 44/73] docs/news: update contributors Noticed a missing name for 5.0 release Signed-off-by: Wenduo Wang --- docs/news/news-v5.0.x.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 54445d8b743..55c32e185a3 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -249,6 +249,7 @@ Open MPI version 5.0.0rc13 - Sophia Fang - Rick Gleitz - Colton Kammes + - Quincey Koziol - Robert Langfield - Nick Papior - Luz Paz From 6bda9c1a2c70dabe0b57ee8be5c6213e69263b80 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Sun, 15 Oct 2023 10:45:16 +0300 Subject: [PATCH 45/73] OPAL/MCA/COMMON/UCX: using mca_base_var_register to avoid segfault Signed-off-by: Roie Danino --- opal/mca/common/ucx/common_ucx.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index 
208b93c3ce4..a2bddf32b7d 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -90,30 +90,26 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t * *opal_common_ucx.tls = strdup(default_tls); } - tls_index = mca_base_component_var_register( - component, "tls", + tls_index = mca_base_var_register( + "opal", "opal_common", "ucx", "tls", "List of UCX transports which should be supported on the system, to enable " "selecting the UCX component. Special values: any (any available). " "A '^' prefix negates the list. " "For example, in order to exclude on shared memory and TCP transports, " "please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, - opal_common_ucx.tls); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DWG, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, opal_common_ucx.tls); if (NULL == opal_common_ucx.devices) { - opal_common_ucx.devices = (char **) malloc(sizeof(char *)); + opal_common_ucx.devices = (char**) malloc(sizeof(char*)); *opal_common_ucx.devices = strdup(default_devices); } - devices_index = mca_base_component_var_register( - component, "devices", + devices_index = mca_base_var_register( + "opal", "opal_common", "ucx", "devices", "List of device driver pattern names, which, if supported by UCX, will " "bump its priority above ob1. 
Special values: any (any available)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, - MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, - opal_common_ucx.devices); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE | MCA_BASE_VAR_FLAG_DWG, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, opal_common_ucx.devices); if (component) { mca_base_var_register_synonym(verbose_index, component->mca_project_name, From cb8c286e04900f126185116d3935b42fe5aa79cb Mon Sep 17 00:00:00 2001 From: Quincey Koziol Date: Mon, 16 Oct 2023 13:47:27 -0500 Subject: [PATCH 46/73] Add William to the list of people who have worked on the docs Signed-off-by: Quincey Koziol --- docs/news/news-v5.0.x.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 55c32e185a3..eac2fa71d8f 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -260,6 +260,7 @@ Open MPI version 5.0.0rc13 - Fangcong Yin - Seth Zegelstein - Yixin Zhang + - William Zhang - Build updates and fixes: From 4817e1e1b539866a10f6ac8d44622b5895bd9d7a Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Thu, 21 Sep 2023 02:04:15 +0000 Subject: [PATCH 47/73] communicator: introduce OMPI_COMM_DISJOINT flag This patch introduces a new communicator flag to indicate if no processes share the same node. This flag is intended for optimization of hierarchy-aware collective operations to select the more efficient transport/algorithm. In this change the flag is set during communicator activation via an allreduce op, which incurs a one-time overhead for new communicators. 
Signed-off-by: Wenduo Wang --- contrib/check_unnecessary_headers.sh | 6 ++-- ompi/communicator/comm.c | 44 +++++++++++++++++++++++++++- ompi/communicator/comm_cid.c | 10 +++++++ ompi/communicator/communicator.h | 17 +++++++++++ 4 files changed, 73 insertions(+), 4 deletions(-) diff --git a/contrib/check_unnecessary_headers.sh b/contrib/check_unnecessary_headers.sh index 15edb513c45..bac0e427951 100644 --- a/contrib/check_unnecessary_headers.sh +++ b/contrib/check_unnecessary_headers.sh @@ -11,7 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. -# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# Copyright (c) Amazon.com, Inc. or its affiliates. # All Rights reserved. # # @@ -181,8 +181,8 @@ SEARCH_HEADER[0]="ompi/attribute/attribute.h ATTR_HASH_SIZE OMPI_KEYVAL_PREDEFIN SEARCH_HEADER[1]="ompi/class/ompi_free_list.h ompi_free_list_item_init_fn_t ompi_free_list_t ompi_free_list_item_t ompi_free_list_init_ex ompi_free_list_init ompi_free_list_init_ex_new ompi_free_list_init_new ompi_free_list_grow ompi_free_list_resize ompi_free_list_pos_t OMPI_FREE_LIST_POS_BEGINNING ompi_free_list_parse OMPI_FREE_LIST_GET OMPI_FREE_LIST_WAIT __ompi_free_list_wait OMPI_FREE_LIST_RETURN" SEARCH_HEADER[2]="ompi/class/ompi_rb_tree.h ompi_rb_tree_nodecolor_t ompi_rb_tree_node_t ompi_rb_tree_comp_fn_t ompi_rb_tree_t ompi_rb_tree_condition_fn_t ompi_rb_tree_action_fn_t ompi_rb_tree_construct ompi_rb_tree_destruct ompi_rb_tree_init ompi_rb_tree_insert ompi_rb_tree_find_with ompi_rb_tree_find ompi_rb_tree_delete ompi_rb_tree_destroy ompi_rb_tree_traverse ompi_rb_tree_size" SEARCH_HEADER[3]="ompi/class/ompi_seq_tracker.h ompi_seq_tracker_range_t ompi_seq_tracker_t ompi_seq_tracker_check_duplicate ompi_seq_tracker_insert ompi_seq_tracker_copy" -SEARCH_HEADER[4]="ompi/communicator/communicator.h MPI_Comm MPI_COMM_WORLD ompi_communicator_t OMPI_COMM_INTER OMPI_COMM_CART 
OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_world ompi_mpi_comm_self ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dyncomm ompi_mpi_cxx_comm_errhandler_invoke" -SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dync CONVERTOR_DATATYPE_MASK CONVERTOR_SEND_CONVERSION 
CONVERTOR_RECV CONVERTOR_SEND CONVERTOR_HOMOGENEOUS CONVERTOR_NO_OP CONVERTOR_WITH_CHECKSUM CONVERTOR_TYPE_MASK CONVERTOR_STATE_START CONVERTOR_STATE_COMPLETE CONVERTOR_STATE_ALLOC CONVERTOR_COMPLETED ompi_convertor_t ompi_convertor_master_t dt_stack_t DT_STATIC_STACK_SIZE ompi_convertor_get_checksum ompi_convertor_pack ompi_convertor_unpack ompi_convertor_create ompi_convertor_cleanup ompi_convertor_need_buffers ompi_convertor_get_packed_size ompi_convertor_get_unpacked_size ompi_convertor_get_current_pointer ompi_convertor_prepare_for_send ompi_convertor_copy_and_prepare_for_send ompi_convertor_prepare_for_recv ompi_convertor_copy_and_prepare_for_recv ompi_convertor_raw ompi_convertor_set_position_nocheck ompi_convertor_set_position ompi_convertor_personalize ompi_convertor_clone ompi_convertor_clone_with_position ompi_convertor_dump ompi_ddt_dump_stack ompi_convertor_generic_simple_position MPI_Datatype" +SEARCH_HEADER[4]="ompi/communicator/communicator.h MPI_Comm MPI_COMM_WORLD ompi_communicator_t OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_DISJOINT_SET OMPI_COMM_DISJOINT OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_world ompi_mpi_comm_self ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init 
ompi_comm_reg_finalize ompi_comm_num_dyncomm ompi_mpi_cxx_comm_errhandler_invoke" +SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_DISJOINT_SET OMPI_COMM_DISJOINT OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dync CONVERTOR_DATATYPE_MASK CONVERTOR_SEND_CONVERSION CONVERTOR_RECV CONVERTOR_SEND CONVERTOR_HOMOGENEOUS CONVERTOR_NO_OP CONVERTOR_WITH_CHECKSUM CONVERTOR_TYPE_MASK CONVERTOR_STATE_START CONVERTOR_STATE_COMPLETE CONVERTOR_STATE_ALLOC CONVERTOR_COMPLETED ompi_convertor_t ompi_convertor_master_t dt_stack_t DT_STATIC_STACK_SIZE ompi_convertor_get_checksum ompi_convertor_pack ompi_convertor_unpack ompi_convertor_create ompi_convertor_cleanup ompi_convertor_need_buffers ompi_convertor_get_packed_size ompi_convertor_get_unpacked_size ompi_convertor_get_current_pointer ompi_convertor_prepare_for_send ompi_convertor_copy_and_prepare_for_send ompi_convertor_prepare_for_recv ompi_convertor_copy_and_prepare_for_recv ompi_convertor_raw ompi_convertor_set_position_nocheck ompi_convertor_set_position ompi_convertor_personalize ompi_convertor_clone ompi_convertor_clone_with_position ompi_convertor_dump 
ompi_ddt_dump_stack ompi_convertor_generic_simple_position MPI_Datatype" SEARCH_HEADER[6]="ompi/datatype/datatype.h MPI_Datatype DT_MAX_PREDEFINED DT_FLAG_ MAX_DT_COMPONENT_COUNT opal_ddt_count_t dt_type_desc_t ompi_datatype_t ompi_predefined_datatype_t ompi_ddt_init ompi_ddt_finalize ompi_ddt_create_ ompi_ddt_duplicate ompi_ddt_is_predefined ompi_ddt_create_from_packed_description" SEARCH_HEADER[7]="ompi/datatype/datatype_internal.h DDT_DUMP_STACK DT_ ddt_elem_id_description ddt_elem_desc ddt_elem_desc_t ddt_loop_desc ddt_loop_desc_t ddt_endloop_desc ddt_endloop_desc_t dt_elem_desc CREATE_LOOP_START CREATE_LOOP_END CREATE_ELEM ompi_complex_float_t ompi_complex_double_t ompi_complex_long_double_t ompi_ddt_basicDatatypes BASIC_DDT_FROM_ELEM ompi_ddt_default_convertors_init ompi_ddt_default_convertors_fini SAVE_STACK PUSH_STACK ompi_ddt_safeguard_pointer_debug_breakpoint OMPI_DDT_SAFEGUARD_POINTER GET_FIRST_NON_LOOP UPDATE_INTERNAL_COUNTERS ompi_ddt_print_args" SEARCH_HEADER[8]="ompi/errhandler/errhandler.h OMPI_ERRHANDLER_LANG_ ompi_errhandler_lang_t OMPI_ERRHANDLER_TYPE_ ompi_errhandler_type_t ompi_errhandler_t ompi_predefined_errhandler_t ompi_mpi_errhandler_null OMPI_ERRHANDLER_CHECK OMPI_ERRHANDLER_RETURN ompi_errhandler_init ompi_errhandler_finalize OMPI_ERRHANDLER_INVOKE ompi_errhandler_invoke ompi_errhandler_request_invoke ompi_errhandler_create ompi_errhandler_is_intrinsic ompi_errhandler_fortran_handler_fn_t OMPI_ERR_INIT_FINALIZE MPI_Errhandler" diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 3b66d416cea..1a83258cc96 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -458,7 +458,6 @@ int ompi_comm_create_w_info (ompi_communicator_t *comm, ompi_group_t *group, opa goto exit; } - /* Check whether we are part of the new comm. If not, we have to free the structure again. 
However, we could not avoid the comm_nextcid step, since @@ -2690,6 +2689,49 @@ static int ompi_comm_copy_topo(ompi_communicator_t *oldcomm, return OMPI_SUCCESS; } +int ompi_comm_set_disjointness(ompi_communicator_t *newcomm, ompi_communicator_t *oldcomm) +{ + int local_peers = 0, rc = OMPI_ERROR; + + if (OMPI_COMM_IS_DISJOINT_SET(newcomm)) { + rc = OMPI_SUCCESS; + goto out; + } + + if (NULL != oldcomm && OMPI_COMM_IS_DISJOINT(oldcomm)) { + /** + * A communicator split from a disjoint + * communicator (1 process per node) is also disjoint + */ + newcomm->c_flags |= (OMPI_COMM_DISJOINT_SET | OMPI_COMM_DISJOINT); + rc = OMPI_SUCCESS; + goto out; + } + + if (!newcomm->c_coll) { + rc = OMPI_ERR_NOT_AVAILABLE; + goto out; + } + + local_peers = ompi_group_count_local_peers(newcomm->c_local_group); + rc = newcomm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_peers, 1, MPI_INT, MPI_MAX, newcomm, + newcomm->c_coll->coll_allreduce_module); + if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { + goto out; + } + + if (1 == local_peers) { + newcomm->c_flags |= OMPI_COMM_DISJOINT; + } else { + newcomm->c_flags &= ~OMPI_COMM_DISJOINT; + } + + newcomm->c_flags |= OMPI_COMM_DISJOINT_SET; + +out: + return rc; +} + char *ompi_comm_print_cid (const ompi_communicator_t *comm) { #if OPAL_HAVE_THREAD_LOCAL diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index 07970e8354f..d5a79859066 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -811,6 +811,16 @@ static int ompi_comm_activate_complete (ompi_communicator_t **newcomm, ompi_comm return ret; } + /** + * Use the initialized collective component to determine whether the processes are located on + * individual nodes + */ + if (OMPI_SUCCESS != ompi_comm_set_disjointness(*newcomm, comm)) { + OBJ_RELEASE(*newcomm); + *newcomm = MPI_COMM_NULL; + return ret; + } + /* For an inter communicator, we have to deal with the potential * problem of what is happening if the local_comm that we created * has a
lower CID than the parent comm. This is not a problem diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 7e41afb8631..12d7d391d81 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -62,6 +62,8 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_DYNAMIC 0x00000008 #define OMPI_COMM_ISFREED 0x00000010 #define OMPI_COMM_INVALID 0x00000020 +#define OMPI_COMM_DISJOINT_SET 0x00000040 +#define OMPI_COMM_DISJOINT 0x00000080 #define OMPI_COMM_CART 0x00000100 #define OMPI_COMM_GRAPH 0x00000200 #define OMPI_COMM_DIST_GRAPH 0x00000400 @@ -80,6 +82,8 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_IS_FREED(comm) ((comm)->c_flags & OMPI_COMM_ISFREED) #define OMPI_COMM_IS_DYNAMIC(comm) ((comm)->c_flags & OMPI_COMM_DYNAMIC) #define OMPI_COMM_IS_INVALID(comm) ((comm)->c_flags & OMPI_COMM_INVALID) +#define OMPI_COMM_IS_DISJOINT_SET(comm) ((comm)->c_flags & OMPI_COMM_DISJOINT_SET) +#define OMPI_COMM_IS_DISJOINT(comm) ((comm)->c_flags & OMPI_COMM_DISJOINT) #define OMPI_COMM_IS_PML_ADDED(comm) ((comm)->c_flags & OMPI_COMM_PML_ADDED) #define OMPI_COMM_IS_EXTRA_RETAIN(comm) ((comm)->c_flags & OMPI_COMM_EXTRA_RETAIN) #define OMPI_COMM_IS_TOPO(comm) (OMPI_COMM_IS_CART((comm)) || \ @@ -897,6 +901,19 @@ OMPI_DECLSPEC int ompi_comm_split_type(ompi_communicator_t *comm, struct opal_info_t *info, ompi_communicator_t** newcomm); +/** + * Set newcomm's disjoint flags based on oldcomm if provided. In the case where oldcomm + * is disjoint, the function will short circuit and set newcomm to be disjoint. + * Otherwise, the function will carry out a collective communication on all processes + * in newcomm. Therefore this function should only be called **after** the collectives + * modules are initialized on newcomm. 
+ * + * @param newcomm: new communicator + * @param oldcomm: parent communicator or NULL + * + */ +OMPI_DECLSPEC int ompi_comm_set_disjointness(ompi_communicator_t *newcomm, ompi_communicator_t *oldcomm); + /** * dup a communicator. Parameter are identical to the MPI-counterpart * of the function. It has been extracted, since we need to be able From 1396585210730d881baeb38290dbb5b66133e6ca Mon Sep 17 00:00:00 2001 From: Vishwanath Venkatesan Date: Thu, 26 May 2022 22:55:17 -0700 Subject: [PATCH 48/73] SHMEM_LOCKS: MCS implementation of SHMEM LOCKS Adding MCS algorithm-based implementation for shmem_locks to improve performance for large scale SHMEM applications using locks. MCS lock is now the default algorithm, use the following MCA parameter to disable. --mca oshmem_enable_mcs_lock 0 to disable mcs locks and revert to default ticket locking. --mca oshmem_api_verbose 10 for debug information on shmem_locks. Signed-off-by: Vishwanath Venkatesan --- oshmem/runtime/oshmem_shmem_params.c | 20 ++- oshmem/runtime/params.h | 7 + oshmem/shmem/c/Makefile.am | 3 +- oshmem/shmem/c/shmem_clear_lock.c | 11 +- oshmem/shmem/c/shmem_mcs_lock.c | 239 +++++++++++++++++++++++++++ oshmem/shmem/c/shmem_set_lock.c | 11 +- oshmem/shmem/c/shmem_test_lock.c | 11 +- oshmem/shmem/shmem_lock.h | 3 + 8 files changed, 298 insertions(+), 7 deletions(-) create mode 100644 oshmem/shmem/c/shmem_mcs_lock.c diff --git a/oshmem/runtime/oshmem_shmem_params.c b/oshmem/runtime/oshmem_shmem_params.c index 24035be24ee..3d68fcb927a 100644 --- a/oshmem/runtime/oshmem_shmem_params.c +++ b/oshmem/runtime/oshmem_shmem_params.c @@ -17,9 +17,10 @@ #include "oshmem/constants.h" -int oshmem_shmem_lock_recursive = 0; -int oshmem_shmem_api_verbose = 0; -int oshmem_preconnect_all = 0; +int oshmem_shmem_lock_recursive = 0; +int oshmem_shmem_api_verbose = 0; +int oshmem_shmem_enable_mcs_locks = 1; +int oshmem_preconnect_all = 0; int oshmem_shmem_register_params(void) { @@ -38,6 +39,19 @@ int
oshmem_shmem_register_params(void) MCA_BASE_VAR_SCOPE_READONLY, &oshmem_shmem_lock_recursive); + (void) mca_base_var_register("oshmem", + "oshmem", + NULL, + "enable_mcs_lock", + "enable mcs locks", + MCA_BASE_VAR_TYPE_INT, + NULL, + 1, + MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &oshmem_shmem_enable_mcs_locks); + (void) mca_base_var_register("oshmem", "oshmem", NULL, diff --git a/oshmem/runtime/params.h b/oshmem/runtime/params.h index e1a2d8cf1d8..979b1125d08 100644 --- a/oshmem/runtime/params.h +++ b/oshmem/runtime/params.h @@ -37,6 +37,13 @@ OSHMEM_DECLSPEC extern int oshmem_shmem_api_verbose; */ OSHMEM_DECLSPEC extern int oshmem_preconnect_all; + +/** + * Whether to force SHMEM processes to use MCS locking + * for shmem_locks + */ +OSHMEM_DECLSPEC extern int oshmem_shmem_enable_mcs_locks; + END_C_DECLS #endif /* OSHMEM_RUNTIME_PARAMS_H */ diff --git a/oshmem/shmem/c/Makefile.am b/oshmem/shmem/c/Makefile.am index d2c152073c0..194de248008 100644 --- a/oshmem/shmem/c/Makefile.am +++ b/oshmem/shmem/c/Makefile.am @@ -13,7 +13,8 @@ OSHMEM_AUX_SOURCES = \ - shmem_lock.c + shmem_lock.c \ + shmem_mcs_lock.c OSHMEM_API_SOURCES = \ shmem_init.c \ diff --git a/oshmem/shmem/c/shmem_clear_lock.c b/oshmem/shmem/c/shmem_clear_lock.c index 3051047a686..4c94038d316 100644 --- a/oshmem/shmem/c/shmem_clear_lock.c +++ b/oshmem/shmem/c/shmem_clear_lock.c @@ -1,4 +1,6 @@ /* + * Copyright (c) 2023 NVIDIA Corporation. + * All rights reserved. * Copyright (c) 2013-2016 Mellanox Technologies, Inc. * All rights reserved. 
* Copyright (c) 2019 Research Organization for Information Science @@ -18,6 +20,7 @@ #include "oshmem/shmem/shmem_api_logger.h" #include "oshmem/runtime/runtime.h" #include "oshmem/shmem/shmem_lock.h" +#include "oshmem/runtime/params.h" #if OSHMEM_PROFILING #include "oshmem/include/pshmem.h" @@ -27,5 +30,11 @@ void shmem_clear_lock(volatile long *lock) { - _shmem_clear_lock((void *)lock, sizeof(long)); + if (oshmem_shmem_enable_mcs_locks) { + SHMEM_API_VERBOSE(10, "Clear Lock with MCS Lock implementation"); + _shmem_mcs_clear_lock((long *)lock); + } else { + SHMEM_API_VERBOSE(10, "Clear Lock with Ticket Lock implementation"); + _shmem_clear_lock((void *)lock, sizeof(long)); + } } diff --git a/oshmem/shmem/c/shmem_mcs_lock.c b/oshmem/shmem/c/shmem_mcs_lock.c new file mode 100644 index 00000000000..3d7e97ee7b4 --- /dev/null +++ b/oshmem/shmem/c/shmem_mcs_lock.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2023 NVIDIA Corporation. + * All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "oshmem_config.h" + +#include "oshmem/constants.h" +#include "oshmem/include/shmem.h" +#include "oshmem/runtime/params.h" +#include "oshmem/runtime/runtime.h" +#include +#include + +#include "oshmem/shmem/shmem_api_logger.h" +#include "oshmem/shmem/shmem_lock.h" +#include "oshmem/mca/memheap/memheap.h" +#include "oshmem/mca/memheap/base/base.h" +#include "oshmem/mca/atomic/atomic.h" + +#define OPAL_BITWISE_SIZEOF_LONG (SIZEOF_LONG * 8) + + +/** Use basic MCS distributed lock algorithm for lock */ +struct shmem_mcs_lock { + /** has meaning only on MCSQ_TAIL OWNER */ + int tail; + /** It has meaning on all PEs */ + /** The next pointer is a combination of the PE ID and wait signal */ + int next; +}; +typedef struct shmem_mcs_lock shmem_mcs_lock_t; + +#define SHMEM_MCSL_TAIL_OWNER(lock_ptr)\ + (((uintptr_t)(lock_ptr) / sizeof(long)) % shmem_n_pes()) + +#define SHMEM_MCSL_NEXT_MASK 0x7FFFFFFFU +#define SHMEM_MCSL_SIGNAL_MASK 
0x80000000U /** Wait signal mask */ +#define SHMEM_MCSL_NEXT(lock_val) ((lock_val) & SHMEM_MCSL_NEXT_MASK) +/** Improve readability */ +#define SHMEM_MCSL_GET_PE(tail_val) ((tail_val) & SHMEM_MCSL_NEXT_MASK) +#define SHMEM_MCSL_SIGNAL(lock_val) ((lock_val) & SHMEM_MCSL_SIGNAL_MASK) +#define SHMEM_MCSL_SET_SIGNAL(lock_val) ((lock_val) | SHMEM_MCSL_SIGNAL_MASK) + +void +_shmem_mcs_set_lock(long *lockp) +{ + shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp; + int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock); + int new_tail_req = 0; + int *tail = &(lock->tail); + int *next = &(lock->next); + int my_pe = shmem_my_pe(); + int curr = 0; + int out_value = 0; + int prev_tail = 0; + int prev_tailpe = 0; + int tval = 0; + int tmp_val = 0; + int retv = 0; + uint64_t value_tmp = 0; + + RUNTIME_CHECK_INIT(); + /** + * Initializing next pointer to next mask + * Done atomically to avoid races as NEXT pointer + * can be modified by other PEs while acquiring or + * releasing it. + */ + /** + * Can make this to be shmem_atomic_set to be safe + * in non-cc architectures + * has an impact on performance + */ + value_tmp = SHMEM_MCSL_NEXT_MASK; + out_value = SHMEM_MCSL_NEXT_MASK; + retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void*)next, + (void*)&out_value, value_tmp, + sizeof(int), my_pe)); + RUNTIME_CHECK_RC(retv); + MCA_SPML_CALL(quiet(oshmem_ctx_default)); + + /** Signal for setting lock */ + new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe); + /** + * Swap and make me the new tail and update in tail owner + * Get the previous tail PE. 
+ */ + retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void *)tail, + (void*)&prev_tail, + OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req, + sizeof(new_tail_req)), + sizeof(int), mcs_tail_owner)); + RUNTIME_CHECK_RC(retv); + + prev_tailpe = SHMEM_MCSL_GET_PE(prev_tail); + if (SHMEM_MCSL_SIGNAL(prev_tail)) { + /** + * Someone else has got the lock before this PE + * Adding this PE to the previous tail PE's Next pointer + * Subtract the SIGNAL Bit to avoid changing it. + */ + tmp_val = my_pe - SHMEM_MCSL_NEXT_MASK; + retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void*)next, tmp_val, + sizeof(int), prev_tailpe)); + RUNTIME_CHECK_RC(retv); + /** + * This value to be changed eventually by predecessor + * when its lock is released. + * Need to be done atomically to avoid any races where + * next pointer is modified by another PE acquiring or + * releasing this. + */ + retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void *)next, + SHMEM_MCSL_SIGNAL_MASK, sizeof(int), + my_pe)); + RUNTIME_CHECK_RC(retv); + MCA_SPML_CALL(quiet(oshmem_ctx_default)); + /** Wait for predecessor release lock to this PE signal to false. */ + retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next, + (void*)&curr, tval, sizeof(int), my_pe)); + RUNTIME_CHECK_RC(retv); + + while (SHMEM_MCSL_SIGNAL(curr)) { + retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE, + (void*)&curr, SHMEM_INT)); + RUNTIME_CHECK_RC(retv); + retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next, + (void*)&curr, tval, sizeof(int), + my_pe)); + RUNTIME_CHECK_RC(retv); + } + } +/** else..
this pe has got the lock as no one else had it */ +} + +void +_shmem_mcs_clear_lock(long *lockp) +{ + shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp; + int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock); + int *tail = &(lock->tail); + int *next = &(lock->next); + int my_pe = shmem_my_pe(); + int next_value = 0; + int swap_cond = 0; + int prev_value = 0; + int tval = 0; + int val_tmp = 0; + int nmask = 0; + int a_val = 0; + int retv = 0; + + /** + * Can make atomic fetch to be safe in non-cc architectures + * Has impact on performance + */ + retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next, + (void*)&next_value, tval, sizeof(int), + my_pe)); + RUNTIME_CHECK_RC(retv); + MCA_SPML_CALL(quiet(oshmem_ctx_default)); + + if (next_value == SHMEM_MCSL_NEXT_MASK) { + swap_cond = SHMEM_MCSL_SET_SIGNAL(my_pe); + retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default, + (void *)tail, (uint64_t *)&(prev_value), + OSHMEM_ATOMIC_PTR_2_INT(&swap_cond, + sizeof(swap_cond)), + OSHMEM_ATOMIC_PTR_2_INT(&val_tmp, + sizeof(val_tmp)), sizeof(int), + mcs_tail_owner)); + RUNTIME_CHECK_RC(retv); + + /** I am the tail.. 
and lock is released */ + if (prev_value == swap_cond) { + return; + } + /** + * I am not the tail, another PE may be racing to acquire lock, + * let them complete setting themselves as our next + */ + nmask = SHMEM_MCSL_NEXT_MASK; + while(next_value == nmask) { + retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE, + (void*)&nmask, SHMEM_INT)); + RUNTIME_CHECK_RC(retv); + retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next, + (void*)&next_value, tval, + sizeof(int), my_pe)); + RUNTIME_CHECK_RC(retv); + } + } + /** There is a successor release lock to the successor */ + a_val = SHMEM_MCSL_SIGNAL_MASK; + retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, + (void *)next, a_val, sizeof(a_val), + SHMEM_MCSL_NEXT(next_value))); + RUNTIME_CHECK_RC(retv); + MCA_SPML_CALL(quiet(oshmem_ctx_default)); +} + +int +_shmem_mcs_test_lock(long *lockp) +{ + shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp; + int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock); + int new_tail_req = 0; + int prev_tail = 0; + int tmp_cond = 0; + int *tail = &(lock->tail); + int *next = &(lock->next); + int my_pe = shmem_my_pe(); + int retv = 0; + + /** Initializing next pointer to next mask */ + *next = SHMEM_MCSL_NEXT_MASK; + + /** Signal for setting lock */ + new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe); + + /** Check if previously cleared before swapping */ + retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default, + (void *)tail, (uint64_t *)&(prev_tail), + OSHMEM_ATOMIC_PTR_2_INT(&tmp_cond, + sizeof(tmp_cond)), + OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req, + sizeof(new_tail_req)), + sizeof(int), mcs_tail_owner)); + RUNTIME_CHECK_RC(retv); + + return (0 != prev_tail); +} diff --git a/oshmem/shmem/c/shmem_set_lock.c b/oshmem/shmem/c/shmem_set_lock.c index 514cb2111c3..90cc9bb706e 100644 --- a/oshmem/shmem/c/shmem_set_lock.c +++ b/oshmem/shmem/c/shmem_set_lock.c @@ -1,4 +1,6 @@ /* + * Copyright (c) 2023 NVIDIA Corporation. + * All rights reserved. * Copyright (c) 2013-2016 Mellanox Technologies, Inc.
* All rights reserved. * Copyright (c) 2019 Research Organization for Information Science @@ -18,6 +20,7 @@ #include "oshmem/shmem/shmem_api_logger.h" #include "oshmem/runtime/runtime.h" #include "oshmem/shmem/shmem_lock.h" +#include "oshmem/runtime/params.h" #if OSHMEM_PROFILING #include "oshmem/include/pshmem.h" @@ -27,5 +30,11 @@ void shmem_set_lock(volatile long *lock) { - _shmem_set_lock((void *)lock, sizeof(long)); + if (oshmem_shmem_enable_mcs_locks) { + SHMEM_API_VERBOSE(10, "Set Lock with MCS Lock implementation"); + _shmem_mcs_set_lock((long *)lock); + } else { + SHMEM_API_VERBOSE(10, "Set Lock with Ticket Lock implementation"); + _shmem_set_lock((void *)lock, sizeof(long)); + } } diff --git a/oshmem/shmem/c/shmem_test_lock.c b/oshmem/shmem/c/shmem_test_lock.c index 217b9afde02..0cae5576f5f 100644 --- a/oshmem/shmem/c/shmem_test_lock.c +++ b/oshmem/shmem/c/shmem_test_lock.c @@ -1,4 +1,6 @@ /* + * Copyright (c) 2023 NVIDIA Corporation. + * All rights reserved. * Copyright (c) 2013-2016 Mellanox Technologies, Inc. * All rights reserved. 
* Copyright (c) 2019 Research Organization for Information Science @@ -18,6 +20,7 @@ #include "oshmem/include/shmem.h" #include "oshmem/shmem/shmem_api_logger.h" #include "oshmem/runtime/runtime.h" +#include "oshmem/runtime/params.h" #include "oshmem/shmem/shmem_lock.h" #if OSHMEM_PROFILING @@ -28,5 +31,11 @@ int shmem_test_lock(volatile long *lock) { - return _shmem_test_lock((void *)lock, sizeof(long)); + if (oshmem_shmem_enable_mcs_locks) { + SHMEM_API_VERBOSE(10, "Test lock using MCS Lock implementation"); + return _shmem_mcs_test_lock((long *)lock); + } else { + SHMEM_API_VERBOSE(10, "Test_lock using Ticket Lock implementation"); + return _shmem_test_lock((void *)lock, sizeof(long)); + } } diff --git a/oshmem/shmem/shmem_lock.h b/oshmem/shmem/shmem_lock.h index c338339c529..de138f45ff9 100644 --- a/oshmem/shmem/shmem_lock.h +++ b/oshmem/shmem/shmem_lock.h @@ -22,5 +22,8 @@ void _shmem_set_lock(void *lock, int lock_size); int _shmem_test_lock(void *lock, int lock_size); void _shmem_clear_lock(void *lock, int lock_size); +void _shmem_mcs_set_lock(long *lock); +void _shmem_mcs_clear_lock(long *lock); +int _shmem_mcs_test_lock(long *lock); #endif /*SHMEM_LOCK_H*/ From b02555e9f19e79d642daadc6a0e7a77f7a868f71 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Sun, 1 Oct 2023 17:43:59 +0000 Subject: [PATCH 49/73] communicator: add assertion on OMPI_COMM_DISJOINT flag This patch introduces assertions to verify that sub-communicators are created with the expected OMPI_COMM_DISJOINT* flags. 
Signed-off-by: Wenduo Wang --- ompi/mca/coll/han/coll_han_subcomms.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index 90bc0d1d972..fe6e197cfb8 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -282,6 +282,7 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, opal_info_set(&comm_info, "ompi_comm_coll_preference", "tuned,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[0])); + assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[0]) && !OMPI_COMM_IS_DISJOINT(low_comms[0])); /* * Get my local rank and the local size @@ -296,6 +297,7 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[1])); + assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[1]) && !OMPI_COMM_IS_DISJOINT(low_comms[1])); /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module @@ -304,8 +306,8 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, */ opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); - up_rank = ompi_comm_rank(up_comms[0]); + assert(OMPI_COMM_IS_DISJOINT_SET(up_comms[0]) && OMPI_COMM_IS_DISJOINT(up_comms[0])); /* * Upgrade adapt module priority to set up up_comms[0] with adapt module @@ -313,6 +315,7 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, */ opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[1]), false); + assert(OMPI_COMM_IS_DISJOINT_SET(up_comms[1]) && OMPI_COMM_IS_DISJOINT(up_comms[1])); /* * Set my virtual rank number. 
@@ -350,5 +353,3 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, OBJ_DESTRUCT(&comm_info); return OMPI_SUCCESS; } - - From c265ea268418af64016a7e2bbf192d5232dde3b3 Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Tue, 17 Oct 2023 09:20:27 -0400 Subject: [PATCH 50/73] Update PMIx and PRRTe pointers. Signed-off-by: Austen Lauria --- 3rd-party/openpmix | 2 +- 3rd-party/prrte | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/3rd-party/openpmix b/3rd-party/openpmix index 4c444462d2b..977a06751e0 160000 --- a/3rd-party/openpmix +++ b/3rd-party/openpmix @@ -1 +1 @@ -Subproject commit 4c444462d2bb0102faa6fda8410ca8e50a365e78 +Subproject commit 977a06751e0918faf8da209c07fda138ee10cbcd diff --git a/3rd-party/prrte b/3rd-party/prrte index 9015ca02cce..f930dcd9945 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 9015ca02cce72acc03f86d399f939843c42b3dc8 +Subproject commit f930dcd9945cb5ce89789c3a51f2c2062faddeb6 From 37eee1bcef5ba2498dbbd780c338bf34271621d1 Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Tue, 17 Oct 2023 09:19:32 -0400 Subject: [PATCH 51/73] Revert "Patch the prrte.spec file." This reverts commit 3ef5dc9a0c901322a6aa190f63f8dbc6af75626d. Signed-off-by: Austen Lauria --- autogen.pl | 4 ---- config/prrte.spec.diff | 20 -------------------- 2 files changed, 24 deletions(-) delete mode 100644 config/prrte.spec.diff diff --git a/autogen.pl b/autogen.pl index 3cb79025dbf..5af4704f2a1 100755 --- a/autogen.pl +++ b/autogen.pl @@ -1643,10 +1643,6 @@ sub replace_config_sub_guess { if (! 
-f "3rd-party/prrte/configure.ac") { my_die("Could not find pmix files\n"); } - - verbose "Patching prrte.spec file\n"; - system("$patch_prog -N -p0 < ./config/prrte.spec.diff > /dev/null 2>&1"); - push(@subdirs, "3rd-party/prrte/"); $m4 .= "m4_define([package_prrte], [1])\n"; diff --git a/config/prrte.spec.diff b/config/prrte.spec.diff deleted file mode 100644 index 4e8b1a86eb1..00000000000 --- a/config/prrte.spec.diff +++ /dev/null @@ -1,20 +0,0 @@ ---- 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:43.842625000 -0400 -+++ 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:27.849686000 -0400 -@@ -612,7 +612,7 @@ - %{shell_scripts_path}/%{shell_scripts_basename}.sh - %{shell_scripts_path}/%{shell_scripts_basename}.csh - %endif --%doc README INSTALL LICENSE -+%doc README.md LICENSE - - %else - -@@ -656,7 +656,7 @@ - %{shell_scripts_path}/%{shell_scripts_basename}.sh - %{shell_scripts_path}/%{shell_scripts_basename}.csh - %endif --%doc README INSTALL LICENSE -+%doc README.md LICENSE - %{_pkgdatadir} - - %files devel -f devel.files From 5e695cd5de406c5ad3b3a6a533952bd3801ca426 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 23 Oct 2023 15:19:52 -0400 Subject: [PATCH 52/73] dpm: update PMIX attribute No longer use the ancient/deprecated PMIX_MAPBY, and instead use PMIX_DISPLAY_MAP. Signed-off-by: Jeff Squyres --- ompi/dpm/dpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 656a45d8a41..16bc764a7eb 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -24,6 +24,7 @@ * Copyright (c) 2018-2022 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. + * Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -1243,7 +1244,7 @@ int ompi_dpm_spawn(int count, const char *array_of_commands[], /* check for 'display_map' - a job-level key */ ompi_info_get_bool(array_of_info[i], "display_map", &local_spawn, &flag); if ( flag ) { - rc = dpm_convert(&job_info, "display_map", PMIX_MAPBY, NULL, "DISPLAY", true); + rc = dpm_convert(&job_info, "display_map", PMIX_DISPLAY_MAP, NULL, "DISPLAY", true); if (OMPI_SUCCESS != rc) { OPAL_LIST_DESTRUCT(&job_info); OPAL_LIST_DESTRUCT(&app_info); From 75e3d33991e58fadbc12997fbe8b21bd3e37c5c4 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Thu, 19 Oct 2023 13:37:39 -0600 Subject: [PATCH 53/73] Update processing of "display_map" info key Translates to its own unique PMIX_DISPLAY_MAP attribute and is no longer a qualifier to the PMIX_MAPBY attr. Note that a previous commit (5e695cd on main) converted the PMIX_MAPBY attribute to PMIX_DISPLAY_MAP; this commit completes the job by updating the surrounding logic accordingly. 
Signed-off-by: Ralph Castain Signed-off-by: Jeff Squyres --- ompi/dpm/dpm.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/ompi/dpm/dpm.c b/ompi/dpm/dpm.c index 16bc764a7eb..25f60586893 100644 --- a/ompi/dpm/dpm.c +++ b/ompi/dpm/dpm.c @@ -1244,19 +1244,9 @@ int ompi_dpm_spawn(int count, const char *array_of_commands[], /* check for 'display_map' - a job-level key */ ompi_info_get_bool(array_of_info[i], "display_map", &local_spawn, &flag); if ( flag ) { - rc = dpm_convert(&job_info, "display_map", PMIX_DISPLAY_MAP, NULL, "DISPLAY", true); - if (OMPI_SUCCESS != rc) { - OPAL_LIST_DESTRUCT(&job_info); - OPAL_LIST_DESTRUCT(&app_info); - PMIX_APP_FREE(apps, scount); - if (NULL != hostfiles) { - opal_argv_free(hostfiles); - } - if (NULL != dash_host) { - opal_argv_free(dash_host); - } - return MPI_ERR_SPAWN; - } + info = OBJ_NEW(opal_info_item_t); + PMIX_INFO_LOAD(&info->info, PMIX_DISPLAY_MAP, &local_spawn, PMIX_BOOL); + opal_list_append(&job_info, &info->super); } /* check for 'npernode' and 'ppr' - job-level key */ From a9f0dd5c8ec8147594b57bc62faabc2f4bb85807 Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Tue, 17 Oct 2023 11:47:07 -0500 Subject: [PATCH 54/73] update news and version Signed-off-by: Tomislav Janjusic --- docs/news/news-v5.0.x.rst | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index eac2fa71d8f..c225fa87ba4 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc13 +Open MPI version 5.0.0rc14 -------------------------- -:Date: 29 September 2023 +:Date: 17 October 2023 .. 
admonition:: The MPIR API has been removed :class: warning @@ -66,20 +66,25 @@ Open MPI version 5.0.0rc13 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc12: - - - Update PMIx to the ``v4.2.6`` release tag. Hash: ``f20e0d5``. - - Update PRRTE to the ``v3.0.1`` release tag. Hash: ``63370ca``. - - Lots of documentation updates. - - Fixed parameter name in ``MPI_Intercomm_merge``. Thanks to Yan Wu for the report. - - ``OFI``: Update NIC selection to determine optimal interfaces from the current process. - - Fix reordering of received data in ``MPI_Gather``. - - Disable builds with ``HWLOC`` versions >= 3.0.0. This is currently not supported. - - Fix re-ordering of ranks in ``MPI_Dist_graph_create``. - - ``coll/HAN``: Fix bug when using ``MPI_IN_PLACE`` with ``MPI_Reduce``. - - Fix ``MPI_Type_Dup`` to propagate errors from inner calls. - - Fix the compilation of the monitoring infrastructure. - - Various other bug fixes. +- Changes since rc13: + + - Update PMIx to hash: ``f8f578392ec77dd7a1d76ca697da4f15afcb0161``. + - Update PRRTE to hash: ``bb4085053a0b268ae2a2e04ed56387f53e4a3e7a``. + - Documentation updates + - Fix build case with --disable-prrte + - Update PRRTe and PMIx pointers to pull in fixes, including spurious log messages, and also + RPM fixes. + - pcomm: fix fortran interface for precv/psend. + - Fix UCX support level check. + - Add support for MPI_ERR_VALUE_TOO_LARGE + - ofi - add MCA parameters to not use FI_HMEM + This commit adds two MCA parameters: + mtl_ofi_disable_hmem + btl_ofi_disable_hmem + - oshmem: + Add symmetric remote key handling + Fixed DEVICE_NIC_MEM support to use RDMA memory type. + - Fix a small issue in properly setting filename when building the empty schizo rst file. 
- All other notable updates for v5.0.0: From 4a5eb46040a1f2365965b37a98b304b34a71ee4a Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Wed, 18 Oct 2023 05:34:12 -0500 Subject: [PATCH 55/73] v5.0.0rc15 news and version update Signed-off-by: Tomislav Janjusic --- docs/news/news-v5.0.x.rst | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index c225fa87ba4..69599423fef 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc14 +Open MPI version 5.0.0rc15 -------------------------- -:Date: 17 October 2023 +:Date: 19 October 2023 .. admonition:: The MPIR API has been removed :class: warning @@ -66,25 +66,11 @@ Open MPI version 5.0.0rc14 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc13: - - - Update PMIx to hash: ``f8f578392ec77dd7a1d76ca697da4f15afcb0161``. - - Update PRRTE to hash: ``bb4085053a0b268ae2a2e04ed56387f53e4a3e7a``. - - Documentation updates - - Fix build case with --disable-prrte - - Update PRRTe and PMIx pointers to pull in fixes, including spurious log messages, and also - RPM fixes. - - pcomm: fix fortran interface for precv/psend. - - Fix UCX support level check. - - Add support for MPI_ERR_VALUE_TOO_LARGE - - ofi - add MCA parameters to not use FI_HMEM - This commit adds two MCA parameters: - mtl_ofi_disable_hmem - btl_ofi_disable_hmem - - oshmem: - Add symmetric remote key handling - Fixed DEVICE_NIC_MEM support to use RDMA memory type. - - Fix a small issue in properly setting filename when building the empty schizo rst file. +- Changes since rc14: + + - Update PMIx to hash: ``17a7bf1a8886a8ad16dff5bc0791e226b0937106``. + - Update PRRTE to hash: ``09c4212bb491c8d2db9b12179379748b6d86520e``. 
+ Includes a fix to correctly forward stdin to remote processes. - All other notable updates for v5.0.0: From 2865f20851fc8e002d3c74b3c98e21c88314628f Mon Sep 17 00:00:00 2001 From: Tomislav Janjusic Date: Tue, 24 Oct 2023 09:33:14 -0500 Subject: [PATCH 56/73] update news and version for rc16 Signed-off-by: Tomislav Janjusic --- docs/news/news-v5.0.x.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 69599423fef..a501509e8cd 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -66,11 +66,10 @@ Open MPI version 5.0.0rc15 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc14: +- Changes since rc15: - - Update PMIx to hash: ``17a7bf1a8886a8ad16dff5bc0791e226b0937106``. - - Update PRRTE to hash: ``09c4212bb491c8d2db9b12179379748b6d86520e``. - Includes a fix to correctly forward stdin to remote processes. + - Update PMIx to release tag v4.2.7 hash: ``57c405c52ad76bab0be9f95e29a6df660673081e``. + - Update PRRTE to release tag v3.0.2 hash: ``1552e36f0852bbc6d901ec95983369f0a3c283f6``. - All other notable updates for v5.0.0: From 8c784e942041def9b0ce19546adf35da8a9fee8c Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Wed, 25 Oct 2023 13:31:31 -0400 Subject: [PATCH 57/73] Change date for rc16 Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index a501509e8cd..9681ef9e8bf 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc15 +Open MPI version 5.0.0rc16 -------------------------- -:Date: 19 October 2023 +:Date: 24 October 2023 .. 
admonition:: The MPIR API has been removed :class: warning From 439295421be8b4c67685f7062512c44b4026aa1f Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Wed, 25 Oct 2023 13:40:18 -0400 Subject: [PATCH 58/73] Cleanup the news in prep for v5.0.0 release. Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 181 +++++++++++--------------------------- 1 file changed, 53 insertions(+), 128 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 9681ef9e8bf..0f453884b6d 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -73,15 +73,35 @@ Open MPI version 5.0.0rc16 - All other notable updates for v5.0.0: + - MPI-4.0 updates and additions: + + - Support for MPI Sessions has been added. + - Added partitioned communication using persistent sends + and persistent receives. + - Added persistent collectives to the ``MPI_`` namespace + (they were previously available via the ``MPIX_`` prefix). + - Added ``MPI_Isendrecv()`` and its variants. + - Added support for ``MPI_Comm_idup_with_info()``. + - Added support for ``MPI_Info_get_string()``. + - Added support for ``initial_error_handler`` and the + ``ERRORS_ABORT`` infrastructure. + - Added error handling for unbound errors to ``MPI_COMM_SELF``. + - Made ``MPI_Comm_get_info()``, ``MPI_File_get_info()``, and + ``MPI_Win_get_info()`` compliant to the standard. + - Dropped unknown/ignored info keys on communicators, files, + and windows. + - Initial implementations of ``MPI_COMM_TYPE_HW_GUIDED`` and + ``MPI_COMM_TYPE_HW_UNGUIDED`` added. + - ``MPI_Info_get()`` and ``MPI_Info_get_valuelen()`` are now + deprecated. + - Issue a deprecation warning when ``MPI_Cancel()`` is called for + a non-blocking send request. + - New Features: - ULFM Fault Tolerance support has been added. See :ref:`the ULFM section `. - CUDA is now supported in the ``ofi`` MTL. - - New MCA parameter ``ompi_display_comm``, enabling a - communication report.
When set to ``mpi_init``, display the - report when ``MPI_Init()`` is invoked. When set to - ``mpi_finalize``, display the report during ``MPI_Finalize()``. - A threading framework has been added to allow building Open MPI with different threading libraries. It currently supports `Argobots `_, `Qthreads @@ -106,30 +126,14 @@ Open MPI version 5.0.0rc16 ``memory_patcher``. Thanks to Rich Welch for the contribution. - ``coll/ucc``: Added support for the ``MPI_Scatter()`` and ``MPI_Iscatter()`` collectives. - - - MPI-4.0 updates and additions: - - - Support for MPI Sessions has been added. - - Added partitioned communication using persistent sends - and persistent receives. - - Added persistent collectives to the ``MPI_`` namespace - (they were previously available via the ``MPIX_`` prefix). - - Added ``MPI_Isendrecv()`` and its variants. - - Added support for ``MPI_Comm_idup_with_info()``. - - Added support for ``MPI_Info_get_string()``. - - Added support for ``initial_error_handler`` and the - ``ERRORS_ABORT`` infrastructure. - - Added error handling for unbound errors to ``MPI_COMM_SELF``. - - Made ``MPI_Comm_get_info()``, ``MPI_File_get_info()``, and - ``MPI_Win_get_info()`` compliant to the standard. - - Droped unknown/ignored info keys on communicators, files, - and windows. - - Initial implementations of ``MPI_COMM_TYPE_HW_GUIDED`` and - ``MPI_COMM_TYPE_HW_GUIDED`` added. - - ``MPI_Info_get()`` and ``MPI_Info_get_valuelen()`` are now - deprecated. - - Issue a deprecation warning when ``MPI_Cancel()`` is called for - a non-blocking send request. + - New algorithm for Allgather and Allgatherv has been added, based + on the paper *"Sparbit: a new logarithmic-cost and data + locality-aware MPI Allgather algorithm"*. Default algorithm + selection rules are unchanged; to use these algorithms add: + ``--mca coll_tuned_allgather_algorithm sparbit`` and/or ``--mca + coll_tuned_allgatherv_algorithm sparbit`` to your ``mpirun`` + command. 
Thanks to Wilton Jaciel Loch and Guilherme Koslovski + for their contribution. - Transport updates and improvements @@ -163,10 +167,6 @@ Open MPI version 5.0.0rc16 - Shared Memory: - - The legacy ``sm`` (shared memory) BTL has been removed. The - next-generation shared memory BTL ``vader`` replaces it, and - has been renamed to be ``sm`` (``vader`` will still work as an - alias). - Update the new ``sm`` BTL to not use Linux Cross Memory Attach (CMA) in user namespaces. - Fixed a crash when using the new ``sm`` BTL when compiled with @@ -177,6 +177,10 @@ Open MPI version 5.0.0rc16 - Deprecations and removals: + - The legacy ``sm`` (shared memory) BTL has been removed. The + next-generation shared memory BTL ``vader`` replaces it, and + has been renamed to be ``sm`` (``vader`` will still work as an + alias). - ORTE, the underlying Open MPI launcher has been removed, and replaced with the `PMIx Reference RunTime Environment `_ (``PRTE``). @@ -204,13 +208,24 @@ Open MPI version 5.0.0rc16 environment is no longer supported. 32 bit support is still available in the v4.x series. - - Hardware Locality updates: + - Other updates and bug fixes: - - Open MPI now requires Hardware Locality v1.11.0 or later. - - The internally-bundled Hardware Locality shipped with Open MPI - has been updated to v2.7.1. - - Open MPI builds Hardware Locality with ``--enable-plugins`` when - appropriate. + - Updated Open MPI to use ``ROMIO`` v3.4.1. + - Add missing ``MPI_Status`` conversion subroutines: + ``MPI_Status_c2f08()``, ``MPI_Status_f082c()``, + ``MPI_Status_f082f()``, ``MPI_Status_f2f08()`` and the + ``PMPI_*`` related subroutines. + - MPI module: added the ``mpi_f08`` ``TYPE(MPI_*)`` types for + Fortran. Thanks to George Katevenis for the report and their + contribution to the patch. + - The default atomics have been changed to be GCC, with C11 as a + fallback. C11 atomics incurs sequential memory ordering, which + in most cases is not desired. 
+ - Various datatype bugfixes and performance improvements. + - Various pack/unpack bugfixes and performance improvements. + - Various OSHMEM bugfixes and performance improvements. + - Thanks to Jeff Hammond, Pak Lui, Felix Uhl, Naribayashi Akira, + Julien Emmanuel, and Yaz Saito for their invaluable contributions. - Documentation updates and improvements: @@ -252,93 +267,3 @@ Open MPI version 5.0.0rc16 - Yixin Zhang - William Zhang - - Build updates and fixes: - - - Various changes and cleanup to fix, and better support the - static building of Open MPI. - - Change the default component build behavior to prefer building - components as part of the core Open MPI library instead of - individual DSOs. Currently, this means the Open SHMEM layer - will only build if the UCX library is found. - - ``autogen.pl`` now supports a ``-j`` option to run - multi-threaded. Users can also use the environment variable - ``AUTOMAKE_JOBS``. - - Updated ``autogen.pl`` to support macOS Big Sur. Thanks to - @fxcoudert for reporting the issue. - - Fixed bug where ``autogen.pl`` would not ignore all excluded - components when using the ``--exclude`` option. - - Fixed a bug the ``-r`` option of ``buildrpm.sh`` which would - result in an rpm build failure. Thanks to John K. McIver III for - reporting and fixing. - - Removed the ``C++`` compiler requirement to build Open MPI. - - Updates to improve the handling of the compiler version string - in the build system. This fixes a compiler error with clang and - armclang. - - Added OpenPMIx binaries to the build, including ``pmix_info``. - Thanks to Mamzi Bayatpour for their contribution to this effort. - - Open MPI now links to Libevent using ``-levent_core`` - and ``-levent_pthread`` instead of ``-levent``. - - Added support for setting the wrapper C compiler. This adds a - new option: ``--with-wrapper-cc=NAME`` to the ``configure`` command. - - Fixed compilation errors when running on IME file systems due to - a missing header inclusion. 
Thanks to Sylvain Didelot for - finding and fixing this issue. - - Add support for GNU Autoconf v2.7.x. - - - Other updates and bug fixes: - - - Updated Open MPI to use ``ROMIO`` v3.4.1. - - ``common/ompio``: implement pipelined read and write operation. - This new new code path shows significant performance - improvements for reading/writing device buffers compared to the - previous implementation, and reduces the memory footprint of - Open MPI IO ("OMPIO") by allocating smaller temporary buffers. - - Fixed Fortran-8-byte-INTEGER vs. C-4-byte-int issue in the - ``mpi_f08`` MPI Fortran bindings module. Thanks to @ahaichen for - reporting the bug. - - Add missing ``MPI_Status`` conversion subroutines: - ``MPI_Status_c2f08()``, ``MPI_Status_f082c()``, - ``MPI_Status_f082f()``, ``MPI_Status_f2f08()`` and the - ``PMPI_*`` related subroutines. - - Fixed Fortran keyword issue when compiling ``oshmem_info``. - Thanks to Pak Lui for finding and fixing the bug. - - Added check for Fortran ``ISO_FORTRAN_ENV:REAL16``. Thanks to - Jeff Hammond for reporting this issue. - - Fixed Fortran preprocessor issue with ``CPPFLAGS``. - Thanks to Jeff Hammond for reporting this issue. - - MPI module: added the ``mpi_f08`` ``TYPE(MPI_*)`` types for - Fortran. Thanks to George Katevenis for the report and their - contribution to the patch. - - Fixed a typo in an error string when showing the stack - frame. Thanks to Naribayashi Akira for finding and fixing the - bug. - - Fixed output error strings and some comments in the Open MPI - code base. Thanks to Julien Emmanuel for tirelessly finding and - fixing these issues. - - The ``uct`` BTL transport now supports ``UCX`` v1.9 and higher. - There is no longer a maximum supported version. - - Updated the UCT BTL defaults to allow NVIDIA/Mellanox HCAs - (``mlx4_0``, and ``mlx5_0``) for compatibility with the - one-sided ``rdma`` component. - - Fixed a crash during CUDA initialization. - Thanks to Yaz Saito for finding and fixing the bug. 
- - Singleton ``MPI_Comm_spawn()`` support has been fixed. - - PowerPC atomics: Force usage of ppc assembly by default. - - The default atomics have been changed to be GCC, with C11 as a - fallback. C11 atomics incurs sequential memory ordering, which - in most cases is not desired. - - Various datatype bugfixes and performance improvements. - - Various pack/unpack bugfixes and performance improvements. - - Various OSHMEM bugfixes and performance improvements. - - New algorithm for Allgather and Allgatherv has been added, based - on the paper *"Sparbit: a new logarithmic-cost and data - locality-aware MPI Allgather algorithm"*. Default algorithm - selection rules are unchanged; to use these algorithms add: - ``--mca coll_tuned_allgather_algorithm sparbit`` and/or ``--mca - coll_tuned_allgatherv_algorithm sparbit`` to your ``mpirun`` - command. Thanks to Wilton Jaciel Loch and Guilherme Koslovski - for their contribution. - - Updated the usage of ``.gitmodules`` to use relative paths from - absolute paths. This allows the submodule cloning to use the - same protocol as Open MPI cloning. Thanks to Felix Uhl for the - contribution. From c401a29f6636693b45fbf4c23f2430f057c54dc2 Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Wed, 25 Oct 2023 14:10:25 -0400 Subject: [PATCH 59/73] v5.0.x news: Missing period. Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 0f453884b6d..2ca28dc4bc4 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -230,7 +230,7 @@ Open MPI version 5.0.0rc16 - Documentation updates and improvements: - Open MPI has consolidated and converted all of its documentation - to use `ReStructured Text + to use `ReStructured Text. `_ and `Sphinx `_. 
From a08c60c3376ddb89496cc87e66c7560479cd1662 Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Wed, 25 Oct 2023 14:18:51 -0400 Subject: [PATCH 60/73] Update news in prep for v5.0.0 release. - Remove rc mentions. Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 2ca28dc4bc4..d5ebfbc3c19 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc16 +Open MPI version 5.0.0 -------------------------- -:Date: 24 October 2023 +:Date: 25 October 2023 .. admonition:: The MPIR API has been removed :class: warning @@ -66,10 +66,10 @@ Open MPI version 5.0.0rc16 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc15: +- Internal PMIx and PRRTe versions: - - Update PMIx to release tag v4.2.7 hash: ``57c405c52ad76bab0be9f95e29a6df660673081e``. - - Update PRRTE to release tag v3.0.2 hash: ``1552e36f0852bbc6d901ec95983369f0a3c283f6``. + - PMIx release tag v4.2.7. Commit hash: ``57c405c52ad76bab0be9f95e29a6df660673081e``. + - PRRTE release tag v3.0.2. Commit hash: ``1552e36f0852bbc6d901ec95983369f0a3c283f6``. - All other notable updates for v5.0.0: From cdd3218fd6fc4964d81b9f2623776293d93b5241 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Wed, 25 Oct 2023 13:31:11 -0600 Subject: [PATCH 61/73] Revert "Merge pull request #11864 from wenduwan/topo_aware_coll_comm" This reverts commit 57bb6dc5b4d418952a72da177cf1b39892626803, reversing changes made to f0c69b7f784838ebae2d354e585a0fb445e7de11. 
Signed-off-by: Howard Pritchard --- contrib/check_unnecessary_headers.sh | 6 ++-- ompi/communicator/comm.c | 44 +-------------------------- ompi/communicator/comm_cid.c | 10 ------ ompi/communicator/communicator.h | 17 ----------- ompi/mca/coll/han/coll_han_subcomms.c | 7 ++--- 5 files changed, 7 insertions(+), 77 deletions(-) diff --git a/contrib/check_unnecessary_headers.sh b/contrib/check_unnecessary_headers.sh index bac0e427951..15edb513c45 100644 --- a/contrib/check_unnecessary_headers.sh +++ b/contrib/check_unnecessary_headers.sh @@ -11,7 +11,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. -# Copyright (c) Amazon.com, Inc. or its affiliates. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. # All Rights reserved. # # @@ -181,8 +181,8 @@ SEARCH_HEADER[0]="ompi/attribute/attribute.h ATTR_HASH_SIZE OMPI_KEYVAL_PREDEFIN SEARCH_HEADER[1]="ompi/class/ompi_free_list.h ompi_free_list_item_init_fn_t ompi_free_list_t ompi_free_list_item_t ompi_free_list_init_ex ompi_free_list_init ompi_free_list_init_ex_new ompi_free_list_init_new ompi_free_list_grow ompi_free_list_resize ompi_free_list_pos_t OMPI_FREE_LIST_POS_BEGINNING ompi_free_list_parse OMPI_FREE_LIST_GET OMPI_FREE_LIST_WAIT __ompi_free_list_wait OMPI_FREE_LIST_RETURN" SEARCH_HEADER[2]="ompi/class/ompi_rb_tree.h ompi_rb_tree_nodecolor_t ompi_rb_tree_node_t ompi_rb_tree_comp_fn_t ompi_rb_tree_t ompi_rb_tree_condition_fn_t ompi_rb_tree_action_fn_t ompi_rb_tree_construct ompi_rb_tree_destruct ompi_rb_tree_init ompi_rb_tree_insert ompi_rb_tree_find_with ompi_rb_tree_find ompi_rb_tree_delete ompi_rb_tree_destroy ompi_rb_tree_traverse ompi_rb_tree_size" SEARCH_HEADER[3]="ompi/class/ompi_seq_tracker.h ompi_seq_tracker_range_t ompi_seq_tracker_t ompi_seq_tracker_check_duplicate ompi_seq_tracker_insert ompi_seq_tracker_copy" -SEARCH_HEADER[4]="ompi/communicator/communicator.h MPI_Comm MPI_COMM_WORLD 
ompi_communicator_t OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_DISJOINT_SET OMPI_COMM_DISJOINT OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_world ompi_mpi_comm_self ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dyncomm ompi_mpi_cxx_comm_errhandler_invoke" -SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_DISJOINT_SET OMPI_COMM_DISJOINT OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump 
ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dync CONVERTOR_DATATYPE_MASK CONVERTOR_SEND_CONVERSION CONVERTOR_RECV CONVERTOR_SEND CONVERTOR_HOMOGENEOUS CONVERTOR_NO_OP CONVERTOR_WITH_CHECKSUM CONVERTOR_TYPE_MASK CONVERTOR_STATE_START CONVERTOR_STATE_COMPLETE CONVERTOR_STATE_ALLOC CONVERTOR_COMPLETED ompi_convertor_t ompi_convertor_master_t dt_stack_t DT_STATIC_STACK_SIZE ompi_convertor_get_checksum ompi_convertor_pack ompi_convertor_unpack ompi_convertor_create ompi_convertor_cleanup ompi_convertor_need_buffers ompi_convertor_get_packed_size ompi_convertor_get_unpacked_size ompi_convertor_get_current_pointer ompi_convertor_prepare_for_send ompi_convertor_copy_and_prepare_for_send ompi_convertor_prepare_for_recv ompi_convertor_copy_and_prepare_for_recv ompi_convertor_raw ompi_convertor_set_position_nocheck ompi_convertor_set_position ompi_convertor_personalize ompi_convertor_clone ompi_convertor_clone_with_position ompi_convertor_dump ompi_ddt_dump_stack ompi_convertor_generic_simple_position MPI_Datatype" +SEARCH_HEADER[4]="ompi/communicator/communicator.h MPI_Comm MPI_COMM_WORLD ompi_communicator_t OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_world ompi_mpi_comm_self ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first 
ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dyncomm ompi_mpi_cxx_comm_errhandler_invoke" +SEARCH_HEADER[5]="ompi/datatype/convertor.h OMPI_COMM_INTER OMPI_COMM_CART OMPI_COMM_GRAPH OMPI_COMM_NAMEISSET OMPI_COMM_ISFREED OMPI_COMM_INTRINSIC OMPI_COMM_DYNAMIC OMPI_COMM_INVALID OMPI_COMM_PML_ADDED OMPI_COMM_IS_ OMPI_COMM_SET_ OMPI_COMM_ALLGATHER_TAG OMPI_COMM_BARRIER_TAG OMPI_COMM_ALLREDUCE_TAG OMPI_COMM_CID_ OMPI_COMM_BLOCK_ ompi_predefined_communicator_t ompi_mpi_comm_parent ompi_mpi_comm_null ompi_comm_invalid ompi_comm_rank ompi_comm_size ompi_comm_remote_size ompi_comm_get_cid ompi_comm_lookup ompi_comm_peer_lookup ompi_comm_peer_invalid ompi_comm_init ompi_comm_link_function ompi_comm_group ompi_comm_create ompi_topo_create ompi_comm_split ompi_comm_dup ompi_comm_compare ompi_comm_free ompi_comm_allocate ompi_comm_nextcid ompi_comm_finalize ompi_comm_set ompi_comm_get_rprocs ompi_comm_overlapping_groups ompi_comm_determine_first ompi_comm_activate ompi_comm_dump ompi_comm_set_name ompi_comm_reg_init ompi_comm_reg_finalize ompi_comm_num_dync CONVERTOR_DATATYPE_MASK CONVERTOR_SEND_CONVERSION CONVERTOR_RECV CONVERTOR_SEND CONVERTOR_HOMOGENEOUS CONVERTOR_NO_OP CONVERTOR_WITH_CHECKSUM CONVERTOR_TYPE_MASK CONVERTOR_STATE_START CONVERTOR_STATE_COMPLETE CONVERTOR_STATE_ALLOC CONVERTOR_COMPLETED ompi_convertor_t ompi_convertor_master_t dt_stack_t DT_STATIC_STACK_SIZE ompi_convertor_get_checksum ompi_convertor_pack ompi_convertor_unpack ompi_convertor_create ompi_convertor_cleanup ompi_convertor_need_buffers ompi_convertor_get_packed_size ompi_convertor_get_unpacked_size ompi_convertor_get_current_pointer ompi_convertor_prepare_for_send ompi_convertor_copy_and_prepare_for_send ompi_convertor_prepare_for_recv ompi_convertor_copy_and_prepare_for_recv ompi_convertor_raw ompi_convertor_set_position_nocheck ompi_convertor_set_position ompi_convertor_personalize ompi_convertor_clone 
ompi_convertor_clone_with_position ompi_convertor_dump ompi_ddt_dump_stack ompi_convertor_generic_simple_position MPI_Datatype" SEARCH_HEADER[6]="ompi/datatype/datatype.h MPI_Datatype DT_MAX_PREDEFINED DT_FLAG_ MAX_DT_COMPONENT_COUNT opal_ddt_count_t dt_type_desc_t ompi_datatype_t ompi_predefined_datatype_t ompi_ddt_init ompi_ddt_finalize ompi_ddt_create_ ompi_ddt_duplicate ompi_ddt_is_predefined ompi_ddt_create_from_packed_description" SEARCH_HEADER[7]="ompi/datatype/datatype_internal.h DDT_DUMP_STACK DT_ ddt_elem_id_description ddt_elem_desc ddt_elem_desc_t ddt_loop_desc ddt_loop_desc_t ddt_endloop_desc ddt_endloop_desc_t dt_elem_desc CREATE_LOOP_START CREATE_LOOP_END CREATE_ELEM ompi_complex_float_t ompi_complex_double_t ompi_complex_long_double_t ompi_ddt_basicDatatypes BASIC_DDT_FROM_ELEM ompi_ddt_default_convertors_init ompi_ddt_default_convertors_fini SAVE_STACK PUSH_STACK ompi_ddt_safeguard_pointer_debug_breakpoint OMPI_DDT_SAFEGUARD_POINTER GET_FIRST_NON_LOOP UPDATE_INTERNAL_COUNTERS ompi_ddt_print_args" SEARCH_HEADER[8]="ompi/errhandler/errhandler.h OMPI_ERRHANDLER_LANG_ ompi_errhandler_lang_t OMPI_ERRHANDLER_TYPE_ ompi_errhandler_type_t ompi_errhandler_t ompi_predefined_errhandler_t ompi_mpi_errhandler_null OMPI_ERRHANDLER_CHECK OMPI_ERRHANDLER_RETURN ompi_errhandler_init ompi_errhandler_finalize OMPI_ERRHANDLER_INVOKE ompi_errhandler_invoke ompi_errhandler_request_invoke ompi_errhandler_create ompi_errhandler_is_intrinsic ompi_errhandler_fortran_handler_fn_t OMPI_ERR_INIT_FINALIZE MPI_Errhandler" diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index 1a83258cc96..3b66d416cea 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -458,6 +458,7 @@ int ompi_comm_create_w_info (ompi_communicator_t *comm, ompi_group_t *group, opa goto exit; } + /* Check whether we are part of the new comm. If not, we have to free the structure again. 
However, we could not avoid the comm_nextcid step, since @@ -2689,49 +2690,6 @@ static int ompi_comm_copy_topo(ompi_communicator_t *oldcomm, return OMPI_SUCCESS; } -int ompi_comm_set_disjointness(ompi_communicator_t *newcomm, ompi_communicator_t *oldcomm) -{ - int local_peers = 0, rc = OMPI_ERROR; - - if (OMPI_COMM_IS_DISJOINT_SET(newcomm)) { - rc = OMPI_SUCCESS; - goto out; - } - - if (NULL != oldcomm && OMPI_COMM_IS_DISJOINT(oldcomm)) { - /** - * A communicator splitted from a disjoint - * communicator(1 process per node) is also disjoint - */ - newcomm->c_flags |= (OMPI_COMM_DISJOINT_SET | OMPI_COMM_DISJOINT); - rc = OMPI_SUCCESS; - goto out; - } - - if (!newcomm->c_coll) { - rc = OMPI_ERR_NOT_AVAILABLE; - goto out; - } - - local_peers = ompi_group_count_local_peers(newcomm->c_local_group); - rc = newcomm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_peers, 1, MPI_INT, MPI_MAX, newcomm, - newcomm->c_coll->coll_allreduce_module); - if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) { - goto out; - } - - if (1 == local_peers) { - newcomm->c_flags |= OMPI_COMM_DISJOINT; - } else { - newcomm->c_flags &= ~OMPI_COMM_DISJOINT; - } - - newcomm->c_flags |= OMPI_COMM_DISJOINT_SET; - -out: - return rc; -} - char *ompi_comm_print_cid (const ompi_communicator_t *comm) { #if OPAL_HAVE_THREAD_LOCAL diff --git a/ompi/communicator/comm_cid.c b/ompi/communicator/comm_cid.c index d5a79859066..07970e8354f 100644 --- a/ompi/communicator/comm_cid.c +++ b/ompi/communicator/comm_cid.c @@ -811,16 +811,6 @@ static int ompi_comm_activate_complete (ompi_communicator_t **newcomm, ompi_comm return ret; } - /** - * Use the initialized collective component to determine whether the processes are located on - * individual nodes - */ - if (OMPI_SUCCESS != ompi_comm_set_disjointness(*newcomm, comm)) { - OBJ_RELEASE(*newcomm); - *newcomm = MPI_COMM_NULL; - return ret; - } - /* For an inter communicator, we have to deal with the potential * problem of what is happening if the local_comm that we created * has a 
lower CID than the parent comm. This is not a problem diff --git a/ompi/communicator/communicator.h b/ompi/communicator/communicator.h index 12d7d391d81..7e41afb8631 100644 --- a/ompi/communicator/communicator.h +++ b/ompi/communicator/communicator.h @@ -62,8 +62,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_DYNAMIC 0x00000008 #define OMPI_COMM_ISFREED 0x00000010 #define OMPI_COMM_INVALID 0x00000020 -#define OMPI_COMM_DISJOINT_SET 0x00000040 -#define OMPI_COMM_DISJOINT 0x00000080 #define OMPI_COMM_CART 0x00000100 #define OMPI_COMM_GRAPH 0x00000200 #define OMPI_COMM_DIST_GRAPH 0x00000400 @@ -82,8 +80,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t); #define OMPI_COMM_IS_FREED(comm) ((comm)->c_flags & OMPI_COMM_ISFREED) #define OMPI_COMM_IS_DYNAMIC(comm) ((comm)->c_flags & OMPI_COMM_DYNAMIC) #define OMPI_COMM_IS_INVALID(comm) ((comm)->c_flags & OMPI_COMM_INVALID) -#define OMPI_COMM_IS_DISJOINT_SET(comm) ((comm)->c_flags & OMPI_COMM_DISJOINT_SET) -#define OMPI_COMM_IS_DISJOINT(comm) ((comm)->c_flags & OMPI_COMM_DISJOINT) #define OMPI_COMM_IS_PML_ADDED(comm) ((comm)->c_flags & OMPI_COMM_PML_ADDED) #define OMPI_COMM_IS_EXTRA_RETAIN(comm) ((comm)->c_flags & OMPI_COMM_EXTRA_RETAIN) #define OMPI_COMM_IS_TOPO(comm) (OMPI_COMM_IS_CART((comm)) || \ @@ -901,19 +897,6 @@ OMPI_DECLSPEC int ompi_comm_split_type(ompi_communicator_t *comm, struct opal_info_t *info, ompi_communicator_t** newcomm); -/** - * Set newcomm's disjoint flags based on oldcomm if provided. In the case where oldcomm - * is disjoint, the function will short circuit and set newcomm to be disjoint. - * Otherwise, the function will carry out a collective communication on all processes - * in newcomm. Therefore this function should only be called **after** the collectives - * modules are initialized on newcomm. 
- * - * @param newcomm: new communicator - * @param oldcomm: parent communictator or NULL - * - */ -OMPI_DECLSPEC int ompi_comm_set_disjointness(ompi_communicator_t *newcomm, ompi_communicator_t *oldcomm); - /** * dup a communicator. Parameter are identical to the MPI-counterpart * of the function. It has been extracted, since we need to be able diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index fe6e197cfb8..90bc0d1d972 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -282,7 +282,6 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, opal_info_set(&comm_info, "ompi_comm_coll_preference", "tuned,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[0])); - assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[0]) && !OMPI_COMM_IS_DISJOINT(low_comms[0])); /* * Get my local rank and the local size @@ -297,7 +296,6 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[1])); - assert(OMPI_COMM_IS_DISJOINT_SET(low_comms[1]) && !OMPI_COMM_IS_DISJOINT(low_comms[1])); /* * Upgrade libnbc module priority to set up up_comms[0] with libnbc module @@ -306,8 +304,8 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, */ opal_info_set(&comm_info, "ompi_comm_coll_preference", "libnbc,^han"); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, &(up_comms[0]), false); + up_rank = ompi_comm_rank(up_comms[0]); - assert(OMPI_COMM_IS_DISJOINT_SET(up_comms[0]) && OMPI_COMM_IS_DISJOINT(up_comms[0])); /* * Upgrade adapt module priority to set up up_comms[0] with adapt module @@ -315,7 +313,6 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, */ opal_info_set(&comm_info, "ompi_comm_coll_preference", "adapt,^han"); ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, 
&(up_comms[1]), false); - assert(OMPI_COMM_IS_DISJOINT_SET(up_comms[1]) && OMPI_COMM_IS_DISJOINT(up_comms[1])); /* * Set my virtual rank number. @@ -353,3 +350,5 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, OBJ_DESTRUCT(&comm_info); return OMPI_SUCCESS; } + + From cd4f1b97777931a1796e8af38caf513d2d9fbb03 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 27 Oct 2023 15:53:59 -0400 Subject: [PATCH 62/73] docs: clarify --enable-mca-dso Add explicit text stating that --enable-mca-dso (with no LIST/argument value) will build *all* components as DSO. Signed-off-by: Jeff Squyres --- .../configure-cli-options/installation.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/installing-open-mpi/configure-cli-options/installation.rst b/docs/installing-open-mpi/configure-cli-options/installation.rst index 05da73ef917..2a6be26bd1b 100644 --- a/docs/installing-open-mpi/configure-cli-options/installation.rst +++ b/docs/installing-open-mpi/configure-cli-options/installation.rst @@ -193,7 +193,10 @@ be used with ``configure``: options, but have no impact on the selection logic described below. Only affirmative options change the selection process. - ``LIST`` is a comma-delimited list of Open MPI frameworks and/or + If ``LIST`` is not specified (e.g., ``--enable-mca-dso`` with no + ``LIST``), or if ``LIST`` is the special value ``yes``, then *all* + components will be selected. If ``LIST`` is specified, it is a + comma-delimited list of Open MPI frameworks and/or framework+component tuples. Examples: * ``btl`` specifies the entire BTL framework @@ -213,7 +216,7 @@ be used with ``configure``: #. Otherwise, ``configure`` uses the global default build behavior. At each level of the selection process, if the component is - specified to be built as both a static and dso component, the static + specified to be built as both a static and DSO component, the static option will win. ..
note:: As of Open MPI |ompi_ver|, ``configure``'s global default @@ -234,6 +237,10 @@ be used with ``configure``: shell$ ./configure + #. Build all components as DSOs:: + + shell$ ./configure --enable-mca-dso + #. Build all components as static, except the TCP BTL, which will be built as a DSO:: From 940a414757c04a6575533406ea55cc3c71425425 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 27 Oct 2023 15:54:40 -0400 Subject: [PATCH 63/73] docs: fix spacing error Fix a whitespace error that caused some text to render incorrectly. No content changes. Signed-off-by: Jeff Squyres --- docs/installing-open-mpi/configure-cli-options/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installing-open-mpi/configure-cli-options/installation.rst b/docs/installing-open-mpi/configure-cli-options/installation.rst index 2a6be26bd1b..7a11c75f77d 100644 --- a/docs/installing-open-mpi/configure-cli-options/installation.rst +++ b/docs/installing-open-mpi/configure-cli-options/installation.rst @@ -202,7 +202,7 @@ be used with ``configure``: * ``btl`` specifies the entire BTL framework * ``btl-tcp`` specifies just the TCP component in the BTL framework * ``mtl,btl-tcp`` specifies the entire MTL framework and the TCP - component in the BTL framework + component in the BTL framework Open MPI's ``configure`` script uses the values of these two options when evaluating each component to determine how it should be built From 7b325e3a62bcb1fb5fd96d2ebeee8a46462a59a7 Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Tue, 18 Jul 2023 09:38:41 +0900 Subject: [PATCH 64/73] opal_var_dump_color_keys: fix an array overflow since opal_var_dump_color_keys is used with opal_argv_count() and friends, make sure it is long enough and NULL terminated Thanks to Niv Shpak for reporting this and identifying the root cause. Refs.
open-mpi/ompi#11826 Signed-off-by: Gilles Gouaillardet --- opal/runtime/opal_params_core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/opal/runtime/opal_params_core.c b/opal/runtime/opal_params_core.c index 48837bfdbdd..9e82ec7fee4 100644 --- a/opal/runtime/opal_params_core.c +++ b/opal/runtime/opal_params_core.c @@ -17,8 +17,8 @@ * Copyright (c) 2010-2014 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Hochschule Esslingen. All rights reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2015-2023 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * Copyright (c) 2015 Mellanox Technologies, Inc. * All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. @@ -92,10 +92,11 @@ static bool opal_register_util_done = false; static char *opal_var_dump_color_string = NULL; -static char *opal_var_dump_color_keys[OPAL_VAR_DUMP_COLOR_KEY_COUNT] = { +static char *opal_var_dump_color_keys[OPAL_VAR_DUMP_COLOR_KEY_COUNT+1] = { [OPAL_VAR_DUMP_COLOR_VAR_NAME] = "name", [OPAL_VAR_DUMP_COLOR_VAR_VALUE] = "value", - [OPAL_VAR_DUMP_COLOR_VALID_VALUES] = "valid_values" + [OPAL_VAR_DUMP_COLOR_VALID_VALUES] = "valid_values", + [OPAL_VAR_DUMP_COLOR_KEY_COUNT] = NULL }; /** From fc1e8d562685dcb8a6bf5252cb0c448004ee446e Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 28 Oct 2023 11:46:45 -0400 Subject: [PATCH 65/73] docs/news-5: update credits for the docs revamp No need to list those who are already credited in the Git commit logs. Strengthen the language a bit to be a little more clear that the OMPI developer community played a large role. 
Signed-off-by: Jeff Squyres --- docs/news/news-v5.0.x.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index d5ebfbc3c19..bb7312e2e82 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -242,9 +242,12 @@ Open MPI version 5.0.0 directory. - Many, many people from the Open MPI community contributed to the - overall documentation effort |mdash| not only those who are - listed in the Git commit logs |mdash| including (but not limited - to): + overall documentation effort |mdash| not just those who are + listed in the Git commit logs. Indeed, many Open MPI core + developers contributed their time and effort, as did a fairly + large group of non-core developers (e.g., those who participated + just to help the documentation revamp), including (but not + limited to): - Lachlan Bell - Simon Byrne @@ -254,7 +257,6 @@ Open MPI version 5.0.0 - Sophia Fang - Rick Gleitz - Colton Kammes - - Quincey Koziol - Robert Langfield - Nick Papior - Luz Paz @@ -265,5 +267,3 @@ Open MPI version 5.0.0 - Fangcong Yin - Seth Zegelstein - Yixin Zhang - - William Zhang - From dc0ac61ae17f8f768eacb4e2b9f7f25a7645ea60 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 1 Nov 2023 15:06:05 -0400 Subject: [PATCH 66/73] docs: minor formatting updates for some man pages Signed-off-by: Jeff Squyres --- docs/man-openmpi/man1/mpisync.1.rst | 4 ++-- docs/man-openmpi/man1/ompi-wrapper-compiler.1.rst | 2 +- docs/man-openmpi/man1/ompi_info.1.rst | 2 +- docs/man-openmpi/man1/opal_wrapper.1.rst | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/man-openmpi/man1/mpisync.1.rst b/docs/man-openmpi/man1/mpisync.1.rst index 19c08f3ca25..80195aaaf49 100644 --- a/docs/man-openmpi/man1/mpisync.1.rst +++ b/docs/man-openmpi/man1/mpisync.1.rst @@ -1,4 +1,4 @@ -.. _mpisync: +.. _man1-mpisync: mpisync @@ -6,7 +6,7 @@ mpisync .. 
include_body -Open MPI timing tools +mpisync |mdash| Open MPI timing tools SYNTAX diff --git a/docs/man-openmpi/man1/ompi-wrapper-compiler.1.rst b/docs/man-openmpi/man1/ompi-wrapper-compiler.1.rst index 6a288b38279..d61ad6085b9 100644 --- a/docs/man-openmpi/man1/ompi-wrapper-compiler.1.rst +++ b/docs/man-openmpi/man1/ompi-wrapper-compiler.1.rst @@ -9,7 +9,7 @@ Open MPI Wrapper Compilers .. include_body -mpicc, mpic++, mpicxx, mpifort, mpijavac -- Open MPI wrapper compilers +mpicc, mpic++, mpicxx, mpifort, mpijavac |mdash| Open MPI wrapper compilers SYNTAX ------ diff --git a/docs/man-openmpi/man1/ompi_info.1.rst b/docs/man-openmpi/man1/ompi_info.1.rst index 21313d7bc3b..8b3e11c5761 100644 --- a/docs/man-openmpi/man1/ompi_info.1.rst +++ b/docs/man-openmpi/man1/ompi_info.1.rst @@ -6,7 +6,7 @@ ompi_info .. include_body -ompi_info - Display information about the Open MPI installation +ompi_info |mdash| Display information about the Open MPI installation SYNOPSIS diff --git a/docs/man-openmpi/man1/opal_wrapper.1.rst b/docs/man-openmpi/man1/opal_wrapper.1.rst index 80002dbd015..cd4d4513854 100644 --- a/docs/man-openmpi/man1/opal_wrapper.1.rst +++ b/docs/man-openmpi/man1/opal_wrapper.1.rst @@ -6,7 +6,7 @@ opal_wrapper .. include_body -opal_wrapper - Back-end Open MPI wrapper command +opal_wrapper |mdash| Back-end Open MPI wrapper command DESCRIPTION From bdfbc52427a08d6703c930b7594bd5ac62aa1b63 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 1 Nov 2023 15:06:48 -0400 Subject: [PATCH 67/73] docs/news: remove stale notation We no longer use this "also to appear" / "also appeared" notation, so remove the whole notice about it. 
Signed-off-by: Jeff Squyres --- docs/news/index.rst | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/docs/news/index.rst b/docs/news/index.rst index 6e377a4f6fd..c7a2ece2222 100644 --- a/docs/news/index.rst +++ b/docs/news/index.rst @@ -10,29 +10,6 @@ This file contains the main features as well as overviews of specific bug fixes (and other actions) for each version of Open MPI since version 1.0. -.. error:: GP - move elsewhere and refer to software versioning here. - - As more fully described in the "Software Version Number" section in - the README file, Open MPI typically releases two separate version - series simultaneously. Since these series have different goals and - are semi-independent of each other, a single NEWS-worthy item may be - introduced into different series at different times. For example, - feature F was introduced in the vA.B series at version vA.B.C, and was - later introduced into the vX.Y series at vX.Y.Z. - - The first time feature F is released, the item will be listed in the - vA.B.C section, denoted as: - - (** also to appear: X.Y.Z) -- indicating that this item is also - likely to be included in future release - version vX.Y.Z. - - When vX.Y.Z is later released, the same NEWS-worthy item will also be - included in the vX.Y.Z section and be denoted as: - - (** also appeared: A.B.C) -- indicating that this item was previously - included in release version vA.B.C. - :ref:`search` .. toctree:: From 59afe11a3a93b39affea920d34ae1a73fb14bd3f Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 1 Nov 2023 15:10:21 -0400 Subject: [PATCH 68/73] docs: add more info for packagers about DSOs Add a bunch more content about building components as static or DSOs, particularly for packagers. Add a few cross-references to this new content so that it can be found from a few different places in the docs. 
Signed-off-by: Jeff Squyres --- .../configure-cli-options/installation.rst | 12 +- docs/installing-open-mpi/packagers.rst | 136 +++++++++++++++++- docs/news/news-v5.0.x.rst | 11 +- 3 files changed, 150 insertions(+), 9 deletions(-) diff --git a/docs/installing-open-mpi/configure-cli-options/installation.rst b/docs/installing-open-mpi/configure-cli-options/installation.rst index 7a11c75f77d..783a6236bc7 100644 --- a/docs/installing-open-mpi/configure-cli-options/installation.rst +++ b/docs/installing-open-mpi/configure-cli-options/installation.rst @@ -174,6 +174,11 @@ be used with ``configure``: These two options, along with ``--enable-mca-no-build``, govern the behavior of how Open MPI's frameworks and components are built. + .. tip:: + + :ref:`See this section ` for + advice to packagers about these CLI options. + The ``--enable-mca-dso`` option specifies which frameworks and/or components are built as Dynamic Shared Objects (DSOs). Specifically, DSOs are built as "plugins" outside of the core Open @@ -222,7 +227,7 @@ be used with ``configure``: .. note:: As of Open MPI |ompi_ver|, ``configure``'s global default is to build all components as static (i.e., part of the Open MPI core libraries, not as DSOs). Prior to Open MPI - 5.0.0, the global default behavior was to build + v5.0.0, the global default behavior was to build most components as DSOs. .. important:: If the ``--disable-dlopen`` option is specified, then @@ -267,11 +272,6 @@ be used with ``configure``: shell$ ./configure --enable-mca-dso=btl-tcp --enable-mca-static=btl-tcp - .. tip:: - - :ref:`See this section ` for - advice to packagers about this CLI option. - * ``--enable-mca-no-build=LIST``: Comma-separated list of ``-`` pairs that will not be built. 
For example, ``--enable-mca-no-build=threads-qthreads,pml-monitoring`` will diff --git a/docs/installing-open-mpi/packagers.rst b/docs/installing-open-mpi/packagers.rst index e43d52b101a..a42f58b1af5 100644 --- a/docs/installing-open-mpi/packagers.rst +++ b/docs/installing-open-mpi/packagers.rst @@ -80,8 +80,8 @@ running Open MPI's ``configure`` script. .. _label-install-packagers-dso-or-not: -Components ("plugins"): DSO or no? ----------------------------------- +Components ("plugins"): static or DSO? +-------------------------------------- Open MPI contains a large number of components (sometimes called "plugins") to effect different types of functionality in MPI. For @@ -89,6 +89,69 @@ example, some components effect Open MPI's networking functionality: they may link against specialized libraries to provide highly-optimized network access. +Open MPI can build its components as Dynamic Shared Objects (DSOs) or +statically included in core libraries (regardless of whether those +libraries are built as shared or static libraries). + +.. note:: As of Open MPI |ompi_ver|, ``configure``'s global default is + to build all components as static (i.e., part of the Open + MPI core libraries, not as DSOs). Prior to Open MPI v5.0.0, + the global default behavior was to build most components as + DSOs. + +Why build components as DSOs? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There are advantages to building components as DSOs: + +* Open MPI's core libraries |mdash| and therefore MPI applications + |mdash| will have very few dependencies. For example, if you build + Open MPI with support for a specific network stack, the libraries in + that network stack will be dependencies of the DSOs, not Open MPI's + core libraries (or MPI applications). + +* Removing Open MPI functionality that you do not want is as simple as + removing a DSO from ``$libdir/open-mpi``. + +Why build components as part of Open MPI's core libraries? 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The biggest advantage to building the components as part of Open MPI's +core libraries is when running at (very) large scales when Open MPI is +installed on a network filesystem (vs. being installed on a local +filesystem). + +For example, consider launching a single MPI process on each of 1,000 +nodes. In this scenario, the following is accessed from the network +filesystem: + +#. The MPI application +#. The core Open MPI libraries and their dependencies (e.g., + ``libmpi``) + + * Depending on your configuration, this is probably on the order of + 10-20 library files. + +#. All DSO component files and their dependencies + + * Depending on your configuration, this can be 200+ component + files. + +If all components are physically located in the libraries, then the +third step loads zero DSO component files. When using a networked +filesystem while launching at scale, this can translate to large +performance savings. + +.. note:: If not using a networked filesystem, or if not launching at + scale, loading a large number of DSO files may not consume a + noticeable amount of time during MPI process launch. Put + simply: loading DSOs as individual files generally only + matters when using a networked filesystem while launching at + scale. + +Direct controls for building components as DSOs or not +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Open MPI |ompi_ver| has two ``configure``-time defaults regarding the treatment of components that may be of interest to packagers: @@ -151,3 +214,72 @@ binary package, and can install the additional "accelerator" Open MPI binary sub-package if they actually have accelerator hardware installed (which will cause the installation of additional dependencies). + +..
_label-install-packagers-gnu-libtool-dependency-flattening: + +GNU Libtool dependency flattening +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When compiling Open MPI's components statically as part of Open MPI's +core libraries, `GNU Libtool `_ +|mdash| which is used as part of Open MPI's build system |mdash| will +attempt to "flatten" dependencies. + +For example, the :ref:`ompi_info(1) ` command links +against the Open MPI core library ``libopen-pal``. This library will +have dependencies on various HPC-class network stack libraries. For +simplicity, the discussion below assumes that Open MPI was built with +support for `Libfabric `_ and `UCX +`_, and therefore ``libopen-pal`` has direct +dependencies on ``libfabric`` and ``libucx``. + +In this scenario, GNU Libtool will automatically attempt to "flatten" +these dependencies by linking :ref:`ompi_info(1) ` +directly to ``libfabric`` and ``libucx`` (vs. letting ``libopen-pal`` +pull the dependencies in at run time). + +* In some environments (e.g., Ubuntu 22.04), the compiler and/or + linker will automatically utilize the linker CLI flag + ``-Wl,--as-needed``, which will effectively cause these dependencies + to *not* be flattened: :ref:`ompi_info(1) ` will + *not* have direct dependencies on either ``libfabric`` or + ``libucx``. + +* In other environments (e.g., Fedora 38), the compiler and linker + will *not* utilize the ``-Wl,--as-needed`` linker CLI flag. As + such, :ref:`ompi_info(1) ` will show direct + dependencies on ``libfabric`` and ``libucx``. + +**Just to be clear:** these flattened dependencies *are not a +problem*. Open MPI will function correctly with or without the +flattened dependencies. There is no performance impact associated +with having |mdash| or not having |mdash| the flattened dependencies.
+We mention this situation here in the documentation simply because it +surprised some Open MPI downstream package managers to see that +:ref:`ompi_info(1) ` in Open MPI |ompi_ver| had more +shared library dependencies than it did in prior Open MPI releases. + +If packagers want :ref:`ompi_info(1) ` to not have +these flattened dependencies, use either of the following mechanisms: + +#. Use ``--enable-mca-dso`` to force all components to be built as + DSOs (this was actually the default behavior before Open MPI v5.0.0). + +#. Add ``LDFLAGS=-Wl,--as-needed`` to the ``configure`` command line + when building Open MPI. + + .. note:: The Open MPI community specifically chose not to + automatically utilize this linker flag for the following + reasons: + + #. Having the flattened dependencies does not cause any + correctness or performance problems. + #. There's multiple mechanisms (see above) for users or + packagers to change this behavior, if desired. + #. Certain environments have chosen to have |mdash| or + not have |mdash| this flattened dependency behavior. + It is not Open MPI's place to override these choices. + #. In general, Open MPI's ``configure`` script only + utilizes compiler and linker flags if they are + *needed*. All other flags should be the user's / + packager's choice. diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index bb7312e2e82..b6ce469c9cd 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -221,10 +221,19 @@ Open MPI version 5.0.0 - The default atomics have been changed to be GCC, with C11 as a fallback. C11 atomics incurs sequential memory ordering, which in most cases is not desired. + - The default build mode has changed from building Open MPI's + components as Dynamic Shared Objects (DSOs) to being statically + included in their respective libraries. + + .. important:: This has consequences for packagers. Be sure to + read the :ref:`GNU Libtool dependency flattening + ` + subsection. 
+ - Various datatype bugfixes and performance improvements. - Various pack/unpack bugfixes and performance improvements. - Various OSHMEM bugfixes and performance improvements. - - Thanks to Jeff Hammond, Pak Lui, Felix Uhl, Naribayashi Akira, + - Thanks to Jeff Hammond, Pak Lui, Felix Uhl, Naribayashi Akira, Julien Emmanuel, and Yaz Saito for their invaluable contributions. - Documentation updates and improvements: From 01ec480cb19e65c3ac8cea0b0f9a7fd383f1095b Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Wed, 1 Nov 2023 10:50:04 +0900 Subject: [PATCH 69/73] MPI_Neighbor_alltoall{v,w}_init: correctly handle NULL parameters Thanks Junchao Zhang for the report. Refs. open-mpi/ompi#12037 Signed-off-by: Gilles Gouaillardet --- ompi/mpi/c/neighbor_alltoallv_init.c | 11 ++++++----- ompi/mpi/c/neighbor_alltoallw_init.c | 11 +++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ompi/mpi/c/neighbor_alltoallv_init.c b/ompi/mpi/c/neighbor_alltoallv_init.c index 57bba5afcb9..66bce716217 100644 --- a/ompi/mpi/c/neighbor_alltoallv_init.c +++ b/ompi/mpi/c/neighbor_alltoallv_init.c @@ -13,7 +13,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2019 Research Organization for Information Science + * Copyright (c) 2014-2023 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -102,14 +102,15 @@ int MPI_Neighbor_alltoallv_init(const void *sendbuf, const int sendcounts[], con } else if (! 
OMPI_COMM_IS_TOPO(comm)) { return OMPI_ERRHANDLER_NOHANDLE_INVOKE(MPI_ERR_TOPOLOGY, FUNC_NAME); - } else if ((NULL == sendcounts) || (NULL == sdispls) || - (NULL == recvcounts) || (NULL == rdispls) || - MPI_IN_PLACE == sendbuf || MPI_IN_PLACE == recvbuf) { - return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME); } err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree); OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); + if (((0 < outdegree) && ((NULL == sendcounts) || (NULL == sdispls))) || + ((0 < indegree) && ((NULL == recvcounts) || (NULL == rdispls))) || + MPI_IN_PLACE == sendbuf || MPI_IN_PLACE == recvbuf) { + return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME); + } for (i = 0; i < outdegree; ++i) { OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtype, sendcounts[i]); OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); diff --git a/ompi/mpi/c/neighbor_alltoallw_init.c b/ompi/mpi/c/neighbor_alltoallw_init.c index 040b6ed6f49..891f7706afb 100644 --- a/ompi/mpi/c/neighbor_alltoallw_init.c +++ b/ompi/mpi/c/neighbor_alltoallw_init.c @@ -13,7 +13,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2017 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2014-2019 Research Organization for Information Science + * Copyright (c) 2014-2023 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ @@ -102,14 +102,13 @@ int MPI_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[], con FUNC_NAME); } - if ((NULL == sendcounts) || (NULL == sdispls) || (NULL == sendtypes) || - (NULL == recvcounts) || (NULL == rdispls) || (NULL == recvtypes) || + err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree); + OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); + if (((0 < outdegree) && ((NULL == sendcounts) || (NULL == sdispls) || (NULL == sendtypes))) || + ((0 < indegree) && ((NULL == recvcounts) || (NULL == rdispls) || (NULL == recvtypes))) || MPI_IN_PLACE == sendbuf || MPI_IN_PLACE == recvbuf) { return OMPI_ERRHANDLER_INVOKE(comm, MPI_ERR_ARG, FUNC_NAME); } - - err = mca_topo_base_neighbor_count (comm, &indegree, &outdegree); - OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); for (i = 0; i < outdegree; ++i) { OMPI_CHECK_DATATYPE_FOR_SEND(err, sendtypes[i], sendcounts[i]); OMPI_ERRHANDLER_CHECK(err, comm, err, FUNC_NAME); From 340f8f017be16b561148af2d1444662defa4ae5c Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Wed, 1 Nov 2023 15:11:20 -0400 Subject: [PATCH 70/73] docs: Add info about building accelerators as DSOs Add information about building the accelerator-related components as DSOs, and an explanation of why packagers might want to do so. We need these docs for v5.0.0; if some of these specifics change later in the v5.0.x series (e.g., if we start building these components as DSOs by default), this documentation can be updated to match.
Signed-off-by: Jeff Squyres --- docs/installing-open-mpi/packagers.rst | 65 +++++++++++++++++++------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/docs/installing-open-mpi/packagers.rst b/docs/installing-open-mpi/packagers.rst index a42f58b1af5..c0fb13ea23a 100644 --- a/docs/installing-open-mpi/packagers.rst +++ b/docs/installing-open-mpi/packagers.rst @@ -198,22 +198,9 @@ using ``--enable-mca-dso`` to selectively build some components as DSOs and leave the others included in their respective Open MPI libraries. -.. code:: sh - - # Build all the "accelerator" components as DSOs (all other - # components will default to being built in their respective - # libraries) - shell$ ./configure --enable-mca-dso=accelerator ... - -This allows packaging ``$libdir`` as part of the "main" Open MPI -binary package, but then packaging -``$libdir/openmpi/mca_accelerator_*.so`` as sub-packages. These -sub-packages may inherit dependencies on the CUDA and/or ROCM -packages, for example. User can always install the "main" Open MPI -binary package, and can install the additional "accelerator" Open MPI -binary sub-package if they actually have accelerator hardware -installed (which will cause the installation of additional -dependencies). +:ref:`See the section on building accelerator support +<label-install-packagers-building-accelerator-support-as-dsos>` for a +practical example where this can be useful. .. _label-install-packagers-gnu-libtool-dependency-flattening: @@ -283,3 +270,49 @@ these flattened dependencies, use either of the following mechanisms: utilizes compiler and linker flags if they are *needed*. All other flags should be the user's / packager's choice. + +.. _label-install-packagers-building-accelerator-support-as-dsos: + +Building accelerator support as DSOs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you are building a package that includes support for one or more +accelerators, it may be desirable to build accelerator-related +components as DSOs (see the :ref:`static or DSO? +` section for details). + +..
admonition:: Rationale + :class: tip + + Accelerator hardware is expensive, and may only be present on some + compute nodes in an HPC cluster. Specifically: there may not be + any accelerator hardware on "head" or compile nodes in an HPC + cluster. As such, invoking Open MPI commands on a "head" node with + an MPI that was built with static accelerator support but no + accelerator hardware may fail to launch because of run-time linker + issues (because the accelerator hardware support libraries are + likely not present). + + Building Open MPI's accelerator-related components as DSOs allows + Open MPI to *try* opening the accelerator components, but proceed + if those DSOs fail to open due to the lack of support libraries. + +Using the ``--enable-mca-dso`` command line parameter to Open MPI's +``configure`` command allows packagers to build all +accelerator-related components as DSOs. For example: + +.. code:: sh + + # Build all the accelerator-related components as DSOs (all other + # components will default to being built in their respective + # libraries) + shell$ ./configure --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator + +Per the example above, this allows packaging ``$libdir`` as part of +the "main" Open MPI binary package, but then packaging +``$libdir/openmpi/mca_accelerator_*.so`` and the other named +components as sub-packages. These sub-packages may inherit +dependencies on the CUDA and/or ROCM packages, for example. The +"main" package can be installed on all nodes, and the +accelerator-specific sub-package can be installed on only the nodes +with accelerator hardware and support libraries.
From 7dd6bfaf0b141a74bcf673bf0abe4768323a4838 Mon Sep 17 00:00:00 2001 From: nileshnegi Date: Thu, 2 Nov 2023 21:39:34 +0000 Subject: [PATCH 71/73] opal/mca/accelerator: ROCm 6.0 incompatibility fix Signed-off-by: nileshnegi --- opal/mca/accelerator/rocm/accelerator_rocm_module.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index bf7ea584b7a..d5640db2100 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -85,14 +85,22 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 *flags = 0; err = hipPointerGetAttributes(&srcAttr, addr); if (hipSuccess == err) { +#if HIP_VERSION >= 50731921 + if (hipMemoryTypeDevice == srcAttr.type) { +#else if (hipMemoryTypeDevice == srcAttr.memoryType) { +#endif //We might want to set additional flags in a later iteration. //*flags |= MCA_ACCELERATOR_FLAGS_HOST_LDSTR; //*flags |= MCA_ACCELERATOR_FLAGS_HOST_ATOMICS; /* First access on a device pointer triggers ROCM support lazy initialization. 
*/ opal_accelerator_rocm_lazy_init(); ret = 1; +#if HIP_VERSION >= 50731921 + } else if (hipMemoryTypeUnified == srcAttr.type) { +#else } else if (hipMemoryTypeUnified == srcAttr.memoryType) { +#endif *flags |= MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY; //*flags |= MCA_ACCELERATOR_FLAGS_HOST_LDSTR; //*flags |= MCA_ACCELERATOR_FLAGS_HOST_ATOMICS; From 2a93bda00af76537e45a8e934105670557e72fc7 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 3 Nov 2023 15:19:21 +0000 Subject: [PATCH 72/73] pr-checks: update compile-rocm workflow - switch to using ubuntu 22.04 base image - set LD_LIBRARY_PATH before compiling Open MPI to avoid a link error with ompi_info - remove the cleanup step Signed-off-by: Edgar Gabriel --- .github/workflows/compile-rocm.yaml | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/.github/workflows/compile-rocm.yaml b/.github/workflows/compile-rocm.yaml index cf4ad932032..d4bf54a6f7f 100644 --- a/.github/workflows/compile-rocm.yaml +++ b/.github/workflows/compile-rocm.yaml @@ -6,18 +6,21 @@ env: ROCM_VER: 5-4 jobs: compile-rocm: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Install dependencies run: | - sudo apt-get update - sudo apt-get install -y --no-install-recommends wget lsb-core software-properties-common gpg curl + sudo apt update + sudo apt install -y --no-install-recommends wget lsb-core software-properties-common gpg curl - name: Install extra dependencies run: | - curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg - echo 'deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/debian focal main' | sudo tee /etc/apt/sources.list.d/rocm.list - sudo apt-get update - sudo apt-get install -y rocm-hip-runtime + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > 
/dev/null + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/5.7.1/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7.1 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list + echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 + sudo apt update + sudo apt install -y rocm-hip-runtime - uses: actions/checkout@v3 with: submodules: recursive @@ -25,10 +28,4 @@ jobs: run: | ./autogen.pl ./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran - make -j - - name: Clean up - run: | - ls -la ./ - rm -rf ./* - rm -rf ./.??* - ls -la ./ \ No newline at end of file + LD_LIBRARY_PATH=/opt/rocm/lib make -j \ No newline at end of file From 0254552c8519e980ddf20b4d571ba06570c0c466 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Fri, 3 Nov 2023 15:23:06 +0000 Subject: [PATCH 73/73] accelerator/rocm: update configure logic and Makefile update configure and compile logic to avoid adding hip libraries to the generic Open MPI LIBS. 
Signed-off-by: Edgar Gabriel --- config/opal_check_rocm.m4 | 12 ++++-------- opal/mca/accelerator/rocm/Makefile.am | 2 +- opal/mca/accelerator/rocm/configure.m4 | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/config/opal_check_rocm.m4 b/config/opal_check_rocm.m4 index 17f671d6309..25ac54e438e 100644 --- a/config/opal_check_rocm.m4 +++ b/config/opal_check_rocm.m4 @@ -36,7 +36,7 @@ AC_DEFUN([OPAL_CHECK_ROCM],[ [ with_rocm="/opt/rocm"] ) rocm_CPPFLAGS="-D__HIP_PLATFORM_AMD__" - rocm_LDFLAGS="-L${with_rocm}/lib/hip" + rocm_LDFLAGS="-L${with_rocm}/lib/" AS_IF([ test -n "$with_rocm" && test "$with_rocm" != "no" ], [ OPAL_APPEND([CPPFLAGS], [$rocm_CPPFLAGS]) @@ -52,15 +52,11 @@ AC_DEFUN([OPAL_CHECK_ROCM],[ LDFLAGS="$rocm_save_LDFLAGS" LIBS="$rocm_save_LIBS" - OPAL_APPEND([CPPFLAGS], [${$1_CPPFLAGS}] ) - OPAL_APPEND([LDFLAGS], [${$1_LDFLAGS}] ) - OPAL_APPEND([LIBS], [${$1_LIBS}] ) - - AS_IF([ test "$opal_check_rocm_happy" = "no" ], - [ CPPFLAGS="$rocm_save_CPPFLAGS"]) + CPPFLAGS="$rocm_save_CPPFLAGS" AS_IF([ test "$opal_check_rocm_happy" = "yes" ], - [ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support]) + [ OPAL_APPEND([$1_CPPFLAGS], [$rocm_CPPFLAGS]) + AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [1], [Enable ROCm support]) ROCM_SUPPORT=1 ], [ AC_DEFINE_UNQUOTED([OPAL_ROCM_SUPPORT], [0], [Disable ROCm support]) ROCM_SUPPORT=0 ]) diff --git a/opal/mca/accelerator/rocm/Makefile.am b/opal/mca/accelerator/rocm/Makefile.am index f16e30ff3cd..cb89cae2697 100644 --- a/opal/mca/accelerator/rocm/Makefile.am +++ b/opal/mca/accelerator/rocm/Makefile.am @@ -13,7 +13,7 @@ # -AM_CPPFLAGS = $(common_rocm_CPPFLAGS) +AM_CPPFLAGS = $(opal_rocm_CPPFLAGS) sources = \ accelerator_rocm.h \ diff --git a/opal/mca/accelerator/rocm/configure.m4 b/opal/mca/accelerator/rocm/configure.m4 index bd3d1833d8b..780e0d8c793 100644 --- a/opal/mca/accelerator/rocm/configure.m4 +++ b/opal/mca/accelerator/rocm/configure.m4 @@ -24,4 +24,5 @@ 
AC_DEFUN([MCA_opal_accelerator_rocm_CONFIG],[ [$2]) AC_SUBST([opal_rocm_LDFLAGS]) AC_SUBST([opal_rocm_LIBS]) + AC_SUBST([opal_rocm_CPPFLAGS]) ])dnl