Support for dynamic MPI library loading (#641)

* MPI library can be now dynamically loaded at runtime. * Use new CMake option -DCORENRN_ENABLE_DYNAMIC_MPI=ON in order to enable dynamic mpi support. When enabled, separate mpi library is built and MPI itself is not linked to libcoreneuron.so library. * Changes done for dynamic MPI support - Avoid including MPI headers everywhere - mpi related code now separated into coreneuron/mpi/lib - mpi functions are renamed to separate functions with name <original_function>_impl - New code added to dynamically load coreneuron's mpi library function and dispatch them to appropriate functions - Dynamic mpi library also works when libcoreneuron is a stiatic library - Always use _impl version of functions inside mpi/lib - Avoid use of global variables, pass references/pointers to simplify dynamic library support - Protect mpi call at runtine if --mpi is not given on command line - Load dynamic mpi library only if --mpi is given * Code refactoring changes - Delete unused function pgvts_op - nrnmpi_initialized return boolean - Simplify include of extern nrnmpi_comm - Move nrnmpi_def_cinc.h to nrnmpi_dev_cinc.cpp - Use static_cast instead of c-cast - update code docs under comment (e.g. mkdynam.sh) - No more debug output on stdio - less GLOB in cmake - Remove unnecessary mpi communicators (_worlds), bb savestate * CI and testing - LFP test disabled with dynamic MPI as it uses MPI functions directly - Set set_tests_properties for DYLD/LD_LIBRARY_PATH * Future work * build a separate library for each MPI implementation * integration with neuron for wheel support * fix build/support for windows platform fixes #600 Co-authored-by: Olli Lupton <[email protected]>
BlueBrain · Oct 8, 2021 · 6342df2 · 6342df2
1 parent b5775c7
commit 6342df2
Show file tree

Hide file tree

Showing 44 changed files with 1,133 additions and 1,166 deletions.
diff --git a/.github/workflows/coreneuron-ci.yml b/.github/workflows/coreneuron-ci.yml
@@ -36,6 +36,8 @@ jobs:
         config:
           # Defaults: CORENRN_ENABLE_SOA=ON CORENRN_ENABLE_MPI=ON
           - {cmake_option: "-DCORENRN_ENABLE_MPI=ON", documentation: ON}
+          - {cmake_option: "-DCORENRN_ENABLE_DYNAMIC_MPI=ON"}
+          - {cmake_option: "-DCORENRN_ENABLE_DYNAMIC_MPI=ON -DCORENRN_ENABLE_SHARED=OFF"}
           - {cmake_option: "-DCORENRN_ENABLE_MPI=OFF"}
           - {cmake_option: "-DCORENRN_ENABLE_SOA=OFF"}
           - {use_nmodl: ON, py_version: 3.6.7}

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -88,6 +88,7 @@ option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation"
 option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
 option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
 option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
+option(CORENRN_ENABLE_DYNAMIC_MPI "Enable dynamic MPI support" OFF)
 option(CORENRN_ENABLE_SOA "Enable SoA Memory Layout" ON)
 option(CORENRN_ENABLE_HOC_EXP "Enable wrapping exp with hoc_exp()" OFF)
 option(CORENRN_ENABLE_SPLAYTREE_QUEUING "Enable use of Splay tree for spike queuing" ON)
@@ -331,6 +332,13 @@ set(NMODL_ENABLE_LEGACY_UNITS
     ${CORENRN_ENABLE_LEGACY_UNITS}
     CACHE BOOL "" FORCE)
 
+if(CORENRN_ENABLE_DYNAMIC_MPI)
+  if(NOT CORENRN_ENABLE_MPI)
+    message(FATAL_ERROR "Cannot enable dynamic mpi without mpi")
+  endif()
+  add_compile_definitions(CORENRN_ENABLE_DYNAMIC_MPI)
+endif()
+
 if(CORENRN_ENABLE_PRCELLSTATE)
   set(CORENRN_NRN_PRCELLSTATE 1)
 else()
@@ -489,6 +497,7 @@ message(STATUS "Build Type          | ${COMPILE_LIBRARY_TYPE}")
 message(STATUS "MPI                 | ${CORENRN_ENABLE_MPI}")
 if(CORENRN_ENABLE_MPI)
   message(STATUS "  INC               | ${MPI_CXX_INCLUDE_PATH}")
+  message(STATUS "  DYNAMIC           | ${CORENRN_ENABLE_DYNAMIC_MPI}")
 endif()
 message(STATUS "OpenMP              | ${CORENRN_ENABLE_OPENMP}")
 message(STATUS "Use legacy units    | ${CORENRN_ENABLE_LEGACY_UNITS}")

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
@@ -32,8 +32,11 @@ file(
   "utils/*/*.c"
   "utils/*/*.cpp")
 file(GLOB_RECURSE CORENEURON_CUDA_FILES "*.cu")
-file(GLOB SCOPMATH_CODE_FILES "sim/scopmath/*.cpp")
-file(GLOB MPI_CODE_FILES "mpi/*.cpp")
+set(SCOPMATH_CODE_FILES
+    "sim/scopmath/abort.cpp" "sim/scopmath/crout_thread.cpp" "sim/scopmath/newton_thread.cpp"
+    "sim/scopmath/sparse_thread.cpp" "sim/scopmath/ssimplic_thread.cpp")
+set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" "mpi/lib/nrnmpi.cpp")
+set(MPI_CORE_FILES "mpi/core/nrnmpi_def_cinc.cpp" "mpi/core/nrnmpi.cpp" "mpi/core/nrnmpidec.cpp")
 file(COPY ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123
      DESTINATION ${CMAKE_BINARY_DIR}/include)
 list(APPEND CORENEURON_CODE_FILES ${PROJECT_BINARY_DIR}/coreneuron/config/config.cpp)
@@ -165,8 +168,13 @@ add_custom_target(kin_deriv_header DEPENDS "${KINDERIV_HEADER_FILE}")
 # create libraries
 # =============================================================================
 
-# mpi related target, this will be a separate library for dynamic MPI
-add_library(corenrn_mpi OBJECT ${MPI_CODE_FILES})
+# mpi related target, this is a separate library for dynamic MPI
+if(CORENRN_ENABLE_DYNAMIC_MPI)
+  add_library(corenrn_mpi SHARED ${MPI_LIB_FILES})
+else()
+  add_library(corenrn_mpi OBJECT ${MPI_LIB_FILES})
+  set(OBJ_MPI $<TARGET_OBJECTS:corenrn_mpi>)
+endif()
 target_include_directories(corenrn_mpi PRIVATE ${MPI_INCLUDE_PATH})
 set_property(TARGET corenrn_mpi PROPERTY POSITION_INDEPENDENT_CODE ON)
 
@@ -178,8 +186,17 @@ add_library(
   ${CORENEURON_TEMPLATE_FILES}
   ${CORENEURON_CODE_FILES}
   ${cudacorenrn_objs}
-  $<TARGET_OBJECTS:corenrn_mpi>
-  ${NMODL_INBUILT_MOD_OUTPUTS})
+  ${NMODL_INBUILT_MOD_OUTPUTS}
+  ${MPI_CORE_FILES}
+  ${OBJ_MPI})
+if(CORENRN_ENABLE_DYNAMIC_MPI)
+  target_link_libraries(coreneuron ${CMAKE_DL_LIBS})
+  target_link_libraries(corenrn_mpi ${MPI_CXX_LIBRARIES})
+  target_compile_definitions(coreneuron
+                             PUBLIC CMAKE_SHARED_LIBRARY_SUFFIX=${CMAKE_SHARED_LIBRARY_SUFFIX})
+else()
+  target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES})
+endif()
 # Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds.
 # The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`.
 set_target_properties(coreneuron PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -194,14 +211,15 @@ add_dependencies(coreneuron kin_deriv_header nrnivmodl-core)
 add_library(scopmath STATIC ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES})
 
 target_link_libraries(coreneuron ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB}
-                      ${likwid_LIBRARIES} ${MPI_CXX_LIBRARIES})
+                      ${likwid_LIBRARIES})
+
 target_include_directories(coreneuron SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include)
 target_include_directories(coreneuron SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
 
 set_target_properties(
-  coreneuron scopmath
+  coreneuron scopmath corenrn_mpi
   PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
              LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
              POSITION_INDEPENDENT_CODE ON)

diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
@@ -14,6 +14,7 @@
 
 #include <cstring>
 #include <climits>
+#include <dlfcn.h>
 #include <memory>
 #include <vector>
 
@@ -45,6 +46,7 @@
 #include "coreneuron/io/file_utils.hpp"
 #include "coreneuron/io/nrn2core_direct.h"
 #include "coreneuron/io/core2nrn_data_return.hpp"
+#include "coreneuron/utils/utils.hpp"
 
 extern "C" {
 const char* corenrn_version() {
@@ -201,7 +203,7 @@ void nrn_init_and_load_data(int argc,
 
 // if multi-threading enabled, make sure mpi library supports it
 #if NRNMPI
-    if (corenrn_param.threading) {
+    if (corenrn_param.mpi_enable && corenrn_param.threading) {
         nrnmpi_check_threading_support();
     }
 #endif
@@ -448,14 +450,35 @@ std::unique_ptr<ReportHandler> create_report_handler(ReportConfiguration& config
 
 using namespace coreneuron;
 
+#if NRNMPI
+#define STRINGIFY(x) #x
+#define TOSTRING(x)  STRINGIFY(x)
+static void* load_dynamic_mpi() {
+    dlerror();
+    void* handle = dlopen("libcorenrn_mpi" TOSTRING(CMAKE_SHARED_LIBRARY_SUFFIX),
+                          RTLD_NOW | RTLD_GLOBAL);
+    const char* error = dlerror();
+    if (error) {
+        std::string err_msg = std::string("Could not open dynamic MPI library: ") + error + "\n";
+        throw std::runtime_error(err_msg);
+    }
+    return handle;
+}
+#endif
 
 extern "C" void mk_mech_init(int argc, char** argv) {
     // read command line parameters and parameter config files
     corenrn_param.parse(argc, argv);
 
 #if NRNMPI
     if (corenrn_param.mpi_enable) {
-        nrnmpi_init(&argc, &argv);
+#ifdef CORENRN_ENABLE_DYNAMIC_MPI
+        auto mpi_handle = load_dynamic_mpi();
+        mpi_manager().resolve_symbols(mpi_handle);
+#endif
+        auto ret = nrnmpi_init(&argc, &argv);
+        nrnmpi_numprocs = ret.numprocs;
+        nrnmpi_myid = ret.myid;
     }
 #endif
 
@@ -514,7 +537,9 @@ extern "C" int run_solve_core(int argc, char** argv) {
         mkdir_p(output_dir.c_str());
     }
 #if NRNMPI
-    nrnmpi_barrier();
+    if (corenrn_param.mpi_enable) {
+        nrnmpi_barrier();
+    }
 #endif
     bool compute_gpu = corenrn_param.gpu;
     bool skip_mpi_finalize = corenrn_param.skip_mpi_finalize;
@@ -643,7 +668,7 @@ extern "C" int run_solve_core(int argc, char** argv) {
 
 // mpi finalize
 #if NRNMPI
-    if (!skip_mpi_finalize) {
+    if (corenrn_param.mpi_enable && !skip_mpi_finalize) {
         nrnmpi_finalize();
     }
 #endif

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -19,7 +19,8 @@
 #include "coreneuron/sim/scopmath/newton_struct.h"
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/utils/nrnoc_aux.hpp"
-#include "coreneuron/mpi/nrnmpi.h"
+#include "coreneuron/mpi/nrnmpidec.h"
+#include "coreneuron/utils/utils.hpp"
 
 #ifdef _OPENACC
 #include <openacc.h>

diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp
@@ -1,12 +1,12 @@
 #include "coreneuron/io/lfp.hpp"
+#include "coreneuron/apps/corenrn_parameters.hpp"
 
 #include <cmath>
 #include <limits>
 #include <sstream>
 
 
 namespace coreneuron {
-
 // extern variables require acc declare
 #pragma acc declare create(pi)
 
@@ -112,12 +112,15 @@ inline void LFPCalculator<Type, SegmentIdTy>::lfp(const Vector& membrane_current
         }
     }
 #if NRNMPI
-    lfp_values_.resize(res.size());
-    int mpi_sum{1};
-    nrnmpi_dbl_allreduce_vec(res.data(), lfp_values_.data(), res.size(), mpi_sum);
-#else
-    std::swap(res, lfp_values_);
+    if (corenrn_param.mpi_enable) {
+        lfp_values_.resize(res.size());
+        int mpi_sum{1};
+        nrnmpi_dbl_allreduce_vec(res.data(), lfp_values_.data(), res.size(), mpi_sum);
+    } else
 #endif
+    {
+        std::swap(res, lfp_values_);
+    }
 }
 
 

diff --git a/coreneuron/io/lfp.hpp b/coreneuron/io/lfp.hpp
@@ -3,7 +3,7 @@
 #include <array>
 #include <vector>
 
-#include "coreneuron/mpi/nrnmpidec.h"
+#include "coreneuron/mpi/nrnmpi.h"
 #include "coreneuron/nrnconf.h"
 #include "coreneuron/utils/nrn_assert.h"
 

diff --git a/coreneuron/io/mech_report.cpp b/coreneuron/io/mech_report.cpp
@@ -11,9 +11,9 @@
 
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/mpi/nrnmpi.h"
+#include "coreneuron/apps/corenrn_parameters.hpp"
 
 namespace coreneuron {
-
 /** display global mechanism count */
 void write_mech_report() {
     /// mechanim count across all gids, local to rank
@@ -33,15 +33,19 @@ void write_mech_report() {
     std::vector<long> total_mech_count(n_memb_func);
 
 #if NRNMPI
-    /// get global sum of all mechanism instances
-    nrnmpi_long_allreduce_vec(&local_mech_count[0],
-                              &total_mech_count[0],
-                              local_mech_count.size(),
-                              1);
-
-#else
-    total_mech_count = local_mech_count;
+    if (corenrn_param.mpi_enable) {
+        /// get global sum of all mechanism instances
+        nrnmpi_long_allreduce_vec(&local_mech_count[0],
+                                  &total_mech_count[0],
+                                  local_mech_count.size(),
+                                  1);
+
+    } else
 #endif
+    {
+        total_mech_count = local_mech_count;
+    }
+
     /// print global stats to stdout
     if (nrnmpi_myid == 0) {
         printf("\n================ MECHANISMS COUNT BY TYPE ==================\n");

diff --git a/coreneuron/io/nrn_checkpoint.cpp b/coreneuron/io/nrn_checkpoint.cpp
@@ -23,9 +23,9 @@
 #include "coreneuron/permute/node_permute.h"
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/utils/nrnoc_aux.hpp"
+#include "coreneuron/apps/corenrn_parameters.hpp"
 
 namespace coreneuron {
-
 // Those functions comes from mod file directly
 extern int checkpoint_save_patternstim(_threadargsproto_);
 extern void checkpoint_restore_patternstim(int, double, _threadargsproto_);
@@ -62,7 +62,9 @@ void CheckPoints::write_checkpoint(NrnThread* nt, int nb_threads) const {
     }
 
 #if NRNMPI
-    nrnmpi_barrier();
+    if (corenrn_param.mpi_enable) {
+        nrnmpi_barrier();
+    }
 #endif
 
     /**
@@ -79,7 +81,9 @@ void CheckPoints::write_checkpoint(NrnThread* nt, int nb_threads) const {
         write_time();
     }
 #if NRNMPI
-    nrnmpi_barrier();
+    if (corenrn_param.mpi_enable) {
+        nrnmpi_barrier();
+    }
 #endif
 }
 

diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
@@ -22,7 +22,9 @@
 #include "coreneuron/utils/nrn_assert.h"
 #include "coreneuron/utils/nrnmutdec.h"
 #include "coreneuron/utils/memory.h"
+#include "coreneuron/utils/utils.hpp"
 #include "coreneuron/mpi/nrnmpi.h"
+#include "coreneuron/mpi/core/nrnmpi.hpp"
 #include "coreneuron/io/nrn_setup.hpp"
 #include "coreneuron/network/partrans.hpp"
 #include "coreneuron/io/nrn_checkpoint.hpp"
@@ -147,8 +149,6 @@ void (*nrn2core_all_weights_return_)(std::vector<double*>& weights);
 // files with the first containing output_gids and netcon_srcgid which are
 // stored in the nt.presyns array and nt.netcons array respectively
 namespace coreneuron {
-extern corenrn_parameters corenrn_param;
-
 static OMP_Mutex mut;
 
 /// Vector of maps for negative presyns
@@ -1091,18 +1091,21 @@ size_t model_size(bool detailed_report) {
     if (detailed_report) {
         size_data[12] = nbyte;
 #if NRNMPI
-        // last arg is op type where 1 is sum, 2 is max and any other value is min
-        nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_sum[0], 13, 1);
-        nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_max[0], 13, 2);
-        nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_min[0], 13, 3);
-        for (int i = 0; i < 13; i++) {
-            global_size_data_avg[i] = global_size_data_sum[i] / float(nrnmpi_numprocs);
-        }
-#else
-        global_size_data_max = size_data;
-        global_size_data_min = size_data;
-        global_size_data_avg.assign(size_data.cbegin(), size_data.cend());
+        if (corenrn_param.mpi_enable) {
+            // last arg is op type where 1 is sum, 2 is max and any other value is min
+            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_sum[0], 13, 1);
+            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_max[0], 13, 2);
+            nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_min[0], 13, 3);
+            for (int i = 0; i < 13; i++) {
+                global_size_data_avg[i] = global_size_data_sum[i] / float(nrnmpi_numprocs);
+            }
+        } else
 #endif
+        {
+            global_size_data_max = size_data;
+            global_size_data_min = size_data;
+            global_size_data_avg.assign(size_data.cbegin(), size_data.cend());
+        }
         // now print the collected data:
         if (nrnmpi_myid == 0) {
             printf("Memory size information for all NrnThreads per rank\n");
@@ -1197,9 +1200,11 @@ size_t model_size(bool detailed_report) {
     }
 
 #if NRNMPI
-    long global_nbyte = 0;
-    nrnmpi_long_allreduce_vec(&nbyte, &global_nbyte, 1, 1);
-    nbyte = global_nbyte;
+    if (corenrn_param.mpi_enable) {
+        long global_nbyte = 0;
+        nrnmpi_long_allreduce_vec(&nbyte, &global_nbyte, 1, 1);
+        nbyte = global_nbyte;
+    }
 #endif
 
     return nbyte;