From 6342df2b2aaac0f0f62095f4095c5c2723242af5 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu
Date: Fri, 8 Oct 2021 23:48:46 +0200
Subject: [PATCH] Support for dynamic MPI library loading (#641)

* The MPI library can now be dynamically loaded at runtime.

* Use the new CMake option -DCORENRN_ENABLE_DYNAMIC_MPI=ON to enable dynamic
  MPI support. When enabled, a separate MPI library is built and MPI itself
  is not linked into the libcoreneuron.so library.

* Changes done for dynamic MPI support
  - Avoid including MPI headers everywhere
  - MPI-related code is now separated into coreneuron/mpi/lib
  - MPI functions are renamed to separate functions with an _impl suffix
  - New code added to dynamically load CoreNEURON's MPI library and dispatch
    calls to the appropriate _impl functions
  - The dynamic MPI library also works when libcoreneuron is a static library
  - Always use the _impl version of functions inside mpi/lib
  - Avoid use of global variables; pass references/pointers to simplify
    dynamic library support
  - Protect MPI calls at runtime if --mpi is not given on the command line
  - Load the dynamic MPI library only if --mpi is given

* Code refactoring changes
  - Delete unused function pgvts_op
  - nrnmpi_initialized returns a boolean
  - Simplify include of extern nrnmpi_comm
  - Move nrnmpi_def_cinc.h to nrnmpi_def_cinc.cpp
  - Use static_cast instead of C-style casts
  - Update code docs in comments (e.g. mkdynam.sh)
  - No more debug output to stdout
  - Less GLOB in CMake
  - Remove unnecessary MPI communicators (_worlds), bb savestate

* CI and testing
  - LFP test disabled with dynamic MPI as it uses MPI functions directly
  - Set set_tests_properties for DYLD/LD_LIBRARY_PATH

* Future work
  * Build a separate library for each MPI implementation
  * Integration with NEURON for wheel support
  * Fix build/support for the Windows platform

fixes #600

Co-authored-by: Olli Lupton
---
 .github/workflows/coreneuron-ci.yml | 2 +
 CMakeLists.txt | 9 +
 coreneuron/CMakeLists.txt | 34 +-
 coreneuron/apps/main1.cpp | 33 +-
 coreneuron/gpu/nrn_acc_manager.cpp | 3 +-
 coreneuron/io/lfp.cpp | 15 +-
 coreneuron/io/lfp.hpp | 2 +-
 coreneuron/io/mech_report.cpp | 22 +-
 coreneuron/io/nrn_checkpoint.cpp | 10 +-
 coreneuron/io/nrn_setup.cpp | 37 +-
 coreneuron/io/output_spikes.cpp | 12 +-
 coreneuron/io/phase2.cpp | 1 +
 .../reports/report_configuration_parser.cpp | 1 +
 coreneuron/io/reports/report_handler.cpp | 1 +
 coreneuron/mpi/core/nrnmpi.cpp | 29 +
 coreneuron/mpi/core/nrnmpi.hpp | 6 +
 .../nrnmpi_def_cinc.cpp} | 13 +-
 coreneuron/mpi/core/nrnmpidec.cpp | 71 +++
 coreneuron/mpi/lib/mpispike.cpp | 374 ++++++++++++
 coreneuron/mpi/{ => lib}/nrnmpi.cpp | 144 ++---
 coreneuron/mpi/lib/nrnmpi.hpp | 8 +
 coreneuron/mpi/mpispike.cpp | 562 ------------------
 coreneuron/mpi/mpispike.hpp | 66 --
 coreneuron/mpi/nrnmpi.h | 94 ++-
 coreneuron/mpi/nrnmpi_def_cinc.h | 35 --
 coreneuron/mpi/nrnmpidec.h | 174 +++---
 coreneuron/mpi/nrnmpiuse.h | 8 +-
 coreneuron/network/have2want.h | 48 +-
 coreneuron/network/multisend_setup.cpp | 2 +
 coreneuron/network/netpar.cpp | 364 ++++++------
 coreneuron/network/partrans.cpp | 4 +-
 coreneuron/network/partrans_setup.cpp | 1 +
 coreneuron/sim/fadvance_core.cpp | 2 -
 coreneuron/sim/multicore.hpp | 1 +
 coreneuron/utils/memory_utils.cpp | 16 +-
 coreneuron/utils/nrn_stats.cpp | 13 +-
 coreneuron/utils/nrnoc_aux.cpp | 5 +-
 coreneuron/utils/nrntimeout.cpp | 1 +
 coreneuron/utils/randoms/nrnran123.cu | 1 +
 coreneuron/utils/utils.cpp | 38 ++
 coreneuron/utils/utils.hpp | 15 +
 extra/instrumentation.tau | 3 +-
 tests/CMakeLists.txt | 6 +-
 tests/integration/CMakeLists.txt | 13 +
 44 files changed, 1133
insertions(+), 1166 deletions(-) create mode 100644 coreneuron/mpi/core/nrnmpi.cpp create mode 100644 coreneuron/mpi/core/nrnmpi.hpp rename coreneuron/mpi/{nrnmpi_impl.h => core/nrnmpi_def_cinc.cpp} (65%) create mode 100644 coreneuron/mpi/core/nrnmpidec.cpp create mode 100644 coreneuron/mpi/lib/mpispike.cpp rename coreneuron/mpi/{ => lib}/nrnmpi.cpp (56%) create mode 100644 coreneuron/mpi/lib/nrnmpi.hpp delete mode 100644 coreneuron/mpi/mpispike.cpp delete mode 100644 coreneuron/mpi/mpispike.hpp delete mode 100644 coreneuron/mpi/nrnmpi_def_cinc.h create mode 100644 coreneuron/utils/utils.cpp create mode 100644 coreneuron/utils/utils.hpp diff --git a/.github/workflows/coreneuron-ci.yml b/.github/workflows/coreneuron-ci.yml index a5b1a3b81..a7adc90fb 100644 --- a/.github/workflows/coreneuron-ci.yml +++ b/.github/workflows/coreneuron-ci.yml @@ -36,6 +36,8 @@ jobs: config: # Defaults: CORENRN_ENABLE_SOA=ON CORENRN_ENABLE_MPI=ON - {cmake_option: "-DCORENRN_ENABLE_MPI=ON", documentation: ON} + - {cmake_option: "-DCORENRN_ENABLE_DYNAMIC_MPI=ON"} + - {cmake_option: "-DCORENRN_ENABLE_DYNAMIC_MPI=ON -DCORENRN_ENABLE_SHARED=OFF"} - {cmake_option: "-DCORENRN_ENABLE_MPI=OFF"} - {cmake_option: "-DCORENRN_ENABLE_SOA=OFF"} - {use_nmodl: ON, py_version: 3.6.7} diff --git a/CMakeLists.txt b/CMakeLists.txt index a4771f4c6..c80a15df0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON) option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF) option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON) +option(CORENRN_ENABLE_DYNAMIC_MPI "Enable dynamic MPI support" OFF) option(CORENRN_ENABLE_SOA "Enable SoA Memory Layout" ON) option(CORENRN_ENABLE_HOC_EXP "Enable wrapping exp with hoc_exp()" OFF) option(CORENRN_ENABLE_SPLAYTREE_QUEUING "Enable use of Splay tree for spike queuing" ON) @@ -331,6 +332,13 @@ set(NMODL_ENABLE_LEGACY_UNITS ${CORENRN_ENABLE_LEGACY_UNITS} CACHE BOOL "" FORCE) +if(CORENRN_ENABLE_DYNAMIC_MPI) + if(NOT CORENRN_ENABLE_MPI) + message(FATAL_ERROR "Cannot enable dynamic mpi without mpi") + endif() + add_compile_definitions(CORENRN_ENABLE_DYNAMIC_MPI) +endif() + if(CORENRN_ENABLE_PRCELLSTATE) set(CORENRN_NRN_PRCELLSTATE 1) else() @@ -489,6 +497,7 @@ message(STATUS "Build Type | ${COMPILE_LIBRARY_TYPE}") message(STATUS "MPI | ${CORENRN_ENABLE_MPI}") if(CORENRN_ENABLE_MPI) message(STATUS " INC | ${MPI_CXX_INCLUDE_PATH}") + message(STATUS " DYNAMIC | ${CORENRN_ENABLE_DYNAMIC_MPI}") endif() message(STATUS "OpenMP | ${CORENRN_ENABLE_OPENMP}") message(STATUS "Use legacy units | ${CORENRN_ENABLE_LEGACY_UNITS}") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 75485df58..e5fd5bdc7 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -32,8 +32,11 @@ file( "utils/*/*.c" "utils/*/*.cpp") file(GLOB_RECURSE CORENEURON_CUDA_FILES "*.cu") -file(GLOB SCOPMATH_CODE_FILES "sim/scopmath/*.cpp") -file(GLOB MPI_CODE_FILES "mpi/*.cpp") +set(SCOPMATH_CODE_FILES + "sim/scopmath/abort.cpp" "sim/scopmath/crout_thread.cpp" "sim/scopmath/newton_thread.cpp" + "sim/scopmath/sparse_thread.cpp" "sim/scopmath/ssimplic_thread.cpp") +set(MPI_LIB_FILES "mpi/lib/mpispike.cpp" "mpi/lib/nrnmpi.cpp") +set(MPI_CORE_FILES "mpi/core/nrnmpi_def_cinc.cpp" "mpi/core/nrnmpi.cpp" "mpi/core/nrnmpidec.cpp") file(COPY ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include/Random123 DESTINATION 
${CMAKE_BINARY_DIR}/include) list(APPEND CORENEURON_CODE_FILES ${PROJECT_BINARY_DIR}/coreneuron/config/config.cpp) @@ -165,8 +168,13 @@ add_custom_target(kin_deriv_header DEPENDS "${KINDERIV_HEADER_FILE}") # create libraries # ============================================================================= -# mpi related target, this will be a separate library for dynamic MPI -add_library(corenrn_mpi OBJECT ${MPI_CODE_FILES}) +# mpi related target, this is a separate library for dynamic MPI +if(CORENRN_ENABLE_DYNAMIC_MPI) + add_library(corenrn_mpi SHARED ${MPI_LIB_FILES}) +else() + add_library(corenrn_mpi OBJECT ${MPI_LIB_FILES}) + set(OBJ_MPI $) +endif() target_include_directories(corenrn_mpi PRIVATE ${MPI_INCLUDE_PATH}) set_property(TARGET corenrn_mpi PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -178,8 +186,17 @@ add_library( ${CORENEURON_TEMPLATE_FILES} ${CORENEURON_CODE_FILES} ${cudacorenrn_objs} - $ - ${NMODL_INBUILT_MOD_OUTPUTS}) + ${NMODL_INBUILT_MOD_OUTPUTS} + ${MPI_CORE_FILES} + ${OBJ_MPI}) +if(CORENRN_ENABLE_DYNAMIC_MPI) + target_link_libraries(coreneuron ${CMAKE_DL_LIBS}) + target_link_libraries(corenrn_mpi ${MPI_CXX_LIBRARIES}) + target_compile_definitions(coreneuron + PUBLIC CMAKE_SHARED_LIBRARY_SUFFIX=${CMAKE_SHARED_LIBRARY_SUFFIX}) +else() + target_link_libraries(coreneuron ${MPI_CXX_LIBRARIES}) +endif() # Prevent CMake from running a device code link step when assembling libcoreneuron.a in GPU builds. # The device code linking needs to be deferred to the final step, where it is done by `nvc++ -cuda`. set_target_properties(coreneuron PROPERTIES CUDA_SEPARABLE_COMPILATION ON) @@ -194,14 +211,15 @@ add_dependencies(coreneuron kin_deriv_header nrnivmodl-core) add_library(scopmath STATIC ${CORENEURON_HEADER_FILES} ${SCOPMATH_CODE_FILES}) target_link_libraries(coreneuron ${reportinglib_LIBRARY} ${sonatareport_LIBRARY} ${CALIPER_LIB} - ${likwid_LIBRARIES} ${MPI_CXX_LIBRARIES}) + ${likwid_LIBRARIES}) + target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/Random123/include) target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) set_target_properties( - coreneuron scopmath + coreneuron scopmath corenrn_mpi PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib POSITION_INDEPENDENT_CODE ON) diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index d217397de..997171f01 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -45,6 +46,7 @@ #include "coreneuron/io/file_utils.hpp" #include "coreneuron/io/nrn2core_direct.h" #include "coreneuron/io/core2nrn_data_return.hpp" +#include "coreneuron/utils/utils.hpp" extern "C" { const char* corenrn_version() { @@ -201,7 +203,7 @@ void nrn_init_and_load_data(int argc, // if multi-threading enabled, make sure mpi library supports it #if NRNMPI - if (corenrn_param.threading) { + if (corenrn_param.mpi_enable && corenrn_param.threading) { nrnmpi_check_threading_support(); } #endif @@ -448,6 +450,21 @@ std::unique_ptr create_report_handler(ReportConfiguration& config using namespace coreneuron; +#if NRNMPI +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +static void* load_dynamic_mpi() { + dlerror(); + void* handle = dlopen("libcorenrn_mpi" TOSTRING(CMAKE_SHARED_LIBRARY_SUFFIX), + RTLD_NOW | RTLD_GLOBAL); + const char* error = dlerror(); + if (error) { + std::string err_msg = std::string("Could not 
open dynamic MPI library: ") + error + "\n"; + throw std::runtime_error(err_msg); + } + return handle; +} +#endif extern "C" void mk_mech_init(int argc, char** argv) { // read command line parameters and parameter config files @@ -455,7 +472,13 @@ extern "C" void mk_mech_init(int argc, char** argv) { #if NRNMPI if (corenrn_param.mpi_enable) { - nrnmpi_init(&argc, &argv); +#ifdef CORENRN_ENABLE_DYNAMIC_MPI + auto mpi_handle = load_dynamic_mpi(); + mpi_manager().resolve_symbols(mpi_handle); +#endif + auto ret = nrnmpi_init(&argc, &argv); + nrnmpi_numprocs = ret.numprocs; + nrnmpi_myid = ret.myid; } #endif @@ -514,7 +537,9 @@ extern "C" int run_solve_core(int argc, char** argv) { mkdir_p(output_dir.c_str()); } #if NRNMPI - nrnmpi_barrier(); + if (corenrn_param.mpi_enable) { + nrnmpi_barrier(); + } #endif bool compute_gpu = corenrn_param.gpu; bool skip_mpi_finalize = corenrn_param.skip_mpi_finalize; @@ -643,7 +668,7 @@ extern "C" int run_solve_core(int argc, char** argv) { // mpi finalize #if NRNMPI - if (!skip_mpi_finalize) { + if (corenrn_param.mpi_enable && !skip_mpi_finalize) { nrnmpi_finalize(); } #endif diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index f6e5634af..9d27f9939 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -19,7 +19,8 @@ #include "coreneuron/sim/scopmath/newton_struct.h" #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" -#include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/nrnmpidec.h" +#include "coreneuron/utils/utils.hpp" #ifdef _OPENACC #include diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp index 64e7393c5..646fbf5a0 100644 --- a/coreneuron/io/lfp.cpp +++ b/coreneuron/io/lfp.cpp @@ -1,4 +1,5 @@ #include "coreneuron/io/lfp.hpp" +#include "coreneuron/apps/corenrn_parameters.hpp" #include #include @@ -6,7 +7,6 @@ namespace coreneuron { - // extern variables require acc declare #pragma acc declare create(pi) @@ -112,12 +112,15 @@ inline void LFPCalculator::lfp(const Vector& membrane_current } } #if NRNMPI - lfp_values_.resize(res.size()); - int mpi_sum{1}; - nrnmpi_dbl_allreduce_vec(res.data(), lfp_values_.data(), res.size(), mpi_sum); -#else - std::swap(res, lfp_values_); + if (corenrn_param.mpi_enable) { + lfp_values_.resize(res.size()); + int mpi_sum{1}; + nrnmpi_dbl_allreduce_vec(res.data(), lfp_values_.data(), res.size(), mpi_sum); + } else #endif + { + std::swap(res, lfp_values_); + } } diff --git a/coreneuron/io/lfp.hpp b/coreneuron/io/lfp.hpp index e47da3c7d..105d0163b 100644 --- a/coreneuron/io/lfp.hpp +++ b/coreneuron/io/lfp.hpp @@ -3,7 +3,7 @@ #include #include -#include "coreneuron/mpi/nrnmpidec.h" +#include "coreneuron/mpi/nrnmpi.h" #include "coreneuron/nrnconf.h" #include "coreneuron/utils/nrn_assert.h" diff --git a/coreneuron/io/mech_report.cpp b/coreneuron/io/mech_report.cpp index d319bbfcb..7f5bc88f0 100644 --- a/coreneuron/io/mech_report.cpp +++ b/coreneuron/io/mech_report.cpp @@ -11,9 +11,9 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/apps/corenrn_parameters.hpp" namespace coreneuron { - /** display global mechanism count */ void write_mech_report() { /// mechanim count across all gids, local to rank @@ -33,15 +33,19 @@ void write_mech_report() { std::vector total_mech_count(n_memb_func); #if NRNMPI - /// get global sum of all mechanism instances - nrnmpi_long_allreduce_vec(&local_mech_count[0], - &total_mech_count[0], - local_mech_count.size(), - 1); - -#else - 
total_mech_count = local_mech_count; + if (corenrn_param.mpi_enable) { + /// get global sum of all mechanism instances + nrnmpi_long_allreduce_vec(&local_mech_count[0], + &total_mech_count[0], + local_mech_count.size(), + 1); + + } else #endif + { + total_mech_count = local_mech_count; + } + /// print global stats to stdout if (nrnmpi_myid == 0) { printf("\n================ MECHANISMS COUNT BY TYPE ==================\n"); diff --git a/coreneuron/io/nrn_checkpoint.cpp b/coreneuron/io/nrn_checkpoint.cpp index b4b04cbf3..49ea21384 100644 --- a/coreneuron/io/nrn_checkpoint.cpp +++ b/coreneuron/io/nrn_checkpoint.cpp @@ -23,9 +23,9 @@ #include "coreneuron/permute/node_permute.h" #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" +#include "coreneuron/apps/corenrn_parameters.hpp" namespace coreneuron { - // Those functions comes from mod file directly extern int checkpoint_save_patternstim(_threadargsproto_); extern void checkpoint_restore_patternstim(int, double, _threadargsproto_); @@ -62,7 +62,9 @@ void CheckPoints::write_checkpoint(NrnThread* nt, int nb_threads) const { } #if NRNMPI - nrnmpi_barrier(); + if (corenrn_param.mpi_enable) { + nrnmpi_barrier(); + } #endif /** @@ -79,7 +81,9 @@ void CheckPoints::write_checkpoint(NrnThread* nt, int nb_threads) const { write_time(); } #if NRNMPI - nrnmpi_barrier(); + if (corenrn_param.mpi_enable) { + nrnmpi_barrier(); + } #endif } diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp index 62b55e7fe..a41103bb8 100644 --- a/coreneuron/io/nrn_setup.cpp +++ b/coreneuron/io/nrn_setup.cpp @@ -22,7 +22,9 @@ #include "coreneuron/utils/nrn_assert.h" #include "coreneuron/utils/nrnmutdec.h" #include "coreneuron/utils/memory.h" +#include "coreneuron/utils/utils.hpp" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/io/nrn_setup.hpp" #include "coreneuron/network/partrans.hpp" #include "coreneuron/io/nrn_checkpoint.hpp" @@ -147,8 +149,6 @@ void (*nrn2core_all_weights_return_)(std::vector& weights); // files with the first containing output_gids and netcon_srcgid which are // stored in the nt.presyns array and nt.netcons array respectively namespace coreneuron { -extern corenrn_parameters corenrn_param; - static OMP_Mutex mut; /// Vector of maps for negative presyns @@ -1091,18 +1091,21 @@ size_t model_size(bool detailed_report) { if (detailed_report) { size_data[12] = nbyte; #if NRNMPI - // last arg is op type where 1 is sum, 2 is max and any other value is min - nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_sum[0], 13, 1); - nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_max[0], 13, 2); - nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_min[0], 13, 3); - for (int i = 0; i < 13; i++) { - global_size_data_avg[i] = global_size_data_sum[i] / float(nrnmpi_numprocs); - } -#else - global_size_data_max = size_data; - global_size_data_min = size_data; - global_size_data_avg.assign(size_data.cbegin(), size_data.cend()); + if (corenrn_param.mpi_enable) { + // last arg is op type where 1 is sum, 2 is max and any other value is min + nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_sum[0], 13, 1); + nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_max[0], 13, 2); + nrnmpi_long_allreduce_vec(&size_data[0], &global_size_data_min[0], 13, 3); + for (int i = 0; i < 13; i++) { + global_size_data_avg[i] = global_size_data_sum[i] / float(nrnmpi_numprocs); + } + } else #endif + { + global_size_data_max = size_data; + global_size_data_min 
= size_data; + global_size_data_avg.assign(size_data.cbegin(), size_data.cend()); + } // now print the collected data: if (nrnmpi_myid == 0) { printf("Memory size information for all NrnThreads per rank\n"); @@ -1197,9 +1200,11 @@ size_t model_size(bool detailed_report) { } #if NRNMPI - long global_nbyte = 0; - nrnmpi_long_allreduce_vec(&nbyte, &global_nbyte, 1, 1); - nbyte = global_nbyte; + if (corenrn_param.mpi_enable) { + long global_nbyte = 0; + nrnmpi_long_allreduce_vec(&nbyte, &global_nbyte, 1, 1); + nbyte = global_nbyte; + } #endif return nbyte; diff --git a/coreneuron/io/output_spikes.cpp b/coreneuron/io/output_spikes.cpp index ee1b20983..80fd29d78 100644 --- a/coreneuron/io/output_spikes.cpp +++ b/coreneuron/io/output_spikes.cpp @@ -19,9 +19,11 @@ #include "coreneuron/io/nrn2core_direct.h" #include "coreneuron/io/output_spikes.hpp" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/utils/nrnmutdec.h" #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/utils/string_utils.h" +#include "coreneuron/apps/corenrn_parameters.hpp" #ifdef ENABLE_SONATA_REPORTS #include "bbp/sonata/reports.h" #endif // ENABLE_SONATA_REPORTS @@ -39,7 +41,6 @@ static bool all_spikes_return(std::vector& spiketvec, std::vector& } namespace coreneuron { - /// --> Coreneuron as SpikeBuffer class std::vector spikevec_time; std::vector spikevec_gid; @@ -286,14 +287,13 @@ void output_spikes(const char* outpath, return; } #if NRNMPI - if (nrnmpi_initialized()) { + if (corenrn_param.mpi_enable && nrnmpi_initialized()) { output_spikes_parallel(outpath, population_name_offset); - } else { + } else +#endif + { output_spikes_serial(outpath); } -#else - output_spikes_serial(outpath); -#endif clear_spike_vectors(); } diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp index e30b5196a..4d6f81ed2 100644 --- a/coreneuron/io/phase2.cpp +++ b/coreneuron/io/phase2.cpp @@ -13,6 +13,7 @@ #include "coreneuron/utils/nrnoc_aux.hpp" #include "coreneuron/permute/cellorder.hpp" #include "coreneuron/permute/node_permute.h" +#include "coreneuron/utils/utils.hpp" #include "coreneuron/utils/vrecitem.h" #include "coreneuron/io/mem_layout_util.hpp" #include "coreneuron/io/setup_fornetcon.hpp" diff --git a/coreneuron/io/reports/report_configuration_parser.cpp b/coreneuron/io/reports/report_configuration_parser.cpp index 1b757878d..ede676e82 100644 --- a/coreneuron/io/reports/report_configuration_parser.cpp +++ b/coreneuron/io/reports/report_configuration_parser.cpp @@ -21,6 +21,7 @@ #include "coreneuron/mechanism/mech_mapping.hpp" #include "coreneuron/sim/fast_imem.hpp" #include "coreneuron/utils/nrn_assert.h" +#include "coreneuron/utils/utils.hpp" namespace coreneuron { diff --git a/coreneuron/io/reports/report_handler.cpp b/coreneuron/io/reports/report_handler.cpp index c94aa7dc6..84341e7a4 100644 --- a/coreneuron/io/reports/report_handler.cpp +++ b/coreneuron/io/reports/report_handler.cpp @@ -9,6 +9,7 @@ #include "report_handler.hpp" #include "coreneuron/io/nrnsection_mapping.hpp" #include "coreneuron/mechanism/mech_mapping.hpp" +#include "coreneuron/utils/utils.hpp" namespace coreneuron { diff --git a/coreneuron/mpi/core/nrnmpi.cpp b/coreneuron/mpi/core/nrnmpi.cpp new file mode 100644 index 000000000..17d7e55b3 --- /dev/null +++ b/coreneuron/mpi/core/nrnmpi.cpp @@ -0,0 +1,29 @@ +#if NRNMPI +#include +#include +#include "../nrnmpi.h" + +namespace coreneuron { +// Those functions are part of a mechanism to dynamically load mpi or not +void mpi_manager_t::resolve_symbols(void* 
handle) { + for (auto* ptr: m_function_ptrs) { + assert(!(*ptr)); + ptr->resolve(handle); + assert(*ptr); + } +} + +void mpi_function_base::resolve(void* handle) { + dlerror(); + void* ptr = dlsym(handle, m_name); + const char* error = dlerror(); + if (error) { + std::ostringstream oss; + oss << "Could not get symbol " << m_name << " from handle " << handle << ": " << error; + throw std::runtime_error(oss.str()); + } + assert(ptr); + m_fptr = ptr; +} +} // namespace coreneuron +#endif // NRNMPI diff --git a/coreneuron/mpi/core/nrnmpi.hpp b/coreneuron/mpi/core/nrnmpi.hpp new file mode 100644 index 000000000..c9755eb40 --- /dev/null +++ b/coreneuron/mpi/core/nrnmpi.hpp @@ -0,0 +1,6 @@ +#pragma once + +namespace coreneuron { +extern int nrnmpi_numprocs; +extern int nrnmpi_myid; +} // namespace coreneuron diff --git a/coreneuron/mpi/nrnmpi_impl.h b/coreneuron/mpi/core/nrnmpi_def_cinc.cpp similarity index 65% rename from coreneuron/mpi/nrnmpi_impl.h rename to coreneuron/mpi/core/nrnmpi_def_cinc.cpp index 1b063cefc..b2bf42159 100644 --- a/coreneuron/mpi/nrnmpi_impl.h +++ b/coreneuron/mpi/core/nrnmpi_def_cinc.cpp @@ -6,16 +6,7 @@ # =============================================================================. */ -#ifndef nrnmpi_impl_h -#define nrnmpi_impl_h - -#if NRNMPI - -#include namespace coreneuron { -extern MPI_Comm nrnmpi_world_comm; -extern MPI_Comm nrnmpi_comm; +int nrnmpi_numprocs = 1; /* size */ +int nrnmpi_myid = 0; /* rank */ } // namespace coreneuron -#endif // NRNMPI - -#endif diff --git a/coreneuron/mpi/core/nrnmpidec.cpp b/coreneuron/mpi/core/nrnmpidec.cpp new file mode 100644 index 000000000..c7d80df66 --- /dev/null +++ b/coreneuron/mpi/core/nrnmpidec.cpp @@ -0,0 +1,71 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# =============================================================================. 
+*/ + +#if NRNMPI +#include "../nrnmpi.h" + +namespace coreneuron { + + +/* from nrnmpi.cpp */ +mpi_function nrnmpi_init{"nrnmpi_init_impl"}; +mpi_function nrnmpi_finalize{ + "nrnmpi_finalize_impl"}; +mpi_function + nrnmpi_check_threading_support{"nrnmpi_check_threading_support_impl"}; +mpi_function nrnmpi_write_file{ + "nrnmpi_write_file_impl"}; + +/* from mpispike.c */ +mpi_function nrnmpi_spike_exchange{ + "nrnmpi_spike_exchange_impl"}; +mpi_function + nrnmpi_spike_exchange_compressed{"nrnmpi_spike_exchange_compressed_impl"}; +mpi_function nrnmpi_int_allmax{ + "nrnmpi_int_allmax_impl"}; +mpi_function nrnmpi_int_allgather{ + "nrnmpi_int_allgather_impl"}; +mpi_function nrnmpi_int_alltoall{ + "nrnmpi_int_alltoall_impl"}; +mpi_function nrnmpi_int_alltoallv{ + "nrnmpi_int_alltoallv_impl"}; +mpi_function nrnmpi_dbl_alltoallv{ + "nrnmpi_dbl_alltoallv_impl"}; +mpi_function nrnmpi_dbl_allmin{ + "nrnmpi_dbl_allmin_impl"}; +mpi_function nrnmpi_dbl_allmax{ + "nrnmpi_dbl_allmax_impl"}; +mpi_function nrnmpi_barrier{ + "nrnmpi_barrier_impl"}; +mpi_function nrnmpi_dbl_allreduce{ + "nrnmpi_dbl_allreduce_impl"}; +mpi_function nrnmpi_dbl_allreduce_vec{ + "nrnmpi_dbl_allreduce_vec_impl"}; +mpi_function + nrnmpi_long_allreduce_vec{"nrnmpi_long_allreduce_vec_impl"}; +mpi_function nrnmpi_initialized{ + "nrnmpi_initialized_impl"}; +mpi_function nrnmpi_abort{"nrnmpi_abort_impl"}; +mpi_function nrnmpi_wtime{"nrnmpi_wtime_impl"}; +mpi_function nrnmpi_local_rank{ + "nrnmpi_local_rank_impl"}; +mpi_function nrnmpi_local_size{ + "nrnmpi_local_size_impl"}; +#if NRN_MULTISEND +mpi_function nrnmpi_multisend_comm{ + "nrnmpi_multisend_comm_impl"}; +mpi_function nrnmpi_multisend{ + "nrnmpi_multisend_impl"}; +mpi_function + nrnmpi_multisend_single_advance{"nrnmpi_multisend_single_advance_impl"}; +mpi_function + nrnmpi_multisend_conserve{"nrnmpi_multisend_conserve_impl"}; +#endif // NRN_MULTISEND + +} // namespace coreneuron +#endif // NRNMPI diff --git a/coreneuron/mpi/lib/mpispike.cpp b/coreneuron/mpi/lib/mpispike.cpp new file mode 100644 index 000000000..b62ea86f0 --- /dev/null +++ b/coreneuron/mpi/lib/mpispike.cpp @@ -0,0 +1,374 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# =============================================================================. +*/ + +#include "coreneuron/nrnconf.h" +/* do not want the redef in the dynamic load case */ +#include "coreneuron/mpi/nrnmpiuse.h" +#include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/nrnmpidec.h" +#include "nrnmpi.hpp" +#include "coreneuron/utils/profile/profiler_interface.h" + +#if NRNMPI +#include + +#include + +namespace coreneuron { +extern MPI_Comm nrnmpi_comm; + +static int np; +static int* displs{nullptr}; +static int* byteovfl; /* for the compressed transfer method */ +static MPI_Datatype spike_type; + +static void* emalloc(size_t size) { + void* memptr = malloc(size); + assert(memptr); + return memptr; +} + +static void hoc_execerror(const char* s1, const char* s2) { + printf("error: %s %s\n", s1, s2 ? 
s2 : ""); + abort(); +} + +// Register type NRNMPI_Spike +void nrnmpi_spike_initialize() { + NRNMPI_Spike s; + int block_lengths[2] = {1, 1}; + MPI_Aint addresses[3]; + + MPI_Get_address(&s, &addresses[0]); + MPI_Get_address(&(s.gid), &addresses[1]); + MPI_Get_address(&(s.spiketime), &addresses[2]); + + MPI_Aint displacements[2] = {addresses[1] - addresses[0], addresses[2] - addresses[0]}; + + MPI_Datatype typelist[2] = {MPI_INT, MPI_DOUBLE}; + MPI_Type_create_struct(2, block_lengths, displacements, typelist, &spike_type); + MPI_Type_commit(&spike_type); +} + +#if nrn_spikebuf_size > 0 + +static MPI_Datatype spikebuf_type; + +// Register type NRNMPI_Spikebuf +static void make_spikebuf_type() { + NRNMPI_Spikebuf s; + int block_lengths[3] = {1, nrn_spikebuf_size, nrn_spikebuf_size}; + MPI_Datatype typelist[3] = {MPI_INT, MPI_INT, MPI_DOUBLE}; + + MPI_Aint addresses[4]; + MPI_Get_address(&s, &addresses[0]); + MPI_Get_address(&(s.nspike), &addresses[1]); + MPI_Get_address(&(s.gid[0]), &addresses[2]); + MPI_Get_address(&(s.spiketime[0]), &addresses[3]); + + MPI_Aint displacements[3] = {addresses[1] - addresses[0], + addresses[2] - addresses[0], + addresses[3] - addresses[0]}; + + MPI_Type_create_struct(3, block_lengths, displacements, typelist, &spikebuf_type); + MPI_Type_commit(&spikebuf_type); +} +#endif + +void wait_before_spike_exchange() { + MPI_Barrier(nrnmpi_comm); +} + +int nrnmpi_spike_exchange_impl(int* nin, + NRNMPI_Spike* spikeout, + int icapacity, + NRNMPI_Spike* spikein, + int& ovfl, + int nout, + NRNMPI_Spikebuf* spbufout, + NRNMPI_Spikebuf* spbufin) { + Instrumentor::phase_begin("spike-exchange"); + + { + Instrumentor::phase p("imbalance"); + wait_before_spike_exchange(); + } + + Instrumentor::phase_begin("communication"); + if (!displs) { + np = nrnmpi_numprocs_; + displs = (int*) emalloc(np * sizeof(int)); + displs[0] = 0; +#if nrn_spikebuf_size > 0 + make_spikebuf_type(); +#endif + } +#if nrn_spikebuf_size == 0 + MPI_Allgather(&nout, 1, MPI_INT, nin, 1, MPI_INT, nrnmpi_comm); + int n = nin[0]; + for (int i = 1; i < np; ++i) { + displs[i] = n; + n += nin[i]; + } + if (n) { + if (icapacity < n) { + icapacity = n + 10; + free(spikein); + spikein = (NRNMPI_Spike*) emalloc(icapacity * sizeof(NRNMPI_Spike)); + } + MPI_Allgatherv(spikeout, nout, spike_type, spikein, nin, displs, spike_type, nrnmpi_comm); + } +#else + MPI_Allgather(spbufout, 1, spikebuf_type, spbufin, 1, spikebuf_type, nrnmpi_comm); + int novfl = 0; + int n = spbufin[0].nspike; + if (n > nrn_spikebuf_size) { + nin[0] = n - nrn_spikebuf_size; + novfl += nin[0]; + } else { + nin[0] = 0; + } + for (int i = 1; i < np; ++i) { + displs[i] = novfl; + int n1 = spbufin[i].nspike; + n += n1; + if (n1 > nrn_spikebuf_size) { + nin[i] = n1 - nrn_spikebuf_size; + novfl += nin[i]; + } else { + nin[i] = 0; + } + } + if (novfl) { + if (icapacity < novfl) { + icapacity = novfl + 10; + free(spikein); + spikein = (NRNMPI_Spike*) emalloc(icapacity * sizeof(NRNMPI_Spike)); + } + int n1 = (nout > nrn_spikebuf_size) ? nout - nrn_spikebuf_size : 0; + MPI_Allgatherv(spikeout, n1, spike_type, spikein, nin, displs, spike_type, nrnmpi_comm); + } + ovfl = novfl; +#endif + Instrumentor::phase_end("communication"); + Instrumentor::phase_end("spike-exchange"); + return n; +} + +/* +The compressed spike format is restricted to the fixed step method and is +a sequence of unsigned char. +nspike = buf[0]*256 + buf[1] +a sequence of spiketime, localgid pairs. There are nspike of them. + spiketime is relative to the last transfer time in units of dt. 
+ note that this requires a mindelay < 256*dt. + localgid is an unsigned int, unsigned short, + or unsigned char in size depending on the range and thus takes + 4, 2, or 1 byte respectively. To be machine independent we do our + own byte coding. When the localgid range is smaller than the true + gid range, the gid->PreSyn are remapped into + hostid specific maps. If there are not many holes, i.e just about every + spike from a source machine is delivered to some cell on a + target machine, then instead of a hash map, a vector is used. +The allgather sends the first part of the buf and the allgatherv buffer +sends any overflow. +*/ +int nrnmpi_spike_exchange_compressed_impl(int localgid_size, + unsigned char* spfixin_ovfl, + int send_nspike, + int* nin, + int ovfl_capacity, + unsigned char* spikeout_fixed, + int ag_send_size, + unsigned char* spikein_fixed, + int& ovfl) { + if (!displs) { + np = nrnmpi_numprocs_; + displs = (int*) emalloc(np * sizeof(int)); + displs[0] = 0; + byteovfl = (int*) emalloc(np * sizeof(int)); + } + + MPI_Allgather( + spikeout_fixed, ag_send_size, MPI_BYTE, spikein_fixed, ag_send_size, MPI_BYTE, nrnmpi_comm); + int novfl = 0; + int ntot = 0; + int bstot = 0; + for (int i = 0; i < np; ++i) { + displs[i] = bstot; + int idx = i * ag_send_size; + int n = spikein_fixed[idx++] * 256; + n += spikein_fixed[idx++]; + ntot += n; + nin[i] = n; + if (n > send_nspike) { + int bs = 2 + n * (1 + localgid_size) - ag_send_size; + byteovfl[i] = bs; + bstot += bs; + novfl += n - send_nspike; + } else { + byteovfl[i] = 0; + } + } + if (novfl) { + if (ovfl_capacity < novfl) { + ovfl_capacity = novfl + 10; + free(spfixin_ovfl); + spfixin_ovfl = (unsigned char*) emalloc(ovfl_capacity * (1 + localgid_size) * + sizeof(unsigned char)); + } + int bs = byteovfl[nrnmpi_myid_]; + /* + note that the spikeout_fixed buffer is one since the overflow + is contiguous to the first part. But the spfixin_ovfl is + completely separate from the spikein_fixed since the latter + dynamically changes its size during a run. 
+ */ + MPI_Allgatherv(spikeout_fixed + ag_send_size, + bs, + MPI_BYTE, + spfixin_ovfl, + byteovfl, + displs, + MPI_BYTE, + nrnmpi_comm); + } + ovfl = novfl; + return ntot; +} + +int nrnmpi_int_allmax_impl(int x) { + int result; + MPI_Allreduce(&x, &result, 1, MPI_INT, MPI_MAX, nrnmpi_comm); + return result; +} + +extern void nrnmpi_int_alltoall_impl(int* s, int* r, int n) { + MPI_Alltoall(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); +} + +extern void nrnmpi_int_alltoallv_impl(const int* s, + const int* scnt, + const int* sdispl, + int* r, + int* rcnt, + int* rdispl) { + MPI_Alltoallv(s, scnt, sdispl, MPI_INT, r, rcnt, rdispl, MPI_INT, nrnmpi_comm); +} + +extern void nrnmpi_dbl_alltoallv_impl(double* s, + int* scnt, + int* sdispl, + double* r, + int* rcnt, + int* rdispl) { + MPI_Alltoallv(s, scnt, sdispl, MPI_DOUBLE, r, rcnt, rdispl, MPI_DOUBLE, nrnmpi_comm); +} + +/* following are for the partrans */ + +void nrnmpi_int_allgather_impl(int* s, int* r, int n) { + MPI_Allgather(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); +} + +double nrnmpi_dbl_allmin_impl(double x) { + double result; + MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); + return result; +} + +double nrnmpi_dbl_allmax_impl(double x) { + double result; + MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MAX, nrnmpi_comm); + return result; +} + +void nrnmpi_barrier_impl() { + MPI_Barrier(nrnmpi_comm); +} + +double nrnmpi_dbl_allreduce_impl(double x, int type) { + double result; + MPI_Op tt; + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, tt, nrnmpi_comm); + return result; +} + +void nrnmpi_dbl_allreduce_vec_impl(double* src, double* dest, int cnt, int type) { + MPI_Op tt; + assert(src != dest); + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(src, dest, cnt, MPI_DOUBLE, tt, nrnmpi_comm); + return; +} + +void nrnmpi_long_allreduce_vec_impl(long* src, long* dest, int cnt, int type) { + MPI_Op tt; + assert(src != dest); + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(src, dest, cnt, MPI_LONG, tt, nrnmpi_comm); + return; +} + +#if NRN_MULTISEND + +static MPI_Comm multisend_comm; + +void nrnmpi_multisend_comm_impl() { + if (!multisend_comm) { + MPI_Comm_dup(MPI_COMM_WORLD, &multisend_comm); + } +} + +void nrnmpi_multisend_impl(NRNMPI_Spike* spk, int n, int* hosts) { + MPI_Request r; + for (int i = 0; i < n; ++i) { + MPI_Isend(spk, 1, spike_type, hosts[i], 1, multisend_comm, &r); + MPI_Request_free(&r); + } +} + +int nrnmpi_multisend_single_advance_impl(NRNMPI_Spike* spk) { + int flag = 0; + MPI_Status status; + MPI_Iprobe(MPI_ANY_SOURCE, 1, multisend_comm, &flag, &status); + if (flag) { + MPI_Recv(spk, 1, spike_type, MPI_ANY_SOURCE, 1, multisend_comm, &status); + } + return flag; +} + +int nrnmpi_multisend_conserve_impl(int nsend, int nrecv) { + int tcnts[2]; + tcnts[0] = nsend - nrecv; + MPI_Allreduce(tcnts, tcnts + 1, 1, MPI_INT, MPI_SUM, multisend_comm); + return tcnts[1]; +} + +#endif /*NRN_MULTISEND*/ +} // namespace coreneuron +#endif /*NRNMPI*/ diff --git a/coreneuron/mpi/nrnmpi.cpp b/coreneuron/mpi/lib/nrnmpi.cpp similarity index 56% rename from coreneuron/mpi/nrnmpi.cpp rename to coreneuron/mpi/lib/nrnmpi.cpp index 6861092de..c54ee4495 100644 --- a/coreneuron/mpi/nrnmpi.cpp +++ b/coreneuron/mpi/lib/nrnmpi.cpp @@ -8,13 +8,12 @@ #include #include -#include +#include 
#include "coreneuron/nrnconf.h" #include "coreneuron/mpi/nrnmpi.h" -#include "coreneuron/mpi/mpispike.hpp" -#include "coreneuron/mpi/nrnmpi_def_cinc.h" #include "coreneuron/utils/nrn_assert.h" +#include "nrnmpi.hpp" #if _OPENMP #include #endif @@ -26,22 +25,22 @@ namespace coreneuron { #if NRNMPI MPI_Comm nrnmpi_world_comm; MPI_Comm nrnmpi_comm; -MPI_Comm nrn_bbs_comm; -static MPI_Group grp_bbs; -static MPI_Group grp_net; +int nrnmpi_numprocs_; +int nrnmpi_myid_; -extern void nrnmpi_spike_initialize(); +static bool nrnmpi_under_nrncontrol_{false}; -static int nrnmpi_under_nrncontrol_; +static void nrn_fatal_error(const char* msg) { + if (nrnmpi_myid_ == 0) { + printf("%s\n", msg); + } + nrnmpi_abort_impl(-1); +} -void nrnmpi_init(int* pargc, char*** pargv) { - nrnmpi_use = true; - nrnmpi_under_nrncontrol_ = 1; +nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv) { + nrnmpi_under_nrncontrol_ = true; - int flag = 0; - MPI_Initialized(&flag); - - if (!flag) { + if (!nrnmpi_initialized_impl()) { #if defined(_OPENMP) int required = MPI_THREAD_FUNNELED; int provided; @@ -52,110 +51,59 @@ void nrnmpi_init(int* pargc, char*** pargv) { nrn_assert(MPI_Init(pargc, pargv) == MPI_SUCCESS); #endif } - grp_bbs = MPI_GROUP_NULL; - grp_net = MPI_GROUP_NULL; nrn_assert(MPI_Comm_dup(MPI_COMM_WORLD, &nrnmpi_world_comm) == MPI_SUCCESS); nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrnmpi_comm) == MPI_SUCCESS); - nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrn_bbs_comm) == MPI_SUCCESS); - nrn_assert(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_world) == MPI_SUCCESS); - nrn_assert(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_world) == MPI_SUCCESS); - nrnmpi_numprocs = nrnmpi_numprocs_bbs = nrnmpi_numprocs_world; - nrnmpi_myid = nrnmpi_myid_bbs = nrnmpi_myid_world; + nrn_assert(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_) == MPI_SUCCESS); + nrn_assert(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_) == MPI_SUCCESS); nrnmpi_spike_initialize(); - if (nrnmpi_myid == 0) { + if (nrnmpi_myid_ == 0) { #if defined(_OPENMP) - printf(" num_mpi=%d\n num_omp_thread=%d\n\n", nrnmpi_numprocs_world, omp_get_max_threads()); + printf(" num_mpi=%d\n num_omp_thread=%d\n\n", nrnmpi_numprocs_, omp_get_max_threads()); #else - printf(" num_mpi=%d\n\n", nrnmpi_numprocs_world); + printf(" num_mpi=%d\n\n", nrnmpi_numprocs_); #endif } + + return {nrnmpi_numprocs_, nrnmpi_myid_}; } -void nrnmpi_finalize(void) { +void nrnmpi_finalize_impl(void) { if (nrnmpi_under_nrncontrol_) { - int flag = 0; - MPI_Initialized(&flag); - if (flag) { + if (nrnmpi_initialized_impl()) { MPI_Comm_free(&nrnmpi_world_comm); MPI_Comm_free(&nrnmpi_comm); - MPI_Comm_free(&nrn_bbs_comm); MPI_Finalize(); } } } -void nrnmpi_terminate() { - if (nrnmpi_use) { - if (nrnmpi_under_nrncontrol_) { - MPI_Finalize(); - } - nrnmpi_use = false; - } -} - // check if appropriate threading level supported (i.e. 
MPI_THREAD_FUNNELED) -void nrnmpi_check_threading_support() { +void nrnmpi_check_threading_support_impl() { int th = 0; - if (nrnmpi_use) { - MPI_Query_thread(&th); - if (th < MPI_THREAD_FUNNELED) { - nrn_fatal_error( - "\n Current MPI library doesn't support MPI_THREAD_FUNNELED,\ - \n Run without enabling multi-threading!"); - } + MPI_Query_thread(&th); + if (th < MPI_THREAD_FUNNELED) { + nrn_fatal_error( + "\n Current MPI library doesn't support MPI_THREAD_FUNNELED,\ + \n Run without enabling multi-threading!"); } } -/* so src/nrnpython/inithoc.cpp does not have to include a c++ mpi.h */ -int nrnmpi_wrap_mpi_init(int* flag) { - return MPI_Initialized(flag); -} - -#endif - -// TODO nrn_wtime(), nrn_abort(int) and nrn_fatal_error() to be moved to tools - -double nrn_wtime() { -#if NRNMPI - if (nrnmpi_use) { - return MPI_Wtime(); - } else -#endif - { - struct timeval time1; - gettimeofday(&time1, nullptr); - return (time1.tv_sec + time1.tv_usec / 1.e6); - } +bool nrnmpi_initialized_impl() { + int flag = 0; + MPI_Initialized(&flag); + return flag != 0; } -void nrn_abort(int errcode) { -#if NRNMPI - int flag; - MPI_Initialized(&flag); - if (flag) { - MPI_Abort(MPI_COMM_WORLD, errcode); - } else -#endif - { - abort(); - } +void nrnmpi_abort_impl(int errcode) { + MPI_Abort(MPI_COMM_WORLD, errcode); } -void nrn_fatal_error(const char* msg) { - if (nrnmpi_myid == 0) { - printf("%s\n", msg); - } - nrn_abort(-1); +double nrnmpi_wtime_impl() { + return MPI_Wtime(); } -int nrnmpi_initialized() { - int flag = 0; -#if NRNMPI - MPI_Initialized(&flag); #endif - return flag; -} /** * Return local mpi rank within a shared memory node @@ -164,13 +112,13 @@ int nrnmpi_initialized() { * process on a given node. This function uses MPI 3 MPI_Comm_split_type * function and MPI_COMM_TYPE_SHARED key to find out the local rank. */ -int nrnmpi_local_rank() { +int nrnmpi_local_rank_impl() { int local_rank = 0; #if NRNMPI - if (nrnmpi_initialized()) { + if (nrnmpi_initialized_impl()) { MPI_Comm local_comm; MPI_Comm_split_type( - MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_world, MPI_INFO_NULL, &local_comm); + MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_, MPI_INFO_NULL, &local_comm); MPI_Comm_rank(local_comm, &local_rank); MPI_Comm_free(&local_comm); } @@ -184,13 +132,13 @@ int nrnmpi_local_rank() { * We use MPI 3 MPI_Comm_split_type function and MPI_COMM_TYPE_SHARED key to * determine number of mpi ranks within a shared memory node. 
*/ -int nrnmpi_local_size() { +int nrnmpi_local_size_impl() { int local_size = 1; #if NRNMPI - if (nrnmpi_initialized()) { + if (nrnmpi_initialized_impl()) { MPI_Comm local_comm; MPI_Comm_split_type( - MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_world, MPI_INFO_NULL, &local_comm); + MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, nrnmpi_myid_, MPI_INFO_NULL, &local_comm); MPI_Comm_size(local_comm, &local_size); MPI_Comm_free(&local_comm); } @@ -213,7 +161,7 @@ int nrnmpi_local_size() { * @param buffer Buffer to write * @param length Length of the buffer to write */ -void nrnmpi_write_file(const std::string& filename, const char* buffer, size_t length) { +void nrnmpi_write_file_impl(const std::string& filename, const char* buffer, size_t length) { MPI_File fh; MPI_Status status; @@ -223,13 +171,13 @@ void nrnmpi_write_file(const std::string& filename, const char* buffer, size_t l int op_status = MPI_File_open( nrnmpi_comm, filename.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); - if (op_status != MPI_SUCCESS && nrnmpi_myid == 0) { + if (op_status != MPI_SUCCESS && nrnmpi_myid_ == 0) { std::cerr << "Error while opening output file " << filename << std::endl; abort(); } op_status = MPI_File_write_at_all(fh, offset, buffer, length, MPI_BYTE, &status); - if (op_status != MPI_SUCCESS && nrnmpi_myid == 0) { + if (op_status != MPI_SUCCESS && nrnmpi_myid_ == 0) { std::cerr << "Error while writing output " << std::endl; abort(); } diff --git a/coreneuron/mpi/lib/nrnmpi.hpp b/coreneuron/mpi/lib/nrnmpi.hpp new file mode 100644 index 000000000..0509e6d2f --- /dev/null +++ b/coreneuron/mpi/lib/nrnmpi.hpp @@ -0,0 +1,8 @@ +#pragma once + +// This file contains functions that does not go outside of the mpi library +namespace coreneuron { +extern int nrnmpi_numprocs_; +extern int nrnmpi_myid_; +void nrnmpi_spike_initialize(); +} // namespace coreneuron diff --git a/coreneuron/mpi/mpispike.cpp b/coreneuron/mpi/mpispike.cpp deleted file mode 100644 index becbbf0a3..000000000 --- a/coreneuron/mpi/mpispike.cpp +++ /dev/null @@ -1,562 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# =============================================================================. 
-*/ - -#include "coreneuron/nrnconf.h" -/* do not want the redef in the dynamic load case */ -#include "coreneuron/mpi/nrnmpiuse.h" -#include "coreneuron/mpi/nrnmpi.h" -#include "coreneuron/mpi/nrnmpidec.h" -#include "coreneuron/mpi/nrnmpi_impl.h" -#include "coreneuron/mpi/mpispike.hpp" -#include "coreneuron/utils/profile/profiler_interface.h" -#include "coreneuron/utils/nrnoc_aux.hpp" - -#if NRNMPI -#include - -#include - -namespace coreneuron { -static int np; -static int* displs; -static int* byteovfl; /* for the compressed transfer method */ -static MPI_Datatype spike_type; - -static void pgvts_op(double* in, double* inout, int* len, MPI_Datatype* dptr); -static MPI_Op mpi_pgvts_op; - -static void make_spike_type() { - NRNMPI_Spike s; - int block_lengths[2]; - MPI_Aint displacements[2]; - MPI_Aint addresses[3]; - MPI_Datatype typelist[2]; - - typelist[0] = MPI_INT; - typelist[1] = MPI_DOUBLE; - - block_lengths[0] = block_lengths[1] = 1; - - MPI_Get_address(&s, &addresses[0]); - MPI_Get_address(&(s.gid), &addresses[1]); - MPI_Get_address(&(s.spiketime), &addresses[2]); - - displacements[0] = addresses[1] - addresses[0]; - displacements[1] = addresses[2] - addresses[0]; - - MPI_Type_create_struct(2, block_lengths, displacements, typelist, &spike_type); - MPI_Type_commit(&spike_type); - - MPI_Op_create((MPI_User_function*) pgvts_op, 1, &mpi_pgvts_op); -} - -void nrnmpi_spike_initialize() { - make_spike_type(); -} - -#if nrn_spikebuf_size > 0 - -static MPI_Datatype spikebuf_type; - -static void make_spikebuf_type() { - NRNMPI_Spikebuf s; - int block_lengths[3]; - MPI_Aint displacements[3]; - MPI_Aint addresses[4]; - MPI_Datatype typelist[3]; - - typelist[0] = MPI_INT; - typelist[1] = MPI_INT; - typelist[2] = MPI_DOUBLE; - - block_lengths[0] = 1; - block_lengths[1] = nrn_spikebuf_size; - block_lengths[2] = nrn_spikebuf_size; - - MPI_Get_address(&s, &addresses[0]); - MPI_Get_address(&(s.nspike), &addresses[1]); - MPI_Get_address(&(s.gid[0]), &addresses[2]); - MPI_Get_address(&(s.spiketime[0]), &addresses[3]); - - displacements[0] = addresses[1] - addresses[0]; - displacements[1] = addresses[2] - addresses[0]; - displacements[2] = addresses[3] - addresses[0]; - - MPI_Type_create_struct(3, block_lengths, displacements, typelist, &spikebuf_type); - MPI_Type_commit(&spikebuf_type); -} -#endif - -void wait_before_spike_exchange() { - MPI_Barrier(nrnmpi_comm); -} - -int nrnmpi_spike_exchange() { - int n; - Instrumentor::phase_begin("spike-exchange"); - - { - Instrumentor::phase p("imbalance"); - wait_before_spike_exchange(); - } - - Instrumentor::phase_begin("communication"); - if (!displs) { - np = nrnmpi_numprocs; - displs = (int*) emalloc(np * sizeof(int)); - displs[0] = 0; -#if nrn_spikebuf_size > 0 - make_spikebuf_type(); -#endif - } -#if nrn_spikebuf_size == 0 - MPI_Allgather(&nout_, 1, MPI_INT, nin_, 1, MPI_INT, nrnmpi_comm); - n = nin_[0]; - for (int i = 1; i < np; ++i) { - displs[i] = n; - n += nin_[i]; - } - if (n) { - if (icapacity_ < n) { - icapacity_ = n + 10; - free(spikein_); - spikein_ = (NRNMPI_Spike*) emalloc(icapacity_ * sizeof(NRNMPI_Spike)); - } - MPI_Allgatherv( - spikeout_, nout_, spike_type, spikein_, nin_, displs, spike_type, nrnmpi_comm); - } -#else - MPI_Allgather(spbufout_, 1, spikebuf_type, spbufin_, 1, spikebuf_type, nrnmpi_comm); - int novfl = 0; - n = spbufin_[0].nspike; - if (n > nrn_spikebuf_size) { - nin_[0] = n - nrn_spikebuf_size; - novfl += nin_[0]; - } else { - nin_[0] = 0; - } - for (int i = 1; i < np; ++i) { - displs[i] = novfl; - int n1 = 
spbufin_[i].nspike; - n += n1; - if (n1 > nrn_spikebuf_size) { - nin_[i] = n1 - nrn_spikebuf_size; - novfl += nin_[i]; - } else { - nin_[i] = 0; - } - } - if (novfl) { - if (icapacity_ < novfl) { - icapacity_ = novfl + 10; - free(spikein_); - spikein_ = (NRNMPI_Spike*) hoc_Emalloc(icapacity_ * sizeof(NRNMPI_Spike)); - hoc_malchk(); - } - int n1 = (nout_ > nrn_spikebuf_size) ? nout_ - nrn_spikebuf_size : 0; - MPI_Allgatherv(spikeout_, n1, spike_type, spikein_, nin_, displs, spike_type, nrnmpi_comm); - } - ovfl_ = novfl; -#endif - Instrumentor::phase_end("communication"); - Instrumentor::phase_end("spike-exchange"); - return n; -} - -/* -The compressed spike format is restricted to the fixed step method and is -a sequence of unsigned char. -nspike = buf[0]*256 + buf[1] -a sequence of spiketime, localgid pairs. There are nspike of them. - spiketime is relative to the last transfer time in units of dt. - note that this requires a mindelay < 256*dt. - localgid is an unsigned int, unsigned short, - or unsigned char in size depending on the range and thus takes - 4, 2, or 1 byte respectively. To be machine independent we do our - own byte coding. When the localgid range is smaller than the true - gid range, the gid->PreSyn are remapped into - hostid specific maps. If there are not many holes, i.e just about every - spike from a source machine is delivered to some cell on a - target machine, then instead of a hash map, a vector is used. -The allgather sends the first part of the buf and the allgatherv buffer -sends any overflow. -*/ -int nrnmpi_spike_exchange_compressed() { - if (!displs) { - np = nrnmpi_numprocs; - displs = (int*) emalloc(np * sizeof(int)); - displs[0] = 0; - byteovfl = (int*) emalloc(np * sizeof(int)); - } - - MPI_Allgather( - spfixout_, ag_send_size_, MPI_BYTE, spfixin_, ag_send_size_, MPI_BYTE, nrnmpi_comm); - int novfl = 0; - int ntot = 0; - int bstot = 0; - for (int i = 0; i < np; ++i) { - displs[i] = bstot; - int idx = i * ag_send_size_; - int n = spfixin_[idx++] * 256; - n += spfixin_[idx++]; - ntot += n; - nin_[i] = n; - if (n > ag_send_nspike_) { - int bs = 2 + n * (1 + localgid_size_) - ag_send_size_; - byteovfl[i] = bs; - bstot += bs; - novfl += n - ag_send_nspike_; - } else { - byteovfl[i] = 0; - } - } - if (novfl) { - if (ovfl_capacity_ < novfl) { - ovfl_capacity_ = novfl + 10; - free(spfixin_ovfl_); - spfixin_ovfl_ = (unsigned char*) emalloc(ovfl_capacity_ * (1 + localgid_size_) * - sizeof(unsigned char)); - } - int bs = byteovfl[nrnmpi_myid]; - /* - note that the spfixout_ buffer is one since the overflow - is contiguous to the first part. But the spfixin_ovfl_ is - completely separate from the spfixin_ since the latter - dynamically changes its size during a run. 
- */ - MPI_Allgatherv(spfixout_ + ag_send_size_, - bs, - MPI_BYTE, - spfixin_ovfl_, - byteovfl, - displs, - MPI_BYTE, - nrnmpi_comm); - } - ovfl_ = novfl; - return ntot; -} - -int nrnmpi_int_allmax(int x) { - int result; - if (nrnmpi_numprocs < 2) { - return x; - } - MPI_Allreduce(&x, &result, 1, MPI_INT, MPI_MAX, nrnmpi_comm); - return result; -} - -extern void nrnmpi_int_gather(int* s, int* r, int cnt, int root) { - MPI_Gather(s, cnt, MPI_INT, r, cnt, MPI_INT, root, nrnmpi_comm); -} - -extern void nrnmpi_int_gatherv(int* s, int scnt, int* r, int* rcnt, int* rdispl, int root) { - MPI_Gatherv(s, scnt, MPI_INT, r, rcnt, rdispl, MPI_INT, root, nrnmpi_comm); -} - -extern void nrnmpi_int_alltoall(int* s, int* r, int n) { - MPI_Alltoall(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); -} - -extern void nrnmpi_int_alltoallv(const int* s, - const int* scnt, - const int* sdispl, - int* r, - int* rcnt, - int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_INT, r, rcnt, rdispl, MPI_INT, nrnmpi_comm); -} - -extern void nrnmpi_dbl_alltoallv(double* s, - int* scnt, - int* sdispl, - double* r, - int* rcnt, - int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_DOUBLE, r, rcnt, rdispl, MPI_DOUBLE, nrnmpi_comm); -} - -extern void nrnmpi_char_alltoallv(char* s, - int* scnt, - int* sdispl, - char* r, - int* rcnt, - int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_CHAR, r, rcnt, rdispl, MPI_CHAR, nrnmpi_comm); -} - -/* following are for the partrans */ - -void nrnmpi_int_allgather(int* s, int* r, int n) { - MPI_Allgather(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); -} - -void nrnmpi_int_allgatherv(int* s, int* r, int* n, int* dspl) { - MPI_Allgatherv(s, n[nrnmpi_myid], MPI_INT, r, n, dspl, MPI_INT, nrnmpi_comm); -} - -void nrnmpi_dbl_allgatherv(double* s, double* r, int* n, int* dspl) { - MPI_Allgatherv(s, n[nrnmpi_myid], MPI_DOUBLE, r, n, dspl, MPI_DOUBLE, nrnmpi_comm); -} - -void nrnmpi_dbl_broadcast(double* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, MPI_DOUBLE, root, nrnmpi_comm); -} - -void nrnmpi_int_broadcast(int* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, MPI_INT, root, nrnmpi_comm); -} - -void nrnmpi_char_broadcast(char* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, MPI_CHAR, root, nrnmpi_comm); -} - -int nrnmpi_int_sum_reduce(int in) { - int result; - MPI_Allreduce(&in, &result, 1, MPI_INT, MPI_SUM, nrnmpi_comm); - return result; -} - -void nrnmpi_assert_opstep(int opstep, double tt) { - /* all machines in comm should have same opstep and same tt. */ - double buf[2]; - if (nrnmpi_numprocs < 2) { - return; - } - buf[0] = (double) opstep; - buf[1] = tt; - MPI_Bcast(buf, 2, MPI_DOUBLE, 0, nrnmpi_comm); - if (opstep != (int) buf[0] || tt != buf[1]) { - printf("%d opstep=%d %d t=%g t-troot=%g\n", - nrnmpi_myid, - opstep, - (int) buf[0], - tt, - tt - buf[1]); - hoc_execerror("nrnmpi_assert_opstep failed", (char*) 0); - } -} - -double nrnmpi_dbl_allmin(double x) { - double result; - if (!nrnmpi_use || (nrnmpi_numprocs < 2)) { - return x; - } - MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); - return result; -} - -double nrnmpi_dbl_allmax(double x) { - double result; - if (!nrnmpi_use || (nrnmpi_numprocs < 2)) { - return x; - } - MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MAX, nrnmpi_comm); - return result; -} - -static void pgvts_op(double* in, double* inout, int* len, MPI_Datatype* dptr) { - int r = 0; - if (*dptr != MPI_DOUBLE) - printf("ERROR in mpispike.c! *dptr should be MPI_DOUBLE."); - if (*len != 4) - printf("ERROR in mpispike.c! 
*len should be 4."); - if (in[0] < inout[0]) { - /* least time has highest priority */ - r = 1; - } else if (in[0] == inout[0]) { - /* when times are equal then */ - if (in[1] < inout[1]) { - /* NetParEvent done last */ - r = 1; - } else if (in[1] == inout[1]) { - /* when times and ops are equal then */ - if (in[2] < inout[2]) { - /* init done next to last.*/ - r = 1; - } else if (in[2] == inout[2]) { - /* when times, ops, and inits are equal then */ - if (in[3] < inout[3]) { - /* choose lowest rank */ - r = 1; - } - } - } - } - if (r) { - for (int i = 0; i < 4; ++i) { - inout[i] = in[i]; - } - } -} - -int nrnmpi_pgvts_least(double* tt, int* op, int* init) { - double ibuf[4], obuf[4]; - ibuf[0] = *tt; - ibuf[1] = (double) (*op); - ibuf[2] = (double) (*init); - ibuf[3] = (double) nrnmpi_myid; - std::memcpy(obuf, ibuf, 4 * sizeof(double)); - - MPI_Allreduce(ibuf, obuf, 4, MPI_DOUBLE, mpi_pgvts_op, nrnmpi_comm); - assert(obuf[0] <= *tt); - if (obuf[0] == *tt) { - assert((int) obuf[1] <= *op); - if ((int) obuf[1] == *op) { - assert((int) obuf[2] <= *init); - if ((int) obuf[2] == *init) { - assert((int) obuf[3] <= nrnmpi_myid); - } - } - } - *tt = obuf[0]; - *op = (int) obuf[1]; - *init = (int) obuf[2]; - if (nrnmpi_myid == (int) obuf[3]) { - return 1; - } - return 0; -} - -/* following for splitcell.cpp transfer */ -void nrnmpi_send_doubles(double* pd, int cnt, int dest, int tag) { - MPI_Send(pd, cnt, MPI_DOUBLE, dest, tag, nrnmpi_comm); -} - -void nrnmpi_recv_doubles(double* pd, int cnt, int src, int tag) { - MPI_Status status; - MPI_Recv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, &status); -} - -void nrnmpi_postrecv_doubles(double* pd, int cnt, int src, int tag, void** request) { - MPI_Irecv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, (MPI_Request*) request); -} - -void nrnmpi_wait(void** request) { - MPI_Status status; - MPI_Wait((MPI_Request*) request, &status); -} - -void nrnmpi_barrier() { - if (nrnmpi_numprocs > 1) { - MPI_Barrier(nrnmpi_comm); - } -} - -double nrnmpi_dbl_allreduce(double x, int type) { - double result; - MPI_Op tt; - if (nrnmpi_numprocs < 2) { - return x; - } - if (type == 1) { - tt = MPI_SUM; - } else if (type == 2) { - tt = MPI_MAX; - } else { - tt = MPI_MIN; - } - MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, tt, nrnmpi_comm); - return result; -} - -long nrnmpi_long_allreduce(long x, int type) { - long result; - MPI_Op tt; - if (nrnmpi_numprocs < 2) { - return x; - } - if (type == 1) { - tt = MPI_SUM; - } else if (type == 2) { - tt = MPI_MAX; - } else { - tt = MPI_MIN; - } - MPI_Allreduce(&x, &result, 1, MPI_LONG, tt, nrnmpi_comm); - return result; -} - -void nrnmpi_dbl_allreduce_vec(double* src, double* dest, int cnt, int type) { - MPI_Op tt; - assert(src != dest); - if (nrnmpi_numprocs < 2) { - std::memcpy(dest, src, cnt * sizeof(double)); - return; - } - if (type == 1) { - tt = MPI_SUM; - } else if (type == 2) { - tt = MPI_MAX; - } else { - tt = MPI_MIN; - } - MPI_Allreduce(src, dest, cnt, MPI_DOUBLE, tt, nrnmpi_comm); - return; -} - -void nrnmpi_long_allreduce_vec(long* src, long* dest, int cnt, int type) { - MPI_Op tt; - assert(src != dest); - if (nrnmpi_numprocs < 2) { - std::memcpy(dest, src, cnt * sizeof(long)); - return; - } - if (type == 1) { - tt = MPI_SUM; - } else if (type == 2) { - tt = MPI_MAX; - } else { - tt = MPI_MIN; - } - MPI_Allreduce(src, dest, cnt, MPI_LONG, tt, nrnmpi_comm); - return; -} - -void nrnmpi_dbl_allgather(double* s, double* r, int n) { - MPI_Allgather(s, n, MPI_DOUBLE, r, n, MPI_DOUBLE, nrnmpi_comm); -} - -#if NRN_MULTISEND - -static 
MPI_Comm multisend_comm; - -void nrnmpi_multisend_comm() { - if (!multisend_comm) { - MPI_Comm_dup(MPI_COMM_WORLD, &multisend_comm); - } -} - -void nrnmpi_multisend(NRNMPI_Spike* spk, int n, int* hosts) { - MPI_Request r; - for (int i = 0; i < n; ++i) { - MPI_Isend(spk, 1, spike_type, hosts[i], 1, multisend_comm, &r); - MPI_Request_free(&r); - } -} - -int nrnmpi_multisend_single_advance(NRNMPI_Spike* spk) { - int flag = 0; - MPI_Status status; - MPI_Iprobe(MPI_ANY_SOURCE, 1, multisend_comm, &flag, &status); - if (flag) { - MPI_Recv(spk, 1, spike_type, MPI_ANY_SOURCE, 1, multisend_comm, &status); - } - return flag; -} - -int nrnmpi_multisend_conserve(int nsend, int nrecv) { - int tcnts[2]; - tcnts[0] = nsend - nrecv; - MPI_Allreduce(tcnts, tcnts + 1, 1, MPI_INT, MPI_SUM, multisend_comm); - return tcnts[1]; -} - -#endif /*NRN_MULTISEND*/ -} // namespace coreneuron -#endif /*NRNMPI*/ diff --git a/coreneuron/mpi/mpispike.hpp b/coreneuron/mpi/mpispike.hpp deleted file mode 100644 index 4ab5c2ab4..000000000 --- a/coreneuron/mpi/mpispike.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# =============================================================================. -*/ - -#ifndef nrnmpispike_h -#define nrnmpispike_h - -#if NRNMPI - -#ifndef nrn_spikebuf_size -#define nrn_spikebuf_size 0 -#endif - -namespace coreneuron { - -#if nrn_spikebuf_size > 0 -struct NRNMPI_Spikebuf { - int nspike; - int gid[nrn_spikebuf_size]; - double spiketime[nrn_spikebuf_size]; -}; -#endif - -#define icapacity_ nrnmpi_i_capacity_ -#define spikeout_ nrnmpi_spikeout_ -#define spikein_ nrnmpi_spikein_ -#define nout_ nrnmpi_nout_ -#define nin_ nrnmpi_nin_ -extern int nout_; -extern int* nin_; -extern int icapacity_; -extern NRNMPI_Spike* spikeout_; -extern NRNMPI_Spike* spikein_; - -#define spfixout_ nrnmpi_spikeout_fixed_ -#define spfixin_ nrnmpi_spikein_fixed_ -#define spfixin_ovfl_ nrnmpi_spikein_fixed_ovfl_ -#define localgid_size_ nrnmpi_localgid_size_ -#define ag_send_size_ nrnmpi_ag_send_size_ -#define ag_send_nspike_ nrnmpi_send_nspike_ -#define ovfl_capacity_ nrnmpi_ovfl_capacity_ -#define ovfl_ nrnmpi_ovfl_ -extern int localgid_size_; /* bytes */ -extern int ag_send_size_; /* bytes */ -extern int ag_send_nspike_; /* spikes */ -extern int ovfl_capacity_; /* spikes */ -extern int ovfl_; /* spikes */ -extern unsigned char* spfixout_; -extern unsigned char* spfixin_; -extern unsigned char* spfixin_ovfl_; - -#if nrn_spikebuf_size > 0 -#define spbufout_ nrnmpi_spbufout_ -#define spbufin_ nrnmpi_spbufin_ -extern NRNMPI_Spikebuf* spbufout_; -extern NRNMPI_Spikebuf* spbufin_; -#endif - -} // namespace coreneuron - -#endif // NRNMPI -#endif diff --git a/coreneuron/mpi/nrnmpi.h b/coreneuron/mpi/nrnmpi.h index 1513ca4b3..61d535b11 100644 --- a/coreneuron/mpi/nrnmpi.h +++ b/coreneuron/mpi/nrnmpi.h @@ -6,48 +6,88 @@ # =============================================================================. */ -#ifndef nrnmpi_h -#define nrnmpi_h +#pragma once +#include #include +#include +#include #include "coreneuron/mpi/nrnmpiuse.h" +#if NRNMPI +#ifndef nrn_spikebuf_size +#define nrn_spikebuf_size 0 +#endif + namespace coreneuron { -/* by default nrnmpi_numprocs_world = nrnmpi_numprocs = nrnmpi_numsubworlds and - nrnmpi_myid_world = nrnmpi_myid and the bulletin board and network communication do - not easily coexist. 
-   ParallelContext.subworlds(nsmall) divides the world into
-   nrnmpi_numprocs_world/small subworlds of size nsmall.
-*/
-extern int nrnmpi_numprocs_world; /* size of entire world. total size of all subworlds */
-extern int nrnmpi_myid_world;     /* rank in entire world */
-extern int nrnmpi_numprocs;       /* size of subworld */
-extern int nrnmpi_myid;           /* rank in subworld */
-extern int nrnmpi_numprocs_bbs;   /* number of subworlds */
-extern int nrnmpi_myid_bbs;       /* rank in nrn_bbs_comm of rank 0 of a subworld */
-
-extern void nrn_abort(int errcode);
-extern void nrn_fatal_error(const char* msg);
-extern double nrn_wtime(void);
-extern int nrnmpi_local_rank();
-extern int nrnmpi_local_size();
+struct NRNMPI_Spikebuf {
+    int nspike;
+    int gid[nrn_spikebuf_size];
+    double spiketime[nrn_spikebuf_size];
+};
 }  // namespace coreneuron
 
-#if defined(NRNMPI)
-
 namespace coreneuron {
-typedef struct {
+struct NRNMPI_Spike {
     int gid;
     double spiketime;
-} NRNMPI_Spike;
+};
+
+// These functions and classes are part of a mechanism to dynamically or statically load mpi functions
+struct mpi_function_base;
+
+struct mpi_manager_t {
+    void register_function(mpi_function_base* ptr) {
+        m_function_ptrs.push_back(ptr);
+    }
+    void resolve_symbols(void* dlsym_handle);
+
+  private:
+    std::vector<mpi_function_base*> m_function_ptrs;
+    // true when symbols are resolved
+};
+
+inline mpi_manager_t& mpi_manager() {
+    static mpi_manager_t x;
+    return x;
+}
+
+struct mpi_function_base {
+    void resolve(void* dlsym_handle);
+    operator bool() const { return m_fptr; }
+    mpi_function_base(const char* name)
+        : m_name{name} {
+        mpi_manager().register_function(this);
+    }
+
+  protected:
+    void* m_fptr{};
+    const char* m_name;
+};
 
-extern bool nrnmpi_use; /* NEURON does MPI init and terminate?*/
+// This could be done with a simpler
+// template <auto fptr> struct function : function_base { ... };
+// pattern in C++17...
+template <typename>
+struct mpi_function {};
 
-// Write given buffer to a new file using MPI collective I/O
-extern void nrnmpi_write_file(const std::string& filename, const char* buffer, size_t length);
+#define cnrn_make_integral_constant_t(x) std::integral_constant<std::decay_t<decltype(x)>, x>
+template <typename function_ptr, function_ptr fptr>
+struct mpi_function<std::integral_constant<function_ptr, fptr>> : mpi_function_base {
+    using mpi_function_base::mpi_function_base;
+    template <typename... Args>  // in principle deducible from `function_ptr`
+    auto operator()(Args&&... args) const {
+#ifdef CORENRN_ENABLE_DYNAMIC_MPI
+        // Dynamic MPI, m_fptr should have been initialised via dlsym.
+        assert(m_fptr);
+        return (*reinterpret_cast<function_ptr>(m_fptr))(std::forward<Args>(args)...);
+#else
+        // No dynamic MPI, use `fptr` directly. Will produce link errors if libmpi.so is not linked.
+        return (*fptr)(std::forward<Args>(args)...);
+#endif
+    }
+};
 }  // namespace coreneuron
 
 #include "coreneuron/mpi/nrnmpidec.h"
 
 #endif /*NRNMPI*/
-#endif /*nrnmpi_h*/
diff --git a/coreneuron/mpi/nrnmpi_def_cinc.h b/coreneuron/mpi/nrnmpi_def_cinc.h
deleted file mode 100644
index 6dd551782..000000000
--- a/coreneuron/mpi/nrnmpi_def_cinc.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
-# =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
-#
-# See top-level LICENSE file for details.
-# =============================================================================.
-*/
-
-namespace coreneuron {
-bool nrnmpi_use;
-int nrnmpi_numprocs = 1; /* size */
-int nrnmpi_myid = 0;     /* rank */
-int nrnmpi_numprocs_world = 1;
-int nrnmpi_myid_world = 0;
-int nrnmpi_numprocs_bbs = 1;
-int nrnmpi_myid_bbs = 0;
-
-int nrnmpi_nout_;
-int* nrnmpi_nin_;
-int nrnmpi_i_capacity_;
-
-#if NRNMPI
-NRNMPI_Spike* nrnmpi_spikeout_;
-NRNMPI_Spike* nrnmpi_spikein_;
-#endif
-
-int nrnmpi_localgid_size_;
-int nrnmpi_ag_send_size_;
-int nrnmpi_send_nspike_;
-int nrnmpi_ovfl_capacity_;
-int nrnmpi_ovfl_;
-unsigned char* nrnmpi_spikeout_fixed_;
-unsigned char* nrnmpi_spikein_fixed_;
-unsigned char* nrnmpi_spikein_fixed_ovfl_;
-}  // namespace coreneuron
diff --git a/coreneuron/mpi/nrnmpidec.h b/coreneuron/mpi/nrnmpidec.h
index 74d282fb4..79160a9c0 100644
--- a/coreneuron/mpi/nrnmpidec.h
+++ b/coreneuron/mpi/nrnmpidec.h
@@ -11,109 +11,93 @@ This file is processed by mkdynam.sh and so it is important that
 the prototypes be of the form "type foo(type arg, ...)"
 */
-#ifndef nrnmpidec_h
-#define nrnmpidec_h
-
+#pragma once
 #if NRNMPI
 #include <string>
-namespace coreneuron {
-/* from bbsmpipack.c */
-typedef struct bbsmpibuf {
-    char* buf;
-    int size;
-    int pkposition;
-    int upkpos;
-    int keypos;
-    int refcount;
-} bbsmpibuf;
-
-extern bbsmpibuf* nrnmpi_newbuf(int size);
-extern void nrnmpi_copy(bbsmpibuf* dest, bbsmpibuf* src);
-extern void nrnmpi_ref(bbsmpibuf* buf);
-extern void nrnmpi_unref(bbsmpibuf* buf);
-
-extern void nrnmpi_upkbegin(bbsmpibuf* buf);
-extern char* nrnmpi_getkey(bbsmpibuf* buf);
-extern int nrnmpi_getid(bbsmpibuf* buf);
-extern int nrnmpi_upkint(bbsmpibuf* buf);
-extern double nrnmpi_upkdouble(bbsmpibuf* buf);
-extern void nrnmpi_upkvec(int n, double* x, bbsmpibuf* buf);
-extern char* nrnmpi_upkstr(bbsmpibuf* buf);
-extern char* nrnmpi_upkpickle(size_t* size, bbsmpibuf* buf);
-
-extern void nrnmpi_pkbegin(bbsmpibuf* buf);
-extern void nrnmpi_enddata(bbsmpibuf* buf);
-extern void nrnmpi_pkint(int i, bbsmpibuf* buf);
-extern void nrnmpi_pkdouble(double x, bbsmpibuf* buf);
-extern void nrnmpi_pkvec(int n, double* x, bbsmpibuf* buf);
-extern void nrnmpi_pkstr(const char* s, bbsmpibuf* buf);
-extern void nrnmpi_pkpickle(const char* s, size_t size, bbsmpibuf* buf);
-
-extern int nrnmpi_iprobe(int* size, int* tag, int* source);
-extern void nrnmpi_bbssend(int dest, int tag, bbsmpibuf* r);
-extern int nrnmpi_bbsrecv(int source, bbsmpibuf* r);
-extern int nrnmpi_bbssendrecv(int dest, int tag, bbsmpibuf* s, bbsmpibuf* r);
+namespace coreneuron {
 /* from nrnmpi.cpp */
-extern void nrnmpi_init(int* pargc, char*** pargv);
-extern int nrnmpi_wrap_mpi_init(int* flag);
-extern void nrnmpi_finalize(void);
-extern void nrnmpi_terminate();
-extern void nrnmpi_subworld_size(int n);
-extern int nrn_wrap_mpi_init(int* flag);
-extern void nrnmpi_check_threading_support();
+struct nrnmpi_init_ret_t {
+    int numprocs;
+    int myid;
+};
+extern "C" nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_init_impl)> nrnmpi_init;
+extern "C" void nrnmpi_finalize_impl(void);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_finalize_impl)> nrnmpi_finalize;
+extern "C" void nrnmpi_check_threading_support_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_check_threading_support_impl)>
+    nrnmpi_check_threading_support;
+// Write given buffer to a new file using MPI collective I/O
+extern "C" void nrnmpi_write_file_impl(const std::string& filename, const char* buffer, size_t length);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_write_file_impl)> nrnmpi_write_file;
 
-/* from mpispike.c */
-extern void nrnmpi_spike_initialize(void);
-extern int nrnmpi_spike_exchange(void);
-extern int nrnmpi_spike_exchange_compressed(void);
-extern int nrnmpi_int_allmax(int i);
-extern void nrnmpi_int_gather(int* s, int* r, int cnt, int root);
-extern void nrnmpi_int_gatherv(int* s, int scnt, int* r, int* rcnt, int* rdispl, int root);
-extern void nrnmpi_int_allgather(int* s, int* r, int n);
-extern void nrnmpi_int_allgatherv(int* s, int* r, int* n, int* dspl);
-extern void nrnmpi_int_alltoall(int* s, int* r, int n);
-extern void nrnmpi_int_alltoallv(const int* s,
-                                 const int* scnt,
-                                 const int* sdispl,
-                                 int* r,
-                                 int* rcnt,
-                                 int* rdispl);
-extern void nrnmpi_dbl_allgatherv(double* s, double* r, int* n, int* dspl);
-extern void nrnmpi_dbl_alltoallv(double* s,
-                                 int* scnt,
-                                 int* sdispl,
-                                 double* r,
-                                 int* rcnt,
-                                 int* rdispl);
-extern void nrnmpi_char_alltoallv(char* s, int* scnt, int* sdispl, char* r, int* rcnt, int* rdispl);
-extern void nrnmpi_dbl_broadcast(double* buf, int cnt, int root);
-extern void nrnmpi_int_broadcast(int* buf, int cnt, int root);
-extern void nrnmpi_char_broadcast(char* buf, int cnt, int root);
-extern int nrnmpi_int_sum_reduce(int in);
-extern void nrnmpi_assert_opstep(int opstep, double t);
-extern double nrnmpi_dbl_allmin(double x);
-extern double nrnmpi_dbl_allmax(double x);
-extern int nrnmpi_pgvts_least(double* t, int* op, int* init);
-extern void nrnmpi_send_doubles(double* pd, int cnt, int dest, int tag);
-extern void nrnmpi_recv_doubles(double* pd, int cnt, int src, int tag);
-extern void nrnmpi_postrecv_doubles(double* pd, int cnt, int src, int tag, void** request);
-extern void nrnmpi_wait(void** request);
-extern void nrnmpi_barrier(void);
-extern double nrnmpi_dbl_allreduce(double x, int type);
-extern long nrnmpi_long_allreduce(long x, int type);
-extern void nrnmpi_dbl_allreduce_vec(double* src, double* dest, int cnt, int type);
-extern void nrnmpi_long_allreduce_vec(long* src, long* dest, int cnt, int type);
-extern void nrnmpi_dbl_allgather(double* s, double* r, int n);
-extern int nrnmpi_initialized();
+
+/* from mpispike.cpp */
+extern "C" int nrnmpi_spike_exchange_impl(int* nin, NRNMPI_Spike* spikeout, int icapacity, NRNMPI_Spike* spikein, int& ovfl, int nout, NRNMPI_Spikebuf* spbufout, NRNMPI_Spikebuf* spbufin);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_impl)>
+    nrnmpi_spike_exchange;
+extern "C" int nrnmpi_spike_exchange_compressed_impl(int, unsigned char*, int, int*, int, unsigned char*, int, unsigned char*, int& ovfl);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_spike_exchange_compressed_impl)>
+    nrnmpi_spike_exchange_compressed;
+extern "C" int nrnmpi_int_allmax_impl(int i);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allmax_impl)> nrnmpi_int_allmax;
+extern "C" void nrnmpi_int_allgather_impl(int* s, int* r, int n);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_allgather_impl)> nrnmpi_int_allgather;
+extern "C" void nrnmpi_int_alltoall_impl(int* s, int* r, int n);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoall_impl)> nrnmpi_int_alltoall;
+extern "C" void nrnmpi_int_alltoallv_impl(const int* s,
+                                          const int* scnt,
+                                          const int* sdispl,
+                                          int* r,
+                                          int* rcnt,
+                                          int* rdispl);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_int_alltoallv_impl)> nrnmpi_int_alltoallv;
+extern "C" void nrnmpi_dbl_alltoallv_impl(double* s,
+                                          int* scnt,
+                                          int* sdispl,
+                                          double* r,
+                                          int* rcnt,
+                                          int* rdispl);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_alltoallv_impl)> nrnmpi_dbl_alltoallv;
+extern "C" double nrnmpi_dbl_allmin_impl(double x);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmin_impl)> nrnmpi_dbl_allmin;
+extern "C" double nrnmpi_dbl_allmax_impl(double x);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allmax_impl)> nrnmpi_dbl_allmax;
+extern "C" void nrnmpi_barrier_impl(void);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_barrier_impl)> nrnmpi_barrier;
+extern "C" double nrnmpi_dbl_allreduce_impl(double x, int type);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_impl)> nrnmpi_dbl_allreduce;
+extern "C" void nrnmpi_dbl_allreduce_vec_impl(double* src, double* dest, int cnt, int type);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_dbl_allreduce_vec_impl)>
+    nrnmpi_dbl_allreduce_vec;
+extern "C" void nrnmpi_long_allreduce_vec_impl(long* src, long* dest, int cnt, int type);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_long_allreduce_vec_impl)>
+    nrnmpi_long_allreduce_vec;
+extern "C" bool nrnmpi_initialized_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_initialized_impl)> nrnmpi_initialized;
+extern "C" void nrnmpi_abort_impl(int);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_abort_impl)> nrnmpi_abort;
+extern "C" double nrnmpi_wtime_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_wtime_impl)> nrnmpi_wtime;
+extern "C" int nrnmpi_local_rank_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_rank_impl)> nrnmpi_local_rank;
+extern "C" int nrnmpi_local_size_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_local_size_impl)> nrnmpi_local_size;
 
 #if NRN_MULTISEND
-extern void nrnmpi_multisend_comm();
-extern void nrnmpi_multisend(NRNMPI_Spike* spk, int n, int* hosts);
-extern int nrnmpi_multisend_single_advance(NRNMPI_Spike* spk);
-extern int nrnmpi_multisend_conserve(int nsend, int nrecv);
+extern "C" void nrnmpi_multisend_comm_impl();
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_comm_impl)>
+    nrnmpi_multisend_comm;
+extern "C" void nrnmpi_multisend_impl(NRNMPI_Spike* spk, int n, int* hosts);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_impl)> nrnmpi_multisend;
+extern "C" int nrnmpi_multisend_single_advance_impl(NRNMPI_Spike* spk);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_single_advance_impl)>
+    nrnmpi_multisend_single_advance;
+extern "C" int nrnmpi_multisend_conserve_impl(int nsend, int nrecv);
+extern mpi_function<cnrn_make_integral_constant_t(nrnmpi_multisend_conserve_impl)>
+    nrnmpi_multisend_conserve;
 #endif
 }  // namespace coreneuron
 #endif
-#endif
diff --git a/coreneuron/mpi/nrnmpiuse.h b/coreneuron/mpi/nrnmpiuse.h
index 058cddc34..2e9c090e7 100644
--- a/coreneuron/mpi/nrnmpiuse.h
+++ b/coreneuron/mpi/nrnmpiuse.h
@@ -6,8 +6,7 @@
 # =============================================================================.
 */
 
-#ifndef usenrnmpi_h
-#define usenrnmpi_h
+#pragma once
 
 /* define to 1 if you want MPI specific features activated
    (optionally provided by CMake option NRNMPI) */
@@ -23,9 +22,6 @@
 /* define to 1 if you want parallel distributed cells (and gap junctions) */
 #define PARANEURON 1
 
-/* define to 1 if you want mpi dynamically loaded instead of linked normally */
-#undef NRNMPI_DYNAMICLOAD
-
 /* define to 1 if you want the MUSIC - MUlti SImulation Coordinator */
 #undef NRN_MUSIC
 
@@ -37,5 +33,3 @@
 /* Define to 1 for possibility of rank 0 xopen/ropen a file and broadcast everywhere */
 #undef USE_NRNFILEWRAP
-
-#endif
diff --git a/coreneuron/network/have2want.h b/coreneuron/network/have2want.h
index c710cef00..9e1630577 100644
--- a/coreneuron/network/have2want.h
+++ b/coreneuron/network/have2want.h
@@ -19,6 +19,8 @@ Need to define HAVEWANT_t, HAVEWANT_alltoallv, and HAVEWANT2Int
 #define have2want_h
 
 #include "coreneuron/utils/nrnoc_aux.hpp"
+#include "coreneuron/apps/corenrn_parameters.hpp"
+#include "coreneuron/mpi/core/nrnmpi.hpp"
 
 /*
 
@@ -28,7 +30,7 @@ ranks want their information.
 Ranks that want info do not know which ranks own that info.
 
 The have_to_want function returns two new vectors of keys along with
-associated count and displacement vectors of length nhost and nhost+1
+associated count and displacement vectors of length nrnmpi_numprocs and nrnmpi_numprocs+1
 respectively. Note that a
 send_to_want_displ[i+1] = send_to_want_cnt[i] + send_to_want_displ[i] .
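The comment above states the one invariant everything in have2want.h relies on: a displacement vector is the exclusive prefix sum of a count vector, so its last entry is the total number of exchanged items. A minimal self-contained sketch of that relationship (illustrative only, not part of the patch; the patch's cnt2displ in the hunks below operates on raw int arrays of length nrnmpi_numprocs):

    #include <cstddef>
    #include <vector>

    // displ[0] == 0 and displ[i + 1] == displ[i] + cnt[i]; displ.back() is the
    // total item count and displ[i] is the offset where rank i's items begin.
    std::vector<int> cnt2displ_sketch(const std::vector<int>& cnt) {
        std::vector<int> displ(cnt.size() + 1, 0);
        for (std::size_t i = 0; i < cnt.size(); ++i) {
            displ[i + 1] = displ[i] + cnt[i];
        }
        return displ;
    }

These displacements are exactly what alltoallv-style exchanges (HAVEWANT_alltoallv below) consume as their sdispl/rdispl arguments.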
@@ -66,7 +68,7 @@ static int* cnt2displ(int* cnt) { static int* srccnt2destcnt(int* srccnt) { int* destcnt = new int[nrnmpi_numprocs]; #if NRNMPI - if (nrnmpi_numprocs > 1) { + if (corenrn_param.mpi_enable) { nrnmpi_int_alltoall(srccnt, destcnt, 1); } else #endif @@ -87,11 +89,9 @@ static void rendezvous_rank_get(HAVEWANT_t* data, int*& rcnt, int*& rdispl, int (*rendezvous_rank)(HAVEWANT_t)) { - int nhost = nrnmpi_numprocs; - // count what gets sent - scnt = new int[nhost]; - for (int i = 0; i < nhost; ++i) { + scnt = new int[nrnmpi_numprocs]; + for (int i = 0; i < nrnmpi_numprocs; ++i) { scnt[i] = 0; } for (int i = 0; i < size; ++i) { @@ -102,10 +102,10 @@ static void rendezvous_rank_get(HAVEWANT_t* data, sdispl = cnt2displ(scnt); rcnt = srccnt2destcnt(scnt); rdispl = cnt2displ(rcnt); - sdata = new HAVEWANT_t[sdispl[nhost]]; - rdata = new HAVEWANT_t[rdispl[nhost]]; + sdata = new HAVEWANT_t[sdispl[nrnmpi_numprocs]]; + rdata = new HAVEWANT_t[rdispl[nrnmpi_numprocs]]; // scatter data into sdata by recalculating scnt. - for (int i = 0; i < nhost; ++i) { + for (int i = 0; i < nrnmpi_numprocs; ++i) { scnt[i] = 0; } for (int i = 0; i < size; ++i) { @@ -114,12 +114,12 @@ static void rendezvous_rank_get(HAVEWANT_t* data, ++scnt[r]; } #if NRNMPI - if (nhost > 1) { + if (corenrn_param.mpi_enable) { HAVEWANT_alltoallv(sdata, scnt, sdispl, rdata, rcnt, rdispl); } else #endif { - for (int i = 0; i < sdispl[nhost]; ++i) { + for (int i = 0; i < sdispl[nrnmpi_numprocs]; ++i) { rdata[i] = sdata[i]; } } @@ -141,8 +141,6 @@ static void have_to_want(HAVEWANT_t* have, // 3) Rendezvous ranks tell the want ranks which ranks own the keys // 4) Ranks that want tell owner ranks where to send. - int nhost = nrnmpi_numprocs; - // 1) Send have and want to the rendezvous ranks. HAVEWANT_t *have_s_data, *have_r_data; int *have_s_cnt, *have_s_displ, *have_r_cnt, *have_r_displ; @@ -158,7 +156,7 @@ static void have_to_want(HAVEWANT_t* have, // assume it is an error if two ranks have the same key so create // hash table of key2rank. Will also need it for matching have and want HAVEWANT2Int havekey2rank = HAVEWANT2Int(); - for (int r = 0; r < nhost; ++r) { + for (int r = 0; r < nrnmpi_numprocs; ++r) { for (int i = 0; i < have_r_cnt[r]; ++i) { HAVEWANT_t key = have_r_data[have_r_displ[r] + i]; if (havekey2rank.find(key) != havekey2rank.end()) { @@ -192,9 +190,9 @@ static void have_to_want(HAVEWANT_t* have, // we already have made the havekey2rank map. // Create an array parallel to want_r_data which contains the ranks that // have that data. - int n = want_r_displ[nhost]; + int n = want_r_displ[nrnmpi_numprocs]; int* want_r_ownerranks = new int[n]; - for (int r = 0; r < nhost; ++r) { + for (int r = 0; r < nrnmpi_numprocs; ++r) { for (int i = 0; i < want_r_cnt[r]; ++i) { int ix = want_r_displ[r] + i; HAVEWANT_t key = want_r_data[ix]; @@ -213,9 +211,9 @@ static void have_to_want(HAVEWANT_t* have, // The want_s_ownerranks will be parallel to the want_s_data. 
// That is, each item defines the rank from which information associated // with that key is coming from - int* want_s_ownerranks = new int[want_s_displ[nhost]]; + int* want_s_ownerranks = new int[want_s_displ[nrnmpi_numprocs]]; #if NRNMPI - if (nhost > 1) { + if (corenrn_param.mpi_enable) { nrnmpi_int_alltoallv(want_r_ownerranks, want_r_cnt, want_r_displ, @@ -225,7 +223,7 @@ static void have_to_want(HAVEWANT_t* have, } else #endif { - for (int i = 0; i < want_r_displ[nhost]; ++i) { + for (int i = 0; i < want_r_displ[nrnmpi_numprocs]; ++i) { want_s_ownerranks[i] = want_r_ownerranks[i]; } } @@ -240,9 +238,9 @@ static void have_to_want(HAVEWANT_t* have, // The parallel want_s_ownerranks and want_s_data are now uselessly ordered // by rendezvous rank. Reorganize so that want ranks can tell owner ranks // what they want. - n = want_s_displ[nhost]; + n = want_s_displ[nrnmpi_numprocs]; delete[] want_s_displ; - for (int i = 0; i < nhost; ++i) { + for (int i = 0; i < nrnmpi_numprocs; ++i) { want_s_cnt[i] = 0; } HAVEWANT_t* old_want_s_data = want_s_data; @@ -253,7 +251,7 @@ static void have_to_want(HAVEWANT_t* have, ++want_s_cnt[r]; } want_s_displ = cnt2displ(want_s_cnt); - for (int i = 0; i < nhost; ++i) { + for (int i = 0; i < nrnmpi_numprocs; ++i) { want_s_cnt[i] = 0; } // recount while filling for (int i = 0; i < n; ++i) { @@ -266,15 +264,15 @@ static void have_to_want(HAVEWANT_t* have, delete[] old_want_s_data; want_r_cnt = srccnt2destcnt(want_s_cnt); want_r_displ = cnt2displ(want_r_cnt); - want_r_data = new HAVEWANT_t[want_r_displ[nhost]]; + want_r_data = new HAVEWANT_t[want_r_displ[nrnmpi_numprocs]]; #if NRNMPI - if (nhost > 1) { + if (corenrn_param.mpi_enable) { HAVEWANT_alltoallv( want_s_data, want_s_cnt, want_s_displ, want_r_data, want_r_cnt, want_r_displ); } else #endif { - for (int i = 0; i < want_s_displ[nhost]; ++i) { + for (int i = 0; i < want_s_displ[nrnmpi_numprocs]; ++i) { want_r_data[i] = want_s_data[i]; } } diff --git a/coreneuron/network/multisend_setup.cpp b/coreneuron/network/multisend_setup.cpp index ed721fd8e..c2fe25a6d 100644 --- a/coreneuron/network/multisend_setup.cpp +++ b/coreneuron/network/multisend_setup.cpp @@ -14,7 +14,9 @@ #include "coreneuron/nrniv/nrniv_decl.h" #include "coreneuron/network/multisend.hpp" #include "coreneuron/mpi/nrnmpidec.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/utils/memory_utils.h" +#include "coreneuron/utils/utils.hpp" /* For very large numbers of processors and cells and fanout, it is taking a long time to figure out each cells target list given the input gids diff --git a/coreneuron/network/netpar.cpp b/coreneuron/network/netpar.cpp index feaa981b4..cedd5feca 100644 --- a/coreneuron/network/netpar.cpp +++ b/coreneuron/network/netpar.cpp @@ -15,7 +15,6 @@ #include "coreneuron/nrnconf.h" #include "coreneuron/apps/corenrn_parameters.hpp" #include "coreneuron/sim/multicore.hpp" -#include "coreneuron/mpi/nrnmpi.h" #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/network/netcon.hpp" @@ -25,14 +24,27 @@ #include "coreneuron/network/multisend.hpp" #include "coreneuron/utils/nrn_assert.h" #include "coreneuron/utils/nrnoc_aux.hpp" +#include "coreneuron/utils/utils.hpp" #if NRNMPI -#include "coreneuron/mpi/mpispike.hpp" +#include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" +int localgid_size_; +int ag_send_nspike; +int* nrnmpi_nin_; +int ovfl_capacity; +int icapacity; +unsigned char* spikeout_fixed; +unsigned char* spfixin_ovfl_; +unsigned char* spikein_fixed; +int ag_send_size; +int ovfl; 
+int nout;
+coreneuron::NRNMPI_Spikebuf* spbufout;
+coreneuron::NRNMPI_Spikebuf* spbufin;
 #endif
 
 namespace coreneuron {
-
-extern corenrn_parameters corenrn_param;
 class PreSyn;
 class InputPreSyn;
 
@@ -42,11 +54,11 @@ static double dt1_;  // 1/dt
 void nrn_spike_exchange_init();
 
 #if NRNMPI
+NRNMPI_Spike* spikeout;
+NRNMPI_Spike* spikein;
 void nrn_timeout(int);
 void nrn_spike_exchange(NrnThread*);
-extern int nrnmpi_int_allmax(int);
-extern void nrnmpi_int_allgather(int*, int*, int);
 void nrn2ncs_outputevent(int netcon_output_index, double firetime);
 
 // for compressed gid info during spike exchange
@@ -54,7 +66,7 @@ bool nrn_use_localgid_;
 void nrn_outputevent(unsigned char localgid, double firetime);
 std::vector<std::map<int, InputPreSyn*>> localmaps;
 
-static int ocapacity_;  // for spikeout_
+static int ocapacity_;  // for spikeout
 // require it to be smaller than min_interprocessor_delay.
 static double wt_;   // wait time for nrnmpi_spike_exchange
 static double wt1_;  // time to find the PreSyns and send the spikes.
@@ -80,15 +92,15 @@ static OMP_Mutex mut;
 /// coming from nrnmpi.h and array of int of the global domain size
 static void alloc_mpi_space() {
 #if NRNMPI
-    if (!spikeout_) {
+    if (corenrn_param.mpi_enable && !spikeout) {
         ocapacity_ = 100;
-        spikeout_ = (NRNMPI_Spike*) emalloc(ocapacity_ * sizeof(NRNMPI_Spike));
-        icapacity_ = 100;
-        spikein_ = (NRNMPI_Spike*) malloc(icapacity_ * sizeof(NRNMPI_Spike));
-        nin_ = (int*) emalloc(nrnmpi_numprocs * sizeof(int));
+        spikeout = (NRNMPI_Spike*) emalloc(ocapacity_ * sizeof(NRNMPI_Spike));
+        icapacity = 100;
+        spikein = (NRNMPI_Spike*) malloc(icapacity * sizeof(NRNMPI_Spike));
+        nrnmpi_nin_ = (int*) emalloc(nrnmpi_numprocs * sizeof(int));
 #if nrn_spikebuf_size > 0
-        spbufout_ = (NRNMPI_Spikebuf*) emalloc(sizeof(NRNMPI_Spikebuf));
-        spbufin_ = (NRNMPI_Spikebuf*) emalloc(nrnmpi_numprocs * sizeof(NRNMPI_Spikebuf));
+        spbufout = (NRNMPI_Spikebuf*) emalloc(sizeof(NRNMPI_Spikebuf));
+        spbufin = (NRNMPI_Spikebuf*) emalloc(nrnmpi_numprocs * sizeof(NRNMPI_Spikebuf));
 #endif
     }
 #endif
@@ -135,18 +147,18 @@ void nrn_outputevent(unsigned char localgid, double firetime) {
         return;
     }
     std::lock_guard<OMP_Mutex> lock(mut);
-    nout_++;
+    nout++;
     int i = idxout_;
     idxout_ += 2;
    if (idxout_ >= spfixout_capacity_) {
         spfixout_capacity_ *= 2;
-        spfixout_ = (unsigned char*) erealloc(spfixout_,
-                                              spfixout_capacity_ * sizeof(unsigned char));
+        spikeout_fixed = (unsigned char*) erealloc(spikeout_fixed,
+                                                   spfixout_capacity_ * sizeof(unsigned char));
     }
-    spfixout_[i++] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5);
-    spfixout_[i] = localgid;
+    spikeout_fixed[i++] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5);
+    spikeout_fixed[i] = localgid;
     // printf("%d idx=%d lgid=%d firetime=%g t_exchange_=%g [0]=%d [1]=%d\n", nrnmpi_myid, i,
-    // (int)localgid, firetime, t_exchange_, (int)spfixout_[i-1], (int)spfixout_[i]);
+    // (int)localgid, firetime, t_exchange_, (int)spikeout_fixed[i-1], (int)spikeout_fixed[i]);
 }
 
 void nrn2ncs_outputevent(int gid, double firetime) {
@@ -155,47 +167,47 @@
     }
     std::lock_guard<OMP_Mutex> lock(mut);
     if (use_compress_) {
-        nout_++;
+        nout++;
         int i = idxout_;
         idxout_ += 1 + localgid_size_;
         if (idxout_ >= spfixout_capacity_) {
             spfixout_capacity_ *= 2;
-            spfixout_ = (unsigned char*) erealloc(spfixout_,
-                                                  spfixout_capacity_ * sizeof(unsigned char));
+            spikeout_fixed = (unsigned char*) erealloc(spikeout_fixed,
+                                                       spfixout_capacity_ * sizeof(unsigned char));
         }
         // printf("%d nrnncs_outputevent %d %.20g %.20g %d\n", nrnmpi_myid, gid, firetime,
// t_exchange_, //(int)((unsigned char)((firetime - t_exchange_)*dt1_ + .5))); - spfixout_[i++] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5); + spikeout_fixed[i++] = (unsigned char) ((firetime - t_exchange_) * dt1_ + .5); // printf("%d idx=%d firetime=%g t_exchange_=%g spfixout=%d\n", nrnmpi_myid, i, firetime, - // t_exchange_, (int)spfixout_[i-1]); - sppk(spfixout_ + i, gid); - // printf("%d idx=%d gid=%d spupk=%d\n", nrnmpi_myid, i, gid, spupk(spfixout_+i)); + // t_exchange_, (int)spikeout_fixed[i-1]); + sppk(spikeout_fixed + i, gid); + // printf("%d idx=%d gid=%d spupk=%d\n", nrnmpi_myid, i, gid, spupk(spikeout_fixed+i)); } else { #if nrn_spikebuf_size == 0 - int i = nout_++; + int i = nout++; if (i >= ocapacity_) { ocapacity_ *= 2; - spikeout_ = (NRNMPI_Spike*) erealloc(spikeout_, ocapacity_ * sizeof(NRNMPI_Spike)); + spikeout = (NRNMPI_Spike*) erealloc(spikeout, ocapacity_ * sizeof(NRNMPI_Spike)); } // printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime); - spikeout_[i].gid = gid; - spikeout_[i].spiketime = firetime; + spikeout[i].gid = gid; + spikeout[i].spiketime = firetime; #else - int i = nout_++; + int i = nout++; if (i >= nrn_spikebuf_size) { i -= nrn_spikebuf_size; if (i >= ocapacity_) { ocapacity_ *= 2; - spikeout_ = (NRNMPI_Spike*) hoc_Erealloc(spikeout_, - ocapacity_ * sizeof(NRNMPI_Spike)); + spikeout = (NRNMPI_Spike*) hoc_Erealloc(spikeout, + ocapacity_ * sizeof(NRNMPI_Spike)); hoc_malchk(); } - spikeout_[i].gid = gid; - spikeout_[i].spiketime = firetime; + spikeout[i].gid = gid; + spikeout[i].spiketime = firetime; } else { - spbufout_->gid[i] = gid; - spbufout_->spiketime[i] = firetime; + spbufout->gid[i] = gid; + spbufout->spiketime[i] = firetime; } #endif } @@ -226,7 +238,6 @@ void nrn_spike_exchange_init() { return; } alloc_mpi_space(); - // printf("nrnmpi_use=%d active=%d\n", nrnmpi_use, active_); usable_mindelay_ = mindelay_; #if NRN_MULTISEND if (use_multisend_ && n_multisend_interval == 2) { @@ -268,20 +279,22 @@ void nrn_spike_exchange_init() { npe_[i].send(t, net_cvode_instance, nrn_threads + i); } #if NRNMPI - if (use_compress_) { - idxout_ = 2; - t_exchange_ = t; - dt1_ = rev_dt; - usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt; - assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) < 255); - } else { + if (corenrn_param.mpi_enable) { + if (use_compress_) { + idxout_ = 2; + t_exchange_ = t; + dt1_ = rev_dt; + usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt; + assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) < 255); + } else { #if nrn_spikebuf_size > 0 - if (spbufout_) { - spbufout_->nspike = 0; - } + if (spbufout) { + spbufout->nspike = 0; + } #endif + } + nout = 0; } - nout_ = 0; #endif // NRNMPI // if (nrnmpi_myid == 0){printf("usable_mindelay_ = %g\n", usable_mindelay_);} } @@ -306,48 +319,49 @@ void nrn_spike_exchange(NrnThread* nt) { #endif #if nrn_spikebuf_size > 0 - spbufout_->nspike = nout_; + spbufout->nspike = nout; #endif double wt = nrn_wtime(); - int n = nrnmpi_spike_exchange(); + int n = nrnmpi_spike_exchange( + nrnmpi_nin_, spikeout, icapacity, spikein, ovfl, nout, spbufout, spbufin); wt_ = nrn_wtime() - wt; wt = nrn_wtime(); #if TBUFSIZE - tbuf_[itbuf_++] = (unsigned long) nout_; + tbuf_[itbuf_++] = (unsigned long) nout; tbuf_[itbuf_++] = (unsigned long) n; #endif errno = 0; // if (n > 0) { - // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n); + // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout, n); //} - nout_ = 0; + nout = 
0;
     if (n == 0) {
         return;
     }
 
 #if nrn_spikebuf_size > 0
     for (int i = 0; i < nrnmpi_numprocs; ++i) {
-        int nn = spbufin_[i].nspike;
+        int nn = spbufin[i].nspike;
         if (nn > nrn_spikebuf_size) {
             nn = nrn_spikebuf_size;
         }
         for (int j = 0; j < nn; ++j) {
-            auto gid2in_it = gid2in.find(spbufin_[i].gid[j]);
+            auto gid2in_it = gid2in.find(spbufin[i].gid[j]);
             if (gid2in_it != gid2in.end()) {
                 InputPreSyn* ps = gid2in_it->second;
-                ps->send(spbufin_[i].spiketime[j], net_cvode_instance, nt);
+                ps->send(spbufin[i].spiketime[j], net_cvode_instance, nt);
             }
         }
     }
-    n = ovfl_;
+    n = ovfl;
 #endif  // nrn_spikebuf_size > 0
     for (int i = 0; i < n; ++i) {
-        auto gid2in_it = gid2in.find(spikein_[i].gid);
+        auto gid2in_it = gid2in.find(spikein[i].gid);
         if (gid2in_it != gid2in.end()) {
             InputPreSyn* ps = gid2in_it->second;
-            ps->send(spikein_[i].spiketime, net_cvode_instance, nt);
+            ps->send(spikein[i].spiketime, net_cvode_instance, nt);
         }
     }
     wt1_ = nrn_wtime() - wt;
@@ -361,23 +375,32 @@ void nrn_spike_exchange_compressed(NrnThread* nt) {
     nrnmpi_barrier();
 #endif
 
-    assert(nout_ < 0x10000);
-    spfixout_[1] = (unsigned char) (nout_ & 0xff);
-    spfixout_[0] = (unsigned char) (nout_ >> 8);
+    assert(nout < 0x10000);
+    spikeout_fixed[1] = (unsigned char) (nout & 0xff);
+    spikeout_fixed[0] = (unsigned char) (nout >> 8);
     double wt = nrn_wtime();
-    int n = nrnmpi_spike_exchange_compressed();
+
+    int n = nrnmpi_spike_exchange_compressed(localgid_size_,
+                                             spfixin_ovfl_,
+                                             ag_send_nspike,
+                                             nrnmpi_nin_,
+                                             ovfl_capacity,
+                                             spikeout_fixed,
+                                             ag_send_size,
+                                             spikein_fixed,
+                                             ovfl);
     wt_ = nrn_wtime() - wt;
     wt = nrn_wtime();
 #if TBUFSIZE
-    tbuf_[itbuf_++] = (unsigned long) nout_;
+    tbuf_[itbuf_++] = (unsigned long) nout;
     tbuf_[itbuf_++] = (unsigned long) n;
 #endif
     errno = 0;
     // if (n > 0) {
-    //  printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n);
+    //  printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout, n);
     //}
-    nout_ = 0;
+    nout = 0;
     idxout_ = 2;
     if (n == 0) {
         t_exchange_ = nrn_threads->_t;
@@ -387,25 +410,25 @@
         int idxov = 0;
         for (int i = 0; i < nrnmpi_numprocs; ++i) {
             int j, nnn;
-            int nn = nin_[i];
+            int nn = nrnmpi_nin_[i];
             if (nn) {
                 if (i == nrnmpi_myid) {
                     // skip but may need to increment idxov.
-                    if (nn > ag_send_nspike_) {
-                        idxov += (nn - ag_send_nspike_) * (1 + localgid_size_);
+                    if (nn > ag_send_nspike) {
+                        idxov += (nn - ag_send_nspike) * (1 + localgid_size_);
                     }
                     continue;
                 }
                 std::map<int, InputPreSyn*> gps = localmaps[i];
-                if (nn > ag_send_nspike_) {
-                    nnn = ag_send_nspike_;
+                if (nn > ag_send_nspike) {
+                    nnn = ag_send_nspike;
                 } else {
                     nnn = nn;
                 }
-                int idx = 2 + i * ag_send_size_;
+                int idx = 2 + i * ag_send_size;
                 for (j = 0; j < nnn; ++j) {
                     // order is (firetime,gid) pairs.
-                    double firetime = spfixin_[idx++] * dt + t_exchange_;
-                    int lgid = (int) spfixin_[idx];
+                    double firetime = spikein_fixed[idx++] * dt + t_exchange_;
+                    int lgid = (int) spikein_fixed[idx];
                     idx += localgid_size_;
                     auto gid2in_it = gps.find(lgid);
                     if (gid2in_it != gps.end()) {
@@ -427,15 +450,15 @@
             }
         }
     } else {
         for (int i = 0; i < nrnmpi_numprocs; ++i) {
-            int nn = nin_[i];
-            if (nn > ag_send_nspike_) {
-                nn = ag_send_nspike_;
+            int nn = nrnmpi_nin_[i];
+            if (nn > ag_send_nspike) {
+                nn = ag_send_nspike;
             }
-            int idx = 2 + i * ag_send_size_;
+            int idx = 2 + i * ag_send_size;
             for (int j = 0; j < nn; ++j) {
                 // order is (firetime,gid) pairs.
- double firetime = spfixin_[idx++] * dt + t_exchange_; - int gid = spupk(spfixin_ + idx); + double firetime = spikein_fixed[idx++] * dt + t_exchange_; + int gid = spupk(spikein_fixed + idx); idx += localgid_size_; auto gid2in_it = gid2in.find(gid); if (gid2in_it != gid2in.end()) { @@ -444,7 +467,7 @@ void nrn_spike_exchange_compressed(NrnThread* nt) { } } } - n = ovfl_; + n = ovfl; int idx = 0; for (int i = 0; i < n; ++i) { double firetime = spfixin_ovfl_[idx++] * dt + t_exchange_; @@ -568,29 +591,33 @@ void BBS_netpar_solve(double tstop) { double time = nrn_wtime(); #if NRNMPI - tstopunset; - double mt = dt; - double md = mindelay_ - 1e-10; - if (md < mt) { - if (nrnmpi_myid == 0) { - hoc_execerror("mindelay is 0", "(or less than dt for fixed step method)"); - } else { - return; + if (corenrn_param.mpi_enable) { + tstopunset; + double mt = dt; + double md = mindelay_ - 1e-10; + if (md < mt) { + if (nrnmpi_myid == 0) { + hoc_execerror("mindelay is 0", "(or less than dt for fixed step method)"); + } else { + return; + } } - } - nrn_timeout(timeout_); - ncs2nrn_integrate(tstop * (1. + 1e-11)); - nrn_spike_exchange(nrn_threads); - nrn_timeout(0); - if (!npe_.empty()) { - npe_[0].wx_ = npe_[0].ws_ = 0.; - }; - // printf("%d netpar_solve exit t=%g tstop=%g mindelay_=%g\n",nrnmpi_myid, t, tstop, mindelay_); - nrnmpi_barrier(); -#else // not NRNMPI - ncs2nrn_integrate(tstop); + nrn_timeout(timeout_); + ncs2nrn_integrate(tstop * (1. + 1e-11)); + nrn_spike_exchange(nrn_threads); + nrn_timeout(0); + if (!npe_.empty()) { + npe_[0].wx_ = npe_[0].ws_ = 0.; + }; + // printf("%d netpar_solve exit t=%g tstop=%g mindelay_=%g\n",nrnmpi_myid, t, tstop, + // mindelay_); + nrnmpi_barrier(); + } else #endif + { + ncs2nrn_integrate(tstop); + } tstopunset; if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { @@ -650,24 +677,26 @@ double set_mindelay(double maxdelay) { } #if NRNMPI - if (nrnmpi_use) { + if (corenrn_param.mpi_enable) { active_ = true; - } - if (use_compress_) { - if (mindelay / dt > 255) { - mindelay = 255 * dt; + if (use_compress_) { + if (mindelay / dt > 255) { + mindelay = 255 * dt; + } } - } - // printf("%d netpar_mindelay local %g now calling nrnmpi_mindelay\n", nrnmpi_myid, mindelay); - // double st = time(); - mindelay_ = nrnmpi_dbl_allmin(mindelay); - // add_wait_time(st); - // printf("%d local min=%g global min=%g\n", nrnmpi_myid, mindelay, mindelay_); - errno = 0; -#else - mindelay_ = mindelay; + // printf("%d netpar_mindelay local %g now calling nrnmpi_mindelay\n", nrnmpi_myid, + // mindelay); + // double st = time(); + mindelay_ = nrnmpi_dbl_allmin(mindelay); + // add_wait_time(st); + // printf("%d local min=%g global min=%g\n", nrnmpi_myid, mindelay, mindelay_); + errno = 0; + } else #endif // NRNMPI + { + mindelay_ = mindelay; + } return mindelay_; } @@ -708,61 +737,62 @@ two phase multisend distributes the injection. 
int nrnmpi_spike_compress(int nspike, bool gid_compress, int xchng_meth) { #if NRNMPI - if (nrnmpi_numprocs < 2) { - return 0; - } + if (corenrn_param.mpi_enable) { #if NRN_MULTISEND - if (xchng_meth > 0) { - use_multisend_ = 1; - return 0; - } -#endif - nrn_assert(xchng_meth == 0); - if (nspike >= 0) { - ag_send_nspike_ = 0; - if (spfixout_) { - free(spfixout_); - spfixout_ = 0; - } - if (spfixin_) { - free(spfixin_); - spfixin_ = 0; + if (xchng_meth > 0) { + use_multisend_ = 1; + return 0; } - if (spfixin_ovfl_) { - free(spfixin_ovfl_); - spfixin_ovfl_ = 0; - } - localmaps.clear(); - } - if (nspike == 0) { // turn off - use_compress_ = false; - nrn_use_localgid_ = false; - } else if (nspike > 0) { // turn on - use_compress_ = true; - ag_send_nspike_ = nspike; - nrn_use_localgid_ = false; - if (gid_compress) { - // we can only do this after everything is set up - mk_localgid_rep(); - if (!nrn_use_localgid_ && nrnmpi_myid == 0) { - printf( - "Notice: gid compression did not succeed. Probably more than 255 cells on one " - "cpu.\n"); +#endif + nrn_assert(xchng_meth == 0); + if (nspike >= 0) { + ag_send_nspike = 0; + if (spikeout_fixed) { + free(spikeout_fixed); + spikeout_fixed = nullptr; + } + if (spikein_fixed) { + free(spikein_fixed); + spikein_fixed = nullptr; } + if (spfixin_ovfl_) { + free(spfixin_ovfl_); + spfixin_ovfl_ = nullptr; + } + localmaps.clear(); } - if (!nrn_use_localgid_) { - localgid_size_ = sizeof(unsigned int); + if (nspike == 0) { // turn off + use_compress_ = false; + nrn_use_localgid_ = false; + } else if (nspike > 0) { // turn on + use_compress_ = true; + ag_send_nspike = nspike; + nrn_use_localgid_ = false; + if (gid_compress) { + // we can only do this after everything is set up + mk_localgid_rep(); + if (!nrn_use_localgid_ && nrnmpi_myid == 0) { + printf( + "Notice: gid compression did not succeed. Probably more than 255 cells on " + "one " + "cpu.\n"); + } + } + if (!nrn_use_localgid_) { + localgid_size_ = sizeof(unsigned int); + } + ag_send_size = 2 + ag_send_nspike * (1 + localgid_size_); + spfixout_capacity_ = ag_send_size + 50 * (1 + localgid_size_); + spikeout_fixed = (unsigned char*) emalloc(spfixout_capacity_); + spikein_fixed = (unsigned char*) emalloc(nrnmpi_numprocs * ag_send_size); + ovfl_capacity = 100; + spfixin_ovfl_ = (unsigned char*) emalloc(ovfl_capacity * (1 + localgid_size_)); } - ag_send_size_ = 2 + ag_send_nspike_ * (1 + localgid_size_); - spfixout_capacity_ = ag_send_size_ + 50 * (1 + localgid_size_); - spfixout_ = (unsigned char*) emalloc(spfixout_capacity_); - spfixin_ = (unsigned char*) emalloc(nrnmpi_numprocs * ag_send_size_); - ovfl_capacity_ = 100; - spfixin_ovfl_ = (unsigned char*) emalloc(ovfl_capacity_ * (1 + localgid_size_)); - } - return ag_send_nspike_; -#else - return 0; + return ag_send_nspike; + } else #endif + { + return 0; + } } } // namespace coreneuron diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index 0a7c60d26..e74d866ce 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -9,7 +9,9 @@ #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/network/partrans.hpp" +#include "coreneuron/apps/corenrn_parameters.hpp" // This is the computational code for src->target transfer (e.g. gap junction) // simulation. 
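The partrans.cpp hunk below applies the guard idiom this patch uses throughout: MPI call sites are compiled only when NRNMPI is defined, and executed only when --mpi was given, i.e. corenrn_param.mpi_enable is true. A self-contained sketch of that control flow (the function and variable names here are illustrative stand-ins, not taken from the patch):

    // The `else` pairs with the `if` only when NRNMPI is defined, so the same
    // brace block serves both the MPI-disabled build and the run without --mpi.
    double allmin_over_ranks(double x);  // stand-in for a wrapper such as nrnmpi_dbl_allmin

    double global_min_sketch(double local_value, bool mpi_enable) {
    #if NRNMPI
        if (mpi_enable) {  // --mpi given on the command line
            return allmin_over_ranks(local_value);
        } else
    #endif
        {
            return local_value;  // serial fallback
        }
    }

Note this replaces the older `nrnmpi_numprocs > 1` tests: with dynamic loading, the wrapper must not be called at all unless the MPI library was actually loaded, even in a single-rank run.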
@@ -86,7 +88,7 @@ void nrnmpi_v_transfer() { // transfer int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; #if NRNMPI - if (nrnmpi_numprocs > 1) { // otherwise insrc_buf_ == outsrc_buf_ + if (corenrn_param.mpi_enable) { // otherwise insrc_buf_ == outsrc_buf_ nrnmpi_barrier(); nrnmpi_dbl_alltoallv( outsrc_buf_, outsrccnt_, outsrcdspl_, insrc_buf_, insrccnt_, insrcdspl_); diff --git a/coreneuron/network/partrans_setup.cpp b/coreneuron/network/partrans_setup.cpp index b78a689c6..cf31e16ad 100644 --- a/coreneuron/network/partrans_setup.cpp +++ b/coreneuron/network/partrans_setup.cpp @@ -13,6 +13,7 @@ #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/network/partrans.hpp" #include "coreneuron/nrniv/nrniv_decl.h" diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 249c89bd3..db077601e 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -25,8 +25,6 @@ #include "coreneuron/io/nrn2core_direct.h" namespace coreneuron { - -extern corenrn_parameters corenrn_param; static void* nrn_fixed_step_thread(NrnThread*); static void* nrn_fixed_step_group_thread(NrnThread*, int, int, int&); diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp index e6c41f629..c108e2431 100644 --- a/coreneuron/sim/multicore.hpp +++ b/coreneuron/sim/multicore.hpp @@ -12,6 +12,7 @@ #include "coreneuron/mechanism/membfunc.hpp" #include "coreneuron/utils/memory.h" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" #include "coreneuron/io/reports/nrnreport.hpp" #include #include diff --git a/coreneuron/utils/memory_utils.cpp b/coreneuron/utils/memory_utils.cpp index bb8f989d1..bb54c1a4f 100644 --- a/coreneuron/utils/memory_utils.cpp +++ b/coreneuron/utils/memory_utils.cpp @@ -26,6 +26,8 @@ #include #include "coreneuron/utils/memory_utils.h" #include "coreneuron/mpi/nrnmpi.h" +#include "coreneuron/mpi/core/nrnmpi.hpp" +#include "coreneuron/apps/corenrn_parameters.hpp" #if defined(__APPLE__) && defined(__MACH__) #include @@ -34,7 +36,6 @@ #endif namespace coreneuron { - double nrn_mallinfo(void) { // -ve mem usage for non-supported platforms double mbs = -1.0; @@ -79,12 +80,15 @@ void report_mem_usage(const char* message, bool all_ranks) { /* @todo: avoid three all reduce class */ #if NRNMPI - mem_avg = nrnmpi_dbl_allreduce(cur_mem, 1) / nrnmpi_numprocs; - mem_max = nrnmpi_dbl_allreduce(cur_mem, 2); - mem_min = nrnmpi_dbl_allreduce(cur_mem, 3); -#else - mem_avg = mem_max = mem_min = cur_mem; + if (corenrn_param.mpi_enable) { + mem_avg = nrnmpi_dbl_allreduce(cur_mem, 1) / nrnmpi_numprocs; + mem_max = nrnmpi_dbl_allreduce(cur_mem, 2); + mem_min = nrnmpi_dbl_allreduce(cur_mem, 3); + } else #endif + { + mem_avg = mem_max = mem_min = cur_mem; + } // all ranks prints information if all_ranks is true if (all_ranks) { diff --git a/coreneuron/utils/nrn_stats.cpp b/coreneuron/utils/nrn_stats.cpp index 0e8b08e61..bc60c1cd7 100644 --- a/coreneuron/utils/nrn_stats.cpp +++ b/coreneuron/utils/nrn_stats.cpp @@ -24,8 +24,6 @@ #include "coreneuron/network/partrans.hpp" #include "coreneuron/io/output_spikes.hpp" namespace coreneuron { -extern corenrn_parameters corenrn_param; - const int NUM_STATS = 13; enum event_type { enq = 0, spike, ite }; @@ -58,11 +56,14 @@ void report_cell_stats(void) { stat_array[6] = spikevec_positive_gid_size; // number of non-negative gid spikes #if NRNMPI - nrnmpi_long_allreduce_vec(stat_array, 
gstat_array, NUM_STATS, 1);
-#else
-    assert(sizeof(stat_array) == sizeof(gstat_array));
-    memcpy(gstat_array, stat_array, sizeof(stat_array));
+    if (corenrn_param.mpi_enable) {
+        nrnmpi_long_allreduce_vec(stat_array, gstat_array, NUM_STATS, 1);
+    } else
 #endif
+    {
+        assert(sizeof(stat_array) == sizeof(gstat_array));
+        std::memcpy(gstat_array, stat_array, sizeof(stat_array));
+    }
 
     if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
         printf("\n\n Simulation Statistics\n");
diff --git a/coreneuron/utils/nrnoc_aux.cpp b/coreneuron/utils/nrnoc_aux.cpp
index 1efc7d395..a058e9b9f 100644
--- a/coreneuron/utils/nrnoc_aux.cpp
+++ b/coreneuron/utils/nrnoc_aux.cpp
@@ -13,6 +13,7 @@
 #include "coreneuron/mpi/nrnmpi.h"
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/utils/nrnoc_aux.hpp"
+#include "coreneuron/apps/corenrn_parameters.hpp"
 
 namespace coreneuron {
 bool stoprun;
@@ -29,7 +30,9 @@ char* pnt_name(Point_process* pnt) {
 
 void nrn_exit(int err) {
 #if NRNMPI
-    nrnmpi_finalize();
+    if (corenrn_param.mpi_enable) {
+        nrnmpi_finalize();
+    }
 #endif
     exit(err);
 }
diff --git a/coreneuron/utils/nrntimeout.cpp b/coreneuron/utils/nrntimeout.cpp
index f69317aaf..02b82f5d4 100644
--- a/coreneuron/utils/nrntimeout.cpp
+++ b/coreneuron/utils/nrntimeout.cpp
@@ -9,6 +9,7 @@
 #include "coreneuron/nrnconf.h"
 #include "coreneuron/sim/multicore.hpp"
 #include "coreneuron/mpi/nrnmpi.h"
+#include "coreneuron/utils/utils.hpp"
 
 #if NRNMPI
 
diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu
index 0ee863f8f..cac8b4967 100644
--- a/coreneuron/utils/randoms/nrnran123.cu
+++ b/coreneuron/utils/randoms/nrnran123.cu
@@ -5,6 +5,7 @@
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
+#include "coreneuron/mpi/core/nrnmpi.hpp"
 #include "coreneuron/utils/memory.h"
 #include "coreneuron/utils/nrnmutdec.h"
 #include "coreneuron/utils/randoms/nrnran123.h"
diff --git a/coreneuron/utils/utils.cpp b/coreneuron/utils/utils.cpp
new file mode 100644
index 000000000..9196ca9a8
--- /dev/null
+++ b/coreneuron/utils/utils.cpp
@@ -0,0 +1,38 @@
+#include <sys/time.h>
+#include "utils.hpp"
+#include "coreneuron/mpi/nrnmpi.h"
+#include "coreneuron/mpi/core/nrnmpi.hpp"
+#include "coreneuron/apps/corenrn_parameters.hpp"
+
+namespace coreneuron {
+void nrn_abort(int errcode) {
+#if NRNMPI
+    if (corenrn_param.mpi_enable && nrnmpi_initialized()) {
+        nrnmpi_abort(errcode);
+    } else
+#endif
+    {
+        abort();
+    }
+}
+
+void nrn_fatal_error(const char* msg) {
+    if (nrnmpi_myid == 0) {
+        printf("%s\n", msg);
+    }
+    nrn_abort(-1);
+}
+
+double nrn_wtime() {
+#if NRNMPI
+    if (corenrn_param.mpi_enable) {
+        return nrnmpi_wtime();
+    } else
+#endif
+    {
+        struct timeval time1;
+        gettimeofday(&time1, nullptr);
+        return (time1.tv_sec + time1.tv_usec / 1.e6);
+    }
+}
+}  // namespace coreneuron
diff --git a/coreneuron/utils/utils.hpp b/coreneuron/utils/utils.hpp
new file mode 100644
index 000000000..5a56e0e16
--- /dev/null
+++ b/coreneuron/utils/utils.hpp
@@ -0,0 +1,15 @@
+/*
+# =============================================================================
+# Copyright (c) 2021 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================.
+*/ + +#pragma once + +namespace coreneuron { +extern void nrn_abort(int errcode); +extern void nrn_fatal_error(const char* msg); +extern double nrn_wtime(void); +} // namespace coreneuron diff --git a/extra/instrumentation.tau b/extra/instrumentation.tau index b2dbde064..a0aa63ded 100644 --- a/extra/instrumentation.tau +++ b/extra/instrumentation.tau @@ -3,7 +3,7 @@ BEGIN_INCLUDE_LIST int coreneuron::main(int, char **, char **) int coreneuron::nrnmpi_bgp_conserve(int, int) int coreneuron::nrnmpi_bgp_single_advance(NRNMPI_Spike *) - int coreneuron::nrnmpi_spike_exchange() + int coreneuron::nrnmpi_spike_exchange(int*, NRNMPI_Spike*) int main(int, char **, char **) size_t nrnbbcore_write() void coreneuron::*nrn_fixed_step_group_thread(coreneuron::NrnThread *) @@ -27,7 +27,6 @@ BEGIN_INCLUDE_LIST void coreneuron::deliver_net_events(coreneuron::NrnThread *) void coreneuron::determine_inputpresyn() void coreneuron::finitialize(void) - void coreneuron::make_spike_type() void coreneuron::ncs2nrn_integrate(double) void coreneuron::nonvint(coreneuron::NrnThread *) void coreneuron::nrn2ncs_outputevent(int, double) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 393ce4923..a363b40d8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -29,7 +29,11 @@ if(Boost_FOUND) add_subdirectory(unit/interleave_info) add_subdirectory(unit/alignment) add_subdirectory(unit/queueing) - add_subdirectory(unit/lfp) + # lfp test uses nrnmpi_* wrappers but does not load the dynamic MPI library TODO: re-enable + # after NEURON and CoreNEURON dynamic MPI are merged + if(NOT CORENRN_ENABLE_DYNAMIC_MPI) + add_subdirectory(unit/lfp) + endif() endif() message(STATUS "Boost found, unit tests enabled") else() diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 5ec9a3ba0..cb3a3d905 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -87,6 +87,9 @@ endforeach() file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/ring/out.dat.ref" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/ring_spike_buffer/") +# names of all tests added +set(CORENRN_TEST_NAMES "") + # Configure test scripts foreach(args_line ${TEST_CASES_WITH_ARGS}) string(REPLACE "!" 
";" string_line ${args_line}) @@ -109,6 +112,7 @@ foreach(args_line ${TEST_CASES_WITH_ARGS}) COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}/integration_test.sh WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}") set_tests_properties(${TEST_NAME}_TEST PROPERTIES PROCESSORS ${test_num_processors}) + list(APPEND CORENRN_TEST_NAMES ${TEST_NAME}_TEST) endforeach() foreach(args_line ${NEGATIVE_TEST_CASES}) @@ -127,6 +131,7 @@ foreach(args_line ${NEGATIVE_TEST_CASES}) COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}/negative_integration_test.sh WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}") set_tests_properties(${TEST_NAME}_TEST PROPERTIES PROCESSORS ${test_num_processors}) + list(APPEND CORENRN_TEST_NAMES ${TEST_NAME}_TEST) endforeach() if(CORENRN_ENABLE_REPORTING) @@ -141,5 +146,13 @@ if(CORENRN_ENABLE_REPORTING) NAME ${SIM_NAME} COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}/reporting_test.sh WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}") + list(APPEND CORENRN_TEST_NAMES ${SIM_NAME}) endforeach() endif() + +# DYLD_LIBRARY_PATH.LD_LIBRARY_PATH for dynamic MPI build +if(CORENRN_ENABLE_DYNAMIC_MPI) + set(TEST_ENV LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH} + DYLD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/lib:$ENV{DYLD_LIBRARY_PATH}) + set_tests_properties(${CORENRN_TEST_NAMES} PROPERTIES ENVIRONMENT "${TEST_ENV}") +endif()