From 27058a96364b2b75cde17ac56bd1ec01aaa4fc1e Mon Sep 17 00:00:00 2001
From: Marcel Breyer <marcel.breyer@ipvs.uni-stuttgart.de>
Date: Tue, 10 Dec 2024 17:56:58 +0100
Subject: [PATCH] Implement general new MPI communicator API. Note: MPI isn't
 used, i.e., each MPI rank does the classification all for itself!

---
 include/plssvm/backends/CUDA/csvm.hpp         |  51 ++-
 include/plssvm/backends/gpu_csvm.hpp          |  10 +-
 include/plssvm/csvm.hpp                       |  31 +-
 include/plssvm/data_set.hpp                   | 309 +++++++++++++++---
 .../plssvm/detail/cmd/data_set_variants.hpp   |  38 ++-
 include/plssvm/detail/cmd/parser_predict.hpp  |   4 +-
 include/plssvm/detail/cmd/parser_train.hpp    |   4 +-
 .../plssvm/detail/io/libsvm_model_parsing.hpp |  10 +-
 include/plssvm/detail/logging.hpp             |  25 +-
 .../logging_without_performance_tracking.hpp  |  21 ++
 include/plssvm/model.hpp                      |  31 +-
 src/main_predict.cpp                          |  72 ++--
 src/main_train.cpp                            |  59 ++--
 src/plssvm/backends/CUDA/csvm.cu              |  18 +-
 src/plssvm/csvm.cpp                           |   6 +
 src/plssvm/detail/cmd/parser_predict.cpp      |  46 ++-
 src/plssvm/detail/cmd/parser_train.cpp        |  60 +++-
 17 files changed, 646 insertions(+), 149 deletions(-)

diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp
index 5e0eed30d..565648e7a 100644
--- a/include/plssvm/backends/CUDA/csvm.hpp
+++ b/include/plssvm/backends/CUDA/csvm.hpp
@@ -21,6 +21,7 @@
 #include "plssvm/csvm.hpp"                                // plssvm::detail::csvm_backend_exists
 #include "plssvm/detail/memory_size.hpp"                  // plssvm::detail::memory_size
 #include "plssvm/detail/type_traits.hpp"                  // PLSSVM_REQUIRES
+#include "plssvm/mpi/communicator.hpp"                    // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                           // plssvm::parameter
 #include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
 
@@ -59,6 +60,16 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
      * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
      */
     explicit csvm(parameter params = {});
+    /**
+     * @brief Construct a new C-SVM using the CUDA backend with the parameters given through @p params.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] params struct encapsulating all possible parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::cuda::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::gpu_nvidia
+     * @throws plssvm::cuda::backend_exception if the plssvm::target_platform::gpu_nvidia target isn't available
+     * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
+     */
+    explicit csvm(mpi::communicator comm, parameter params = {});
     /**
      * @brief Construct a new C-SVM using the CUDA backend on the @p target platform with the parameters given through @p params.
      * @param[in] target the target platform used for this C-SVM
@@ -69,6 +80,17 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
      * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
      */
     explicit csvm(target_platform target, parameter params = {});
+    /**
+     * @brief Construct a new C-SVM using the CUDA backend on the @p target platform with the parameters given through @p params.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] target the target platform used for this C-SVM
+     * @param[in] params struct encapsulating all possible SVM parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::cuda::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::gpu_nvidia
+     * @throws plssvm::cuda::backend_exception if the plssvm::target_platform::gpu_nvidia target isn't available
+     * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
+     */
+    explicit csvm(mpi::communicator comm, target_platform target, parameter params = {});
 
     /**
      * @brief Construct a new C-SVM using the CUDA backend and the optionally provided @p named_args.
@@ -80,7 +102,19 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
      */
     template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
     explicit csvm(Args &&...named_args) :
-        csvm{ plssvm::target_platform::automatic, std::forward<Args>(named_args)... } { }
+        csvm{ mpi::communicator{}, std::forward<Args>(named_args)... } { }
+    /**
+     * @brief Construct a new C-SVM using the CUDA backend and the optionally provided @p named_args.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] named_args the additional optional named arguments
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::cuda::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::gpu_nvidia
+     * @throws plssvm::cuda::backend_exception if the plssvm::target_platform::gpu_nvidia target isn't available
+     * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
+     */
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    explicit csvm(mpi::communicator comm, Args &&...named_args) :
+        csvm{ std::move(comm), plssvm::target_platform::automatic, std::forward<Args>(named_args)... } { }
 
     /**
      * @brief Construct a new C-SVM using the CUDA backend on the @p target platform and the optionally provided @p named_args.
@@ -93,7 +127,20 @@ class csvm : public ::plssvm::detail::gpu_csvm<detail::device_ptr, int, detail::
      */
     template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
     explicit csvm(const target_platform target, Args &&...named_args) :
-        base_type{ std::forward<Args>(named_args)... } {
+        csvm{ mpi::communicator{}, target, std::forward<Args>(named_args)... } { }
+    /**
+     * @brief Construct a new C-SVM using the CUDA backend on the @p target platform and the optionally provided @p named_args.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] target the target platform used for this C-SVM
+     * @param[in] named_args the additional optional named-parameters
+     * @throws plssvm::exception all exceptions thrown in the base class constructor
+     * @throws plssvm::cuda::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::gpu_nvidia
+     * @throws plssvm::cuda::backend_exception if the plssvm::target_platform::gpu_nvidia target isn't available
+     * @throws plssvm::cuda::backend_exception if no CUDA capable devices could be found
+     */
+    template <typename... Args, PLSSVM_REQUIRES(::plssvm::detail::has_only_parameter_named_args_v<Args...>)>
+    explicit csvm(mpi::communicator comm, const target_platform target, Args &&...named_args) :
+        base_type{ std::move(comm), std::forward<Args>(named_args)... } {
         this->init(target);
     }
 
diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp
index cf2641b38..6634491d1 100644
--- a/include/plssvm/backends/gpu_csvm.hpp
+++ b/include/plssvm/backends/gpu_csvm.hpp
@@ -21,6 +21,7 @@
 #include "plssvm/detail/move_only_any.hpp"      // plssvm::detail::{move_only_any, move_only_any_cast}
 #include "plssvm/kernel_function_types.hpp"     // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                    // plssvm::aos_matrix, plssvm::soa_matrix
+#include "plssvm/mpi/communicator.hpp"          // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                 // plssvm::parameter
 #include "plssvm/shape.hpp"                     // plssvm::shape
 #include "plssvm/solver_types.hpp"              // plssvm::solver_type
@@ -56,17 +57,18 @@ class gpu_csvm : public ::plssvm::csvm {
     /**
      * @copydoc plssvm::csvm::csvm()
      */
-    explicit gpu_csvm(parameter params = {}) :
-        ::plssvm::csvm{ params } { }
+    explicit gpu_csvm(mpi::communicator comm, parameter params = {}) :
+        ::plssvm::csvm{ std::move(comm), params } { }
 
     /**
      * @brief Construct a C-SVM forwarding all parameters @p args to the plssvm::parameter constructor.
      * @tparam Args the type of the (named-)parameters
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
      * @param[in] args the parameters used to construct a plssvm::parameter
      */
     template <typename... Args>
-    explicit gpu_csvm(Args &&...args) :
-        ::plssvm::csvm{ std::forward<Args>(args)... } { }
+    explicit gpu_csvm(mpi::communicator comm, Args &&...args) :
+        ::plssvm::csvm{ std::move(comm), std::forward<Args>(args)... } { }
 
     /**
      * @copydoc plssvm::csvm::csvm(const plssvm::csvm &)
diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp
index 3e0ea2472..edc145f49 100644
--- a/include/plssvm/csvm.hpp
+++ b/include/plssvm/csvm.hpp
@@ -31,6 +31,7 @@
 #include "plssvm/kernel_function_types.hpp"                // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                               // plssvm::aos_matrix
 #include "plssvm/model.hpp"                                // plssvm::model
+#include "plssvm/mpi/communicator.hpp"                     // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                            // plssvm::parameter
 #include "plssvm/shape.hpp"                                // plssvm::shape
 #include "plssvm/solver_types.hpp"                         // plssvm::solver_type
@@ -69,16 +70,18 @@ class csvm {
     /**
      * @brief Construct a C-SVM using the SVM parameter @p params.
      * @details Uses the default SVM parameter if none are provided.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
      * @param[in] params the SVM parameter
      */
-    explicit csvm(parameter params = {});
+    explicit csvm(mpi::communicator comm, parameter params = {});
     /**
      * @brief Construct a C-SVM forwarding all parameters @p args to the plssvm::parameter constructor.
      * @tparam Args the type of the (named-)parameters
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
      * @param[in] args the parameters used to construct a plssvm::parameter
      */
     template <typename... Args>
-    explicit csvm(Args &&...args);
+    explicit csvm(mpi::communicator comm, Args &&...args);
 
     /**
      * @brief Delete copy-constructor since a CSVM is a move-only type.
@@ -255,6 +258,9 @@ class csvm {
     /// The data distribution on the available devices.
     mutable std::unique_ptr<detail::data_distribution> data_distribution_{};
 
+    /// The used MPI communicator.
+    mpi::communicator comm_{};
+
   protected:  // necessary for tests, would otherwise be private
     /**
      * @brief Perform some sanity checks on the passed SVM parameters.
@@ -311,13 +317,15 @@ class csvm {
     parameter params_{};
 };
 
-inline csvm::csvm(parameter params) :
+inline csvm::csvm(mpi::communicator comm, parameter params) :
+    comm_{ std::move(comm) },
     params_{ params } {
     this->sanity_check_parameter();
 }
 
 template <typename... Args>
-csvm::csvm(Args &&...named_args) :
+csvm::csvm(mpi::communicator comm, Args &&...named_args) :
+    comm_{ std::move(comm) },
     params_{ std::forward<Args>(named_args)... } {
     this->sanity_check_parameter();
 }
@@ -376,6 +384,7 @@ model<label_type> csvm::fit(const data_set<label_type> &data, Args &&...named_ar
     const std::chrono::time_point start_time = std::chrono::steady_clock::now();
 
     detail::log(verbosity_level::full,
+                comm_,
                 "Using {} ({}) as multi-class classification strategy.\n",
                 used_classification,
                 classification_type_to_full_string(used_classification));
@@ -417,6 +426,7 @@ model<label_type> csvm::fit(const data_set<label_type> &data, Args &&...named_ar
         if (num_classes == 2) {
             // special optimization for binary case (no temporary copies necessary)
             detail::log(verbosity_level::full,
+                        comm_,
                         "\nClassifying 0 vs 1 ({} vs {}) (1/1):\n",
                         data.mapping_->get_label_by_mapped_index(0),
                         data.mapping_->get_label_by_mapped_index(1));
@@ -460,6 +470,7 @@ model<label_type> csvm::fit(const data_set<label_type> &data, Args &&...named_ar
 
                     // solve the minimization problem -> note that only a single rhs is present
                     detail::log(verbosity_level::full,
+                                comm_,
                                 "\nClassifying {} vs {} ({} vs {}) ({}/{}):\n",
                                 i,
                                 j,
@@ -486,6 +497,7 @@ model<label_type> csvm::fit(const data_set<label_type> &data, Args &&...named_ar
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "\nLearned the SVM classifier for {} multi-class classification in {}.\n\n",
                 classification_type_to_full_string(used_classification),
                 detail::tracking::tracking_entry{ "cg", "total_runtime", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) });
@@ -804,6 +816,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
 
         // output the necessary information on the console
         detail::log(verbosity_level::full,
+                    comm_,
                     "Determining the solver type based on the available memory:\n"
                     "  - total system memory: {2}\n"
                     "  - usable system memory (with safety margin of min({0} %, {1}): {3}\n"
@@ -842,7 +855,10 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
             // use the explicit solver type
             used_solver = solver_type::cg_explicit;
         } else {
-            detail::log(verbosity_level::full, "Cannot use cg_explicit due to memory constraints on device(s) {}!\n", format_vector(failed_cg_explicit_constraints));
+            detail::log(verbosity_level::full,
+                        comm_,
+                        "Cannot use cg_explicit due to memory constraints on device(s) {}!\n",
+                        format_vector(failed_cg_explicit_constraints));
 
             // check whether there is enough memory available for cg_implicit
             if (const std::vector<std::size_t> failed_cg_implicit_constraints = check_sizes(total_memory_needed_implicit_per_device, usable_device_memory_per_device); failed_cg_implicit_constraints.empty()) {
@@ -865,6 +881,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
 
         // output the maximum memory allocation size per device
         detail::log(verbosity_level::full,
+                    comm_,
                     "  - maximum supported single memory allocation size: {}\n"
                     "  - maximum needed single memory allocation size (cg_explicit): {}\n"
                     "  - maximum needed single memory allocation size (cg_implicit): {}\n",
@@ -881,6 +898,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
             used_solver == solver_type::cg_explicit && !failed_cg_explicit_constraints.empty()) {
             // max mem alloc size constraints not fulfilled
             detail::log(verbosity_level::full,
+                        comm_,
                         "Cannot use cg_explicit due to maximum single memory allocation constraints on device(s) {}! Falling back to cg_implicit.\n",
                         format_vector(failed_cg_explicit_constraints));
             // can't use cg_explicit
@@ -890,6 +908,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
             used_solver == solver_type::cg_implicit && !failed_cg_implicit_constraints.empty()) {
             // can't fulfill maximum single memory allocation size even for cg_implicit
             plssvm::detail::log(verbosity_level::full | verbosity_level::warning,
+                                comm_,
                                 "WARNING: if you are sure that the guaranteed maximum memory allocation size can be safely ignored on your device, "
                                 "this check can be disabled via \"-DPLSSVM_ENFORCE_MAX_MEM_ALLOC_SIZE=OFF\" during the CMake configuration!\n");
             throw kernel_launch_resources{ fmt::format("Can't fulfill maximum single memory allocation constraint for device(s) {} even for the cg_implicit solver!", format_vector(failed_cg_implicit_constraints)) };
@@ -898,6 +917,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
     }
 
     detail::log(verbosity_level::full,
+                comm_,
                 "Using {} as solver for AX=B.\n\n",
                 detail::tracking::tracking_entry{ "solver", "solver_type", used_solver });
 
@@ -926,6 +946,7 @@ std::tuple<aos_matrix<real_type>, std::vector<real_type>, std::vector<unsigned l
 
     if (used_solver != solver_type::cg_implicit) {
         detail::log(verbosity_level::full | verbosity_level::timing,
+                    comm_,
                     "Assembled the kernel matrix in {}.\n",
                     assembly_duration);
     }
diff --git a/include/plssvm/data_set.hpp b/include/plssvm/data_set.hpp
index f344e5bae..c4615ea8e 100644
--- a/include/plssvm/data_set.hpp
+++ b/include/plssvm/data_set.hpp
@@ -27,6 +27,7 @@
 #include "plssvm/exceptions/exceptions.hpp"                // plssvm::data_set_exception
 #include "plssvm/file_format_types.hpp"                    // plssvm::file_format_type
 #include "plssvm/matrix.hpp"                               // plssvm::soa_matrix
+#include "plssvm/mpi/communicator.hpp"                     // plssvm::mpi::communicator
 #include "plssvm/shape.hpp"                                // plssvm::shape
 #include "plssvm/verbosity_levels.hpp"                     // plssvm::verbosity_level
 
@@ -64,6 +65,7 @@ using optional_ref = std::optional<std::reference_wrapper<T>>;
  * @brief Encapsulate all necessary data that is needed for training or predicting using an SVM.
  * @details May or may not contain labels!
  *          Internally, saves all data using [`std::shared_ptr`](https://en.cppreference.com/w/cpp/memory/shared_ptr) to make a plssvm::data_set relatively cheap to copy!
+ * @note Currently, **each** MPI rank loads/stores the whole data set (if MPI is available).
  * @tparam U the label type of the data (must be an arithmetic type or `std::string`; default: `int`)
  */
 template <typename U = int>
@@ -96,6 +98,15 @@ class data_set {
      * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
      */
     explicit data_set(const std::string &filename);
+    /**
+     * @brief Read the data points from the file @p filename.
+     *        Automatically determines the plssvm::file_format_type based on the file extension.
+     * @details If @p filename ends with `.arff` it uses the ARFF parser, otherwise the LIBSVM parser is used.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] filename the file to read the data points from
+     * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
+     */
+    explicit data_set(mpi::communicator comm, const std::string &filename);
     /**
      * @brief Read the data points from the file @p filename assuming that the file is given in the @p plssvm::file_format_type.
      * @param[in] filename the file to read the data points from
@@ -103,6 +114,14 @@ class data_set {
      * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
      */
     data_set(const std::string &filename, file_format_type format);
+    /**
+     * @brief Read the data points from the file @p filename assuming that the file is given in the @p plssvm::file_format_type.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] filename the file to read the data points from
+     * @param[in] format the assumed file format used to parse the data points
+     * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
+     */
+    data_set(mpi::communicator comm, const std::string &filename, file_format_type format);
     /**
      * @brief Read the data points from the file @p filename and scale it using the provided @p scale_parameter.
      *        Automatically determines the plssvm::file_format_type based on the file extension.
@@ -113,6 +132,17 @@ class data_set {
      * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
      */
     data_set(const std::string &filename, scaling scale_parameter);
+    /**
+     * @brief Read the data points from the file @p filename and scale it using the provided @p scale_parameter.
+     *        Automatically determines the plssvm::file_format_type based on the file extension.
+     * @details If @p filename ends with `.arff` it uses the ARFF parser, otherwise the LIBSVM parser is used.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] filename the file to read the data points from
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    data_set(mpi::communicator comm, const std::string &filename, scaling scale_parameter);
     /**
      * @brief Read the data points from the file @p filename assuming that the file is given in the plssvm::file_format_type @p format and
      *        scale it using the provided @p scale_parameter.
@@ -123,6 +153,17 @@ class data_set {
      * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
      */
     data_set(const std::string &filename, file_format_type format, scaling scale_parameter);
+    /**
+     * @brief Read the data points from the file @p filename assuming that the file is given in the plssvm::file_format_type @p format and
+     *        scale it using the provided @p scale_parameter.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] filename the file to read the data points from
+     * @param[in] format the assumed file format used to parse the data points
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::data_set::read_file
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    data_set(mpi::communicator comm, const std::string &filename, file_format_type format, scaling scale_parameter);
 
     /**
      * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix.
@@ -133,6 +174,16 @@ class data_set {
      * @throws plssvm::data_set_exception if any @p data_point has no features
      */
     explicit data_set(const std::vector<std::vector<real_type>> &data_points);
+    /**
+     * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix.
+     * @details Since no labels are provided, this data set may **not** be used in a call to plssvm::csvm::fit!
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     */
+    explicit data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points);
     /**
      * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix and copying the @p labels.
      * @param[in] data_points the data points used in this data set
@@ -143,6 +194,17 @@ class data_set {
      * @throws plssvm::data_set_exception if the number of data points in @p data_points and number of @p labels mismatch
      */
     data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels);
+    /**
+     * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix and copying the @p labels.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] labels the labels used in this data set
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception if the number of data points in @p data_points and number of @p labels mismatch
+     */
+    data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels);
     /**
      * @brief Create a new data set  by converting the provided @p data_points to a plssvm::matrix and scale them using the provided @p scale_parameter.
      * @param[in] data_points the data points used in this data set
@@ -153,6 +215,17 @@ class data_set {
      * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
      */
     data_set(const std::vector<std::vector<real_type>> &data_points, scaling scale_parameter);
+    /**
+     * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix and scale them using the provided @p scale_parameter.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, scaling scale_parameter);
     /**
      * @brief Create a new data set  by converting the provided @p data_points to a plssvm::matrix and copying the @p labels and scale the @p data_points using the provided @p scale_parameter.
      * @param[in] data_points the data points used in this data set
@@ -165,6 +238,19 @@ class data_set {
      * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
      */
     data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels, scaling scale_parameter);
+    /**
+     * @brief Create a new data set by converting the provided @p data_points to a plssvm::matrix and copying the @p labels and scale the @p data_points using the provided @p scale_parameter.
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] labels the labels used in this data set
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception if the number of data points in @p data_points and number of @p labels mismatch
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels, scaling scale_parameter);
 
     /**
      * @brief Create a new data set from the provided @p data_points.
@@ -178,6 +264,19 @@ class data_set {
      */
     template <layout_type layout>
     explicit data_set(const matrix<real_type, layout> &data_points);
+    /**
+     * @brief Create a new data set from the provided @p data_points.
+     * @details Since no labels are provided, this data set may **not** be used in a call to plssvm::csvm::fit!
+     * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
+     * @tparam layout the layout type of the input matrix
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     */
+    template <layout_type layout>
+    explicit data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points);
     /**
      * @brief Create a new data set from the provided @p data_points and @p labels.
      * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
@@ -191,6 +290,20 @@ class data_set {
      */
     template <layout_type layout>
     data_set(const matrix<real_type, layout> &data_points, std::vector<label_type> labels);
+    /**
+     * @brief Create a new data set from the provided @p data_points and @p labels.
+     * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
+     * @tparam layout the layout type of the input matrix
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] labels the labels used in this data set
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception if the number of data points in @p data_points and number of @p labels mismatch
+     */
+    template <layout_type layout>
+    data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, std::vector<label_type> labels);
     /**
      * @brief Create a new data set from the the provided @p data_points and scale them using the provided @p scale_parameter.
      * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
@@ -204,6 +317,20 @@ class data_set {
      */
     template <layout_type layout>
     data_set(const matrix<real_type, layout> &data_points, scaling scale_parameter);
+    /**
+     * @brief Create a new data set from the provided @p data_points and scale them using the provided @p scale_parameter.
+     * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
+     * @tparam layout the layout type of the input matrix
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    template <layout_type layout>
+    data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, scaling scale_parameter);
     /**
      * @brief Create a new data set from the the provided @p data_points and @p labels and scale the @p data_points using the provided @p scale_parameter.
      * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
@@ -219,9 +346,26 @@ class data_set {
      */
     template <layout_type layout>
     data_set(const matrix<real_type, layout> &data_points, std::vector<label_type> labels, scaling scale_parameter);
+    /**
+     * @brief Create a new data set from the provided @p data_points and @p labels and scale the @p data_points using the provided @p scale_parameter.
+     * @note If the provided matrix isn't padded, adds the necessary padding entries automatically.
+     * @tparam layout the layout type of the input matrix
+     * @param[in] comm the used MPI communicator (**note**: currently unused)
+     * @param[in] data_points the data points used in this data set
+     * @param[in] labels the labels used in this data set
+     * @param[in] scale_parameter the parameters used to scale the data set feature values to a given range
+     * @throws plssvm::data_set_exception if the @p data_points vector is empty
+     * @throws plssvm::data_set_exception if the data points in @p data_points have mismatching number of features
+     * @throws plssvm::data_set_exception if any @p data_point has no features
+     * @throws plssvm::data_set_exception if the number of data points in @p data_points and number of @p labels mismatch
+     * @throws plssvm::data_set_exception all exceptions thrown by plssvm::data_set::scale
+     */
+    template <layout_type layout>
+    data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, std::vector<label_type> labels, scaling scale_parameter);
 
     /**
      * @brief Save the data points and potential labels of this data set to the file @p filename using the file @p format type.
+     * @note Only the main MPI rank (traditionally rank 0) saves the whole data set (if MPI is available).
      * @param[in] filename the file to save the data points and labels to
      * @param[in] format the file format
      */
@@ -230,6 +374,7 @@ class data_set {
      * @brief Save the data points and potential labels of this data set to the file @p filename.
      * @details Automatically determines the plssvm::file_format_type based on the file extension.
      *          If the file extension isn't `.arff`, saves the data as `.libsvm` file.
+     * @note Only the main MPI rank (traditionally rank 0) saves the whole data set (if MPI is available).
      * @param[in] filename the file to save the data points and labels to
      */
     void save(const std::string &filename) const;
@@ -295,6 +440,14 @@ class data_set {
      */
     [[nodiscard]] optional_ref<const scaling> scaling_factors() const noexcept;
 
+    /**
+     * @brief Get the MPI communicator associated with this data set; also callable on `const` data sets.
+     * @return a copy of the stored MPI communicator (`[[nodiscard]]`)
+     */
+    [[nodiscard]] mpi::communicator communicator() const noexcept {
+        return comm_;
+    }
+
   private:
     /**
      * @brief Default construct an empty data set.
@@ -332,6 +485,9 @@ class data_set {
     /// The number of features in this data set.
     size_type num_features_{ 0 };
 
+    /// The used MPI communicator.
+    mpi::communicator comm_{};
+
     /// A pointer to the two-dimensional data points.
     std::shared_ptr<soa_matrix<real_type>> data_ptr_{ nullptr };
     /// A pointer to the original labels of this data set; may be `nullptr` if no labels have been provided.
@@ -390,12 +546,14 @@ class data_set<U>::scaling {
      * @throws plssvm::data_set_exception if lower is greater or equal than upper
      */
     scaling(real_type lower, real_type upper);
+    scaling(mpi::communicator comm, real_type lower, real_type upper);  // same as the overload above (throws if lower >= upper), but additionally stores the MPI communicator @p comm
     /**
      * @brief Read the scaling interval and factors from the provided file @p filename.
      * @param[in] filename the filename to read the scaling information from
      * @throws plssvm::invalid_file_format_exception all exceptions thrown by the plssvm::detail::io::parse_scaling_factors function
      */
-    scaling(const std::string &filename);  // can't be explicit due to the data_set_variant
+    scaling(const std::string &filename);                          // can't be explicit due to the data_set_variant
+    scaling(mpi::communicator comm, const std::string &filename);  // can't be explicit due to the data_set_variant
 
     /**
      * @brief Save the scaling factors to the file @p filename.
@@ -408,18 +566,31 @@ class data_set<U>::scaling {
     std::pair<real_type, real_type> scaling_interval{};
     /// The scaling factors for all features.
     std::vector<factors> scaling_factors{};
+
+    /// The used MPI communicator.
+    mpi::communicator comm_{};
 };
 
 template <typename U>
 data_set<U>::scaling::scaling(const real_type lower, const real_type upper) :
-    scaling_interval{ std::make_pair(lower, upper) } {
+    scaling{ mpi::communicator{}, lower, upper } { }
+
+template <typename U>
+data_set<U>::scaling::scaling(mpi::communicator comm, const real_type lower, const real_type upper) :
+    scaling_interval{ std::make_pair(lower, upper) },
+    comm_{ std::move(comm) } {
     if (lower >= upper) {
         throw data_set_exception{ fmt::format("Inconsistent scaling interval specification: lower ({}) must be less than upper ({})!", lower, upper) };
     }
 }
 
 template <typename U>
-data_set<U>::scaling::scaling(const std::string &filename) {
+data_set<U>::scaling::scaling(const std::string &filename) :
+    scaling{ mpi::communicator{}, filename } { }
+
+template <typename U>
+data_set<U>::scaling::scaling(mpi::communicator comm, const std::string &filename) :
+    comm_{ std::move(comm) } {
     // open the file
     detail::io::file_reader reader{ filename };
     reader.read_lines('#');
@@ -437,6 +608,7 @@ void data_set<U>::scaling::save(const std::string &filename) const {
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Write {} scaling factors in {} to the file '{}'.\n",
                 detail::tracking::tracking_entry{ "scaling_factors_write", "num_scaling_factors", scaling_factors.size() },
                 detail::tracking::tracking_entry{ "scaling_factors_write", "time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
@@ -545,21 +717,35 @@ auto data_set<U>::label_mapper::labels() const -> std::vector<label_type> {
 //*************************************************************************************************************************************//
 
 template <typename U>
-data_set<U>::data_set(const std::string &filename) {
+data_set<U>::data_set(const std::string &filename) :
+    data_set{ mpi::communicator{}, filename } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::string &filename) :
+    comm_{ std::move(comm) } {
     // read data set from file
     // if the file doesn't end with .arff, assume a LIBSVM file
     this->read_file(filename, detail::ends_with(filename, ".arff") ? file_format_type::arff : file_format_type::libsvm);
 }
 
 template <typename U>
-data_set<U>::data_set(const std::string &filename, const file_format_type format) {
+data_set<U>::data_set(const std::string &filename, const file_format_type format) :
+    data_set{ mpi::communicator{}, filename, format } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::string &filename, const file_format_type format) :
+    comm_{ std::move(comm) } {
     // read data set from file
     this->read_file(filename, format);
 }
 
 template <typename U>
 data_set<U>::data_set(const std::string &filename, scaling scale_parameter) :
-    data_set{ filename } {
+    data_set{ mpi::communicator{}, filename, std::move(scale_parameter) } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::string &filename, scaling scale_parameter) :
+    data_set{ std::move(comm), filename } {
     // initialize scaling
     scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
     // scale data set
@@ -568,7 +754,11 @@ data_set<U>::data_set(const std::string &filename, scaling scale_parameter) :
 
 template <typename U>
 data_set<U>::data_set(const std::string &filename, file_format_type format, scaling scale_parameter) :
-    data_set{ filename, format } {
+    data_set{ mpi::communicator{}, filename, format, std::move(scale_parameter) } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::string &filename, file_format_type format, scaling scale_parameter) :
+    data_set{ std::move(comm), filename, format } {
     // initialize scaling
     scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
     // scale data set
@@ -577,29 +767,45 @@ data_set<U>::data_set(const std::string &filename, file_format_type format, scal
 
 // clang-format off
 template <typename U>
-data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points) try :
-    data_set{ soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } } } {}
+data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points) :
+    data_set{ mpi::communicator{}, data_points } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points) try :
+    data_set{ std::move(comm), soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } } } {}
     catch (const matrix_exception &e) {
         throw data_set_exception{ e.what() };
     }
 
 template <typename U>
-data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels) try :
-    data_set{ soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(labels) } {}
+data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels) :
+    data_set{ mpi::communicator{}, data_points, std::move(labels) } { }  // labels is a by-value sink parameter: move it on instead of copying
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels) try :
+    data_set{ std::move(comm), soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(labels) } {}
     catch (const matrix_exception &e) {
         throw data_set_exception{ e.what() };
     }
 
 template <typename U>
-data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, scaling scale_parameter) try :
-    data_set{ soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(scale_parameter) } {}
+data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, scaling scale_parameter) :
+    data_set{ mpi::communicator{}, data_points, std::move(scale_parameter) } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, scaling scale_parameter) try :
+    data_set{ std::move(comm), soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(scale_parameter) } {}
     catch (const matrix_exception &e) {
         throw data_set_exception{ e.what() };
     }
 
 template <typename U>
-data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels, scaling scale_parameter) try :
-    data_set{ soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(labels), std::move(scale_parameter) } {}
+data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels, scaling scale_parameter) :
+    data_set{ mpi::communicator{}, data_points, std::move(labels), std::move(scale_parameter) } { }
+
+template <typename U>
+data_set<U>::data_set(mpi::communicator comm, const std::vector<std::vector<real_type>> &data_points, std::vector<label_type> labels, scaling scale_parameter) try :
+    data_set{ std::move(comm), soa_matrix<real_type>{ data_points, shape{ PADDING_SIZE, PADDING_SIZE } }, std::move(labels), std::move(scale_parameter) } {}
     catch (const matrix_exception &e) {
         throw data_set_exception{ e.what() };
     }
@@ -609,8 +815,14 @@ data_set<U>::data_set(const std::vector<std::vector<real_type>> &data_points, st
 template <typename U>
 template <layout_type layout>
 data_set<U>::data_set(const matrix<real_type, layout> &data_points) :
+    data_set{ mpi::communicator{}, data_points } { }
+
+template <typename U>
+template <layout_type layout>
+data_set<U>::data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points) :
     num_data_points_{ data_points.num_rows() },
     num_features_{ data_points.num_cols() },
+    comm_{ std::move(comm) },
     data_ptr_{ std::make_shared<soa_matrix<real_type>>(data_points, shape{ PADDING_SIZE, PADDING_SIZE }) } {
     // the provided data points vector may not be empty
     if (data_ptr_->num_rows() == 0) {
@@ -621,6 +833,7 @@ data_set<U>::data_set(const matrix<real_type, layout> &data_points) :
     }
 
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Created a data set with {} data points and {} features.\n",
                 detail::tracking::tracking_entry{ "data_set_create", "num_data_points", num_data_points_ },
                 detail::tracking::tracking_entry{ "data_set_create", "num_features", num_features_ });
@@ -629,8 +842,14 @@ data_set<U>::data_set(const matrix<real_type, layout> &data_points) :
 template <typename U>
 template <layout_type layout>
 data_set<U>::data_set(const matrix<real_type, layout> &data_points, std::vector<label_type> labels) :
+    data_set{ mpi::communicator{}, data_points, std::move(labels) } { }
+
+template <typename U>
+template <layout_type layout>
+data_set<U>::data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, std::vector<label_type> labels) :
     num_data_points_{ data_points.num_rows() },
     num_features_{ data_points.num_cols() },
+    comm_{ std::move(comm) },
     data_ptr_{ std::make_shared<soa_matrix<real_type>>(data_points, shape{ PADDING_SIZE, PADDING_SIZE }) },
     labels_ptr_{ std::make_shared<std::vector<label_type>>(std::move(labels)) } {
     // the number of labels must be equal to the number of data points!
@@ -643,6 +862,7 @@ data_set<U>::data_set(const matrix<real_type, layout> &data_points, std::vector<
     this->create_mapping(std::vector<label_type>(unique_labels.cbegin(), unique_labels.cend()));
 
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Created a data set with {} data points, {} features, and {} classes.\n",
                 detail::tracking::tracking_entry{ "data_set_create", "num_data_points", num_data_points_ },
                 detail::tracking::tracking_entry{ "data_set_create", "num_features", num_features_ },
@@ -652,7 +872,12 @@ data_set<U>::data_set(const matrix<real_type, layout> &data_points, std::vector<
 template <typename U>
 template <layout_type layout>
 data_set<U>::data_set(const matrix<real_type, layout> &data_points, scaling scale_parameter) :
-    data_set{ std::move(data_points) } {
+    data_set{ mpi::communicator{}, data_points, std::move(scale_parameter) } { }
+
+template <typename U>
+template <layout_type layout>
+data_set<U>::data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, scaling scale_parameter) :
+    data_set{ std::move(comm), data_points } {  // delegate to the (comm, data_points) constructor: the communicator must be the first argument
     // initialize scaling
     scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
     // scale data set
@@ -662,7 +887,12 @@ data_set<U>::data_set(const matrix<real_type, layout> &data_points, scaling scal
 template <typename U>
 template <layout_type layout>
 data_set<U>::data_set(const matrix<real_type, layout> &data_points, std::vector<label_type> labels, scaling scale_parameter) :
-    data_set{ std::move(data_points), std::move(labels) } {
+    data_set{ mpi::communicator{}, data_points, std::move(labels), std::move(scale_parameter) } { }
+
+template <typename U>
+template <layout_type layout>
+data_set<U>::data_set(mpi::communicator comm, const matrix<real_type, layout> &data_points, std::vector<label_type> labels, scaling scale_parameter) :
+    data_set{ std::move(comm), data_points, std::move(labels) } {  // delegate to the (comm, data_points, labels) constructor: the communicator must be the first argument
     // initialize scaling
     scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
     // scale data set
@@ -673,31 +903,34 @@ template <typename U>
 void data_set<U>::save(const std::string &filename, const file_format_type format) const {
     const std::chrono::time_point start_time = std::chrono::steady_clock::now();
 
-    // save the data set
-    if (this->has_labels()) {
-        // save data with labels
-        switch (format) {
-            case file_format_type::libsvm:
-                detail::io::write_libsvm_data(filename, *data_ptr_, *labels_ptr_);
-                break;
-            case file_format_type::arff:
-                detail::io::write_arff_data(filename, *data_ptr_, *labels_ptr_);
-                break;
-        }
-    } else {
-        // save data without labels
-        switch (format) {
-            case file_format_type::libsvm:
-                detail::io::write_libsvm_data(filename, *data_ptr_);
-                break;
-            case file_format_type::arff:
-                detail::io::write_arff_data(filename, *data_ptr_);
-                break;
+    if (comm_.is_main_rank()) {
+        // save the data set
+        if (this->has_labels()) {
+            // save data with labels
+            switch (format) {
+                case file_format_type::libsvm:
+                    detail::io::write_libsvm_data(filename, *data_ptr_, *labels_ptr_);
+                    break;
+                case file_format_type::arff:
+                    detail::io::write_arff_data(filename, *data_ptr_, *labels_ptr_);
+                    break;
+            }
+        } else {
+            // save data without labels
+            switch (format) {
+                case file_format_type::libsvm:
+                    detail::io::write_libsvm_data(filename, *data_ptr_);
+                    break;
+                case file_format_type::arff:
+                    detail::io::write_arff_data(filename, *data_ptr_);
+                    break;
+            }
         }
     }
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Write {} data points with {} features and {} classes in {} to the {} file '{}'.\n",
                 detail::tracking::tracking_entry{ "data_set_write", "num_data_points", num_data_points_ },
                 detail::tracking::tracking_entry{ "data_set_write", "num_features", num_features_ },
@@ -829,6 +1062,7 @@ void data_set<U>::scale() {
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Scaled the data set to the range [{}, {}] in {}.\n",
                 detail::tracking::tracking_entry{ "data_set_scale", "lower", lower },
                 detail::tracking::tracking_entry{ "data_set_scale", "upper", upper },
@@ -884,6 +1118,7 @@ void data_set<U>::read_file(const std::string &filename, file_format_type format
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Read {} data points with {} features and {} classes in {} using the {} parser from file '{}'.\n",
                 detail::tracking::tracking_entry{ "data_set_read", "num_data_points", num_data_points_ },
                 detail::tracking::tracking_entry{ "data_set_read", "num_features", num_features_ },
diff --git a/include/plssvm/detail/cmd/data_set_variants.hpp b/include/plssvm/detail/cmd/data_set_variants.hpp
index 239d9a007..43d30cee3 100644
--- a/include/plssvm/detail/cmd/data_set_variants.hpp
+++ b/include/plssvm/detail/cmd/data_set_variants.hpp
@@ -17,6 +17,7 @@
 #include "plssvm/detail/cmd/parser_predict.hpp"  // plssvm::detail::cmd::parser_predict
 #include "plssvm/detail/cmd/parser_scale.hpp"    // plssvm::detail::cmd::parser_scale
 #include "plssvm/detail/cmd/parser_train.hpp"    // plssvm::detail::cmd::parser_train
+#include "plssvm/mpi/communicator.hpp"           // plssvm::mpi::communicator
 
 #include <string>   // std::string
 #include <variant>  // std::variant
@@ -31,37 +32,40 @@ using data_set_variants = std::variant<plssvm::data_set<int>, plssvm::data_set<s
 /**
  * @brief Return the correct data set based on the plssvm::detail::cmd::parser_train command line options.
  * @tparam label_type the type of the labels
+ * @param[in] comm the MPI communicator wrapper
  * @param[in] cmd_parser the provided command line parser
  * @return the data set based on the provided command line parser (`[[nodiscard]]`)
  */
 template <typename label_type = typename data_set<>::label_type>
-[[nodiscard]] inline data_set_variants data_set_factory_impl(const cmd::parser_train &cmd_parser) {
-    return data_set_variants{ plssvm::data_set<label_type>{ cmd_parser.input_filename } };
+[[nodiscard]] inline data_set_variants data_set_factory_impl(mpi::communicator comm, const cmd::parser_train &cmd_parser) {
+    return data_set_variants{ plssvm::data_set<label_type>{ std::move(comm), cmd_parser.input_filename } };
 }
 
 /**
  * @brief Return the correct data set based on the plssvm::detail::cmd::parser_predict command line options.
  * @tparam label_type the type of the labels
+ * @param[in] comm the MPI communicator wrapper
  * @param[in] cmd_parser the provided command line parser
  * @return the data set based on the provided command line parser (`[[nodiscard]]`)
  */
 template <typename label_type = typename data_set<>::label_type>
-[[nodiscard]] inline data_set_variants data_set_factory_impl(const cmd::parser_predict &cmd_parser) {
-    return data_set_variants{ plssvm::data_set<label_type>{ cmd_parser.input_filename } };
+[[nodiscard]] inline data_set_variants data_set_factory_impl(mpi::communicator comm, const cmd::parser_predict &cmd_parser) {
+    return data_set_variants{ plssvm::data_set<label_type>{ std::move(comm), cmd_parser.input_filename } };
 }
 
 /**
  * @brief Return the correct data set based on the plssvm::detail::cmd::parser_scale command line options.
  * @tparam label_type the type of the labels
+ * @param[in] comm the MPI communicator wrapper
  * @param[in] cmd_parser the provided command line parser
  * @return the data set based on the provided command line parser (`[[nodiscard]]`)
  */
 template <typename label_type = typename data_set<>::label_type>
-[[nodiscard]] inline data_set_variants data_set_factory_impl(const cmd::parser_scale &cmd_parser) {
+[[nodiscard]] inline data_set_variants data_set_factory_impl(mpi::communicator comm, const cmd::parser_scale &cmd_parser) {
     if (!cmd_parser.restore_filename.empty()) {
-        return data_set_variants{ plssvm::data_set<label_type>{ cmd_parser.input_filename, { cmd_parser.restore_filename } } };
+        return data_set_variants{ plssvm::data_set<label_type>{ comm, cmd_parser.input_filename, { comm, cmd_parser.restore_filename } } };
     } else {
-        return data_set_variants{ plssvm::data_set<label_type>{ cmd_parser.input_filename, { cmd_parser.lower, cmd_parser.upper } } };
+        return data_set_variants{ plssvm::data_set<label_type>{ comm, cmd_parser.input_filename, { comm, cmd_parser.lower, cmd_parser.upper } } };
     }
 }
 
@@ -74,9 +78,25 @@ template <typename label_type = typename data_set<>::label_type>
 template <typename cmd_parser_type>
 [[nodiscard]] inline data_set_variants data_set_factory(const cmd_parser_type &cmd_parser) {
     if (cmd_parser.strings_as_labels) {
-        return data_set_factory_impl<std::string>(cmd_parser);
+        return data_set_factory_impl<std::string>(mpi::communicator{}, cmd_parser);
     } else {
-        return data_set_factory_impl(cmd_parser);
+        return data_set_factory_impl(mpi::communicator{}, cmd_parser);
+    }
+}
+
+/**
+ * @brief Based on the provided command line @p cmd_parser, return the correct plssvm::data_set.
+ * @tparam cmd_parser_type the type of the command line parser (train, predict, or scale)
+ * @param[in] comm the MPI communicator wrapper
+ * @param[in] cmd_parser the provided command line parser
+ * @return the data set based on the provided command line parser (`[[nodiscard]]`)
+ */
+template <typename cmd_parser_type>
+[[nodiscard]] inline data_set_variants data_set_factory(mpi::communicator comm, const cmd_parser_type &cmd_parser) {
+    if (cmd_parser.strings_as_labels) {
+        return data_set_factory_impl<std::string>(std::move(comm), cmd_parser);
+    } else {
+        return data_set_factory_impl(std::move(comm), cmd_parser);
     }
 }
 
diff --git a/include/plssvm/detail/cmd/parser_predict.hpp b/include/plssvm/detail/cmd/parser_predict.hpp
index 4ba2e1a65..9c1cb880c 100644
--- a/include/plssvm/detail/cmd/parser_predict.hpp
+++ b/include/plssvm/detail/cmd/parser_predict.hpp
@@ -16,6 +16,7 @@
 #include "plssvm/backend_types.hpp"                       // plssvm::backend_type
 #include "plssvm/backends/Kokkos/execution_space.hpp"     // plssvm::kokkos::execution_space
 #include "plssvm/backends/SYCL/implementation_types.hpp"  // plssvm::sycl::implementation_type
+#include "plssvm/mpi/communicator.hpp"                    // plssvm::mpi::communicator
 #include "plssvm/target_platforms.hpp"                    // plssvm::target_platform
 
 #include "fmt/base.h"     // fmt::formatter
@@ -33,10 +34,11 @@ struct parser_predict {
     /**
      * @brief Parse the command line arguments @p argv using [`cxxopts`](https://github.com/jarro2783/cxxopts) and set the predict parameters accordingly.
      * @details If no output filename is given, uses the input filename and appends a ".predict". The output file is than saved in the current working directory.
+     * @param[in] comm the MPI communicator wrapper
      * @param[in] argc the number of passed command line arguments
      * @param[in] argv the command line arguments
      */
-    parser_predict(int argc, char **argv);
+    parser_predict(const mpi::communicator &comm, int argc, char **argv);
 
     /// The used backend: automatic (depending on the specified target_platforms), OpenMP, HPX, stdpar, CUDA, HIP, OpenCL, SYCL, or Kokkos.
     backend_type backend{ backend_type::automatic };
diff --git a/include/plssvm/detail/cmd/parser_train.hpp b/include/plssvm/detail/cmd/parser_train.hpp
index 73897249a..dc762b7aa 100644
--- a/include/plssvm/detail/cmd/parser_train.hpp
+++ b/include/plssvm/detail/cmd/parser_train.hpp
@@ -19,6 +19,7 @@
 #include "plssvm/backends/SYCL/kernel_invocation_types.hpp"  // plssvm::sycl::kernel_invocation_type
 #include "plssvm/classification_types.hpp"                   // plssvm::classification_type
 #include "plssvm/constants.hpp"                              // plssvm::real_type
+#include "plssvm/mpi/communicator.hpp"                       // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                              // plssvm::parameter
 #include "plssvm/solver_types.hpp"                           // plssvm::solving_type
 #include "plssvm/target_platforms.hpp"                       // plssvm::target_platform
@@ -39,10 +40,11 @@ struct parser_train {
     /**
      * @brief Parse the command line arguments @p argv using [`cxxopts`](https://github.com/jarro2783/cxxopts) and set the training parameters accordingly.
      * @details If no model filename is given, uses the input filename and appends a ".model". The model file is than saved in the current working directory.
+     * @param[in] comm the MPI communicator wrapper
      * @param[in] argc the number of passed command line arguments
      * @param[in] argv the command line arguments
      */
-    parser_train(int argc, char **argv);
+    parser_train(const mpi::communicator &comm, int argc, char **argv);
 
     /// Other base C-SVM parameters
     plssvm::parameter csvm_params{};
diff --git a/include/plssvm/detail/io/libsvm_model_parsing.hpp b/include/plssvm/detail/io/libsvm_model_parsing.hpp
index c42c82e8e..9fef6c8b0 100644
--- a/include/plssvm/detail/io/libsvm_model_parsing.hpp
+++ b/include/plssvm/detail/io/libsvm_model_parsing.hpp
@@ -26,6 +26,7 @@
 #include "plssvm/gamma.hpp"                     // plssvm::get_gamma_string
 #include "plssvm/kernel_function_types.hpp"     // plssvm::kernel_function_type
 #include "plssvm/matrix.hpp"                    // plssvm::soa_matrix
+#include "plssvm/mpi/communicator.hpp"          // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                 // plssvm::parameter
 #include "plssvm/shape.hpp"                     // plssvm::shape
 #include "plssvm/verbosity_levels.hpp"          // plssvm::verbosity_level
@@ -580,6 +581,7 @@ template <typename label_type>
  * @endcode
  * @tparam label_type the type of the labels (any arithmetic type, except bool, or std::string)
  * @param[in,out] out the output-stream to write the header information to
+ * @param[in] comm the used MPI communicator
  * @param[in] params the SVM parameters
  * @param[in] rho the rho values for the different classes resulting from the hyperplane learning
  * @param[in] data the data used to create the model
@@ -587,7 +589,7 @@ template <typename label_type>
  * @return the order of the different classes as it should appear in the following data section (`[[nodiscard]]`)
  */
 template <typename label_type>
-[[nodiscard]] inline std::vector<label_type> write_libsvm_model_header(fmt::ostream &out, const plssvm::parameter &params, const std::vector<real_type> &rho, const data_set<label_type> &data) {
+[[nodiscard]] inline std::vector<label_type> write_libsvm_model_header(fmt::ostream &out, const mpi::communicator &comm, const plssvm::parameter &params, const std::vector<real_type> &rho, const data_set<label_type> &data) {
     PLSSVM_ASSERT(data.has_labels(), "Cannot write a model file that does not include labels!");
     PLSSVM_ASSERT(!rho.empty(), "At least one rho value must be provided!");
 
@@ -634,6 +636,7 @@ template <typename label_type>
 
     // print model header
     detail::log(verbosity_level::full | verbosity_level::libsvm,
+                comm,
                 "\n{}\n",
                 out_string);
     // write model header to file
@@ -665,6 +668,7 @@ template <typename label_type>
  * @endcode
  * @tparam label_type the type of the labels (any arithmetic type, except bool, or std::string)
  * @param[in] filename the file to write the LIBSVM model to
+ * @param[in] comm the used MPI communicator
  * @param[in] params the SVM parameters
  * @param[in] classification the used multi-class classification strategy
  * @param[in] rho the rho value resulting from the hyperplane learning
@@ -674,7 +678,7 @@ template <typename label_type>
  * @attention The PLSSVM model file is only compatible with LIBSVM for the one vs. one classification type.
  */
 template <typename label_type>
-inline void write_libsvm_model_data(const std::string &filename, const plssvm::parameter &params, const classification_type classification, const std::vector<real_type> &rho, const std::vector<aos_matrix<real_type>> &alpha, const std::vector<std::vector<std::size_t>> &index_sets, const data_set<label_type> &data) {
+inline void write_libsvm_model_data(const std::string &filename, const mpi::communicator &comm, const plssvm::parameter &params, const classification_type classification, const std::vector<real_type> &rho, const std::vector<aos_matrix<real_type>> &alpha, const std::vector<std::vector<std::size_t>> &index_sets, const data_set<label_type> &data) {
     PLSSVM_ASSERT(!filename.empty(), "The provided model filename must not be empty!");
     PLSSVM_ASSERT(data.has_labels(), "Cannot write a model file that does not include labels!");
     PLSSVM_ASSERT(rho.size() == calculate_number_of_classifiers(classification, data.num_classes()),
@@ -725,7 +729,7 @@ inline void write_libsvm_model_data(const std::string &filename, const plssvm::p
     fmt::ostream out = fmt::output_file(filename);
 
     // write header information
-    const std::vector<label_type> label_order = write_libsvm_model_header(out, params, rho, data);
+    const std::vector<label_type> label_order = write_libsvm_model_header(out, comm, params, rho, data);
 
     // the maximum size of one formatted LIBSVM entry, e.g., 1234:1.365363e+10
     // biggest number representable as std::size_t: 18446744073709551615 -> 20 chars
diff --git a/include/plssvm/detail/logging.hpp b/include/plssvm/detail/logging.hpp
index 8cccb39b9..ee6350d9e 100644
--- a/include/plssvm/detail/logging.hpp
+++ b/include/plssvm/detail/logging.hpp
@@ -15,6 +15,7 @@
 
 #include "plssvm/detail/tracking/performance_tracker.hpp"  // plssvm::detail::tracking::is_tracking_entry_v,
                                                            // PLSSVM_PERFORMANCE_TRACKER_ENABLED, PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY
+#include "plssvm/mpi/communicator.hpp"                     // plssvm::mpi::communicator
 #include "plssvm/verbosity_levels.hpp"                     // plssvm::verbosity_level, plssvm::verbosity, bitwise-operators on plssvm::verbosity_level
 
 #include "fmt/chrono.h"  // format std::chrono types
@@ -38,7 +39,7 @@ namespace plssvm::detail {
  * @param[in] args the values to fill the {fmt}-like placeholders in @p msg
  */
 template <typename... Args>
-void log(const verbosity_level verb, const std::string_view msg, Args &&...args) {
+void log(const verbosity_level verb, const std::string_view msg, Args &&...args) {  // TODO: remove
     // if the verbosity level is quiet, nothing is logged
     // otherwise verb must contain the bit-flag currently set by plssvm::verbosity
     if (verbosity != verbosity_level::quiet && (verb & verbosity) != verbosity_level::quiet) {
@@ -62,6 +63,28 @@ void log(const verbosity_level verb, const std::string_view msg, Args &&...args)
 #endif
 }
 
+/**
+ * @brief Output the message @p msg, filling the {fmt}-like placeholders with @p args, to the standard output stream, but only if the calling process is the main MPI rank of @p comm.
+ * @details Forwards to the communicator-less `log` overload: with @p verb on the main MPI rank and with `plssvm::verbosity_level::quiet` on all other MPI ranks.
+ *          If a value in @p args is of type plssvm::detail::tracking_entry and performance tracking is enabled,
+ *          this is also added to the `plssvm::detail::performance_tracker`. The message is only logged if the verbosity level matches the `plssvm::verbosity` level.
+ * @tparam Args the types of the placeholder values
+ * @param[in] verb the verbosity level of the message to log; must match the `plssvm::verbosity` level to log the message
+ * @param[in] comm the used MPI communicator; only its main rank may print the message
+ * @param[in] msg the message to print on the standard output stream if requested (i.e., `plssvm::verbosity` isn't `plssvm::verbosity_level::quiet`)
+ * @param[in] args the values to fill the {fmt}-like placeholders in @p msg
+ */
+template <typename... Args>
+void log(const verbosity_level verb, const mpi::communicator &comm, const std::string_view msg, Args &&...args) {
+    if (comm.is_main_rank()) {
+        // main MPI rank: log with the requested verbosity level
+        log(verb, msg, std::forward<Args>(args)...);
+    } else {
+        // all other MPI ranks: still forward the call, but with verbosity_level::quiet to suppress the output
+        log(verbosity_level::quiet, msg, std::forward<Args>(args)...);
+    }
+}
+
 }  // namespace plssvm::detail
 
 #endif  // PLSSVM_DETAIL_LOGGING_HPP_
diff --git a/include/plssvm/detail/logging_without_performance_tracking.hpp b/include/plssvm/detail/logging_without_performance_tracking.hpp
index a92729a66..650646081 100644
--- a/include/plssvm/detail/logging_without_performance_tracking.hpp
+++ b/include/plssvm/detail/logging_without_performance_tracking.hpp
@@ -13,6 +13,7 @@
 #define PLSSVM_DETAIL_LOGGING_WITHOUT_PERFORMANCE_TRACKING_HPP_
 #pragma once
 
+#include "plssvm/mpi/communicator.hpp"  // plssvm::mpi::communicator
 #include "plssvm/verbosity_levels.hpp"  // plssvm::verbosity_level, plssvm::verbosity, bitwise-operators on plssvm::verbosity_level
 
 #include "fmt/chrono.h"  // format std::chrono types
@@ -46,6 +47,26 @@ void log_untracked(const verbosity_level verb, const std::string_view msg, Args
     }
 }
 
+/**
+ * @brief Output the message @p msg, filling the {fmt}-like placeholders with @p args, to the standard output stream, but only if the calling process is the main MPI rank of @p comm.
+ * @details Forwards to the communicator-less `log_untracked` overload: with @p verb on the main MPI rank and with `plssvm::verbosity_level::quiet` on all other MPI ranks. Only logs the message if the verbosity level matches the `plssvm::verbosity` level.
+ * @tparam Args the types of the placeholder values
+ * @param[in] verb the verbosity level of the message to log; must match the `plssvm::verbosity` level to log the message
+ * @param[in] comm the used MPI communicator; only its main rank may print the message
+ * @param[in] msg the message to print on the standard output stream if requested (i.e., `plssvm::verbosity` isn't `plssvm::verbosity_level::quiet`)
+ * @param[in] args the values to fill the {fmt}-like placeholders in @p msg
+ */
+template <typename... Args>
+void log_untracked(const verbosity_level verb, const mpi::communicator &comm, const std::string_view msg, Args &&...args) {
+    if (comm.is_main_rank()) {
+        // main MPI rank: log with the requested verbosity level
+        log_untracked(verb, msg, std::forward<Args>(args)...);
+    } else {
+        // all other MPI ranks: still forward the call, but with verbosity_level::quiet to suppress the output
+        log_untracked(verbosity_level::quiet, msg, std::forward<Args>(args)...);
+    }
+}
+
 }  // namespace plssvm::detail
 
 #endif  // PLSSVM_DETAIL_LOGGING_WITHOUT_PERFORMANCE_TRACKING_HPP_
diff --git a/include/plssvm/model.hpp b/include/plssvm/model.hpp
index 6522526b3..0bec38869 100644
--- a/include/plssvm/model.hpp
+++ b/include/plssvm/model.hpp
@@ -23,6 +23,7 @@
 #include "plssvm/detail/tracking/performance_tracker.hpp"  // PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY, plssvm::detail::tracking::tracking_entry
 #include "plssvm/detail/type_list.hpp"                     // plssvm::detail::{supported_label_types, tuple_contains_v}
 #include "plssvm/matrix.hpp"                               // plssvm::soa_matrix, plssvm::aos_matrix
+#include "plssvm/mpi/communicator.hpp"                     // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                            // plssvm::parameter
 #include "plssvm/verbosity_levels.hpp"                     // plssvm::verbosity_level
 
@@ -45,6 +46,7 @@ namespace plssvm {
 
 /**
  * @brief Implements a class encapsulating the result of a call to the SVM fit function. A model is used to predict the labels of a new data set.
+ * @note Currently, **each** MPI rank loads/stores the whole data set (if MPI is available).
  * @tparam U the type of the used labels (must be an arithmetic type or `std:string`; default: `int`)
  */
 template <typename U = int>
@@ -67,9 +69,17 @@ class model {
      * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::detail::io::parse_libsvm_model_header and plssvm::detail::io::parse_libsvm_data
      */
     explicit model(const std::string &filename);
+    /**
+     * @brief Read a previously learned model from the LIBSVM model file @p filename.
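+     * @param[in] comm the used MPI communicator (**note**: currently not used for reading the file itself; it is stored in the model and used to restrict logging and saving to the main MPI rank)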
+     * @param[in] filename the model file to read
+     * @throws plssvm::invalid_file_format_exception all exceptions thrown by plssvm::detail::io::parse_libsvm_model_header and plssvm::detail::io::parse_libsvm_data
+     */
+    model(mpi::communicator comm, const std::string &filename);
 
     /**
      * @brief Save the model to a LIBSVM model file for later usage.
+     * @note Only the main MPI rank (traditionally rank 0) writes the model file (if MPI is available); all other MPI ranks skip the file output.
      * @param[in] filename the file to save the model to
      */
     void save(const std::string &filename) const;
@@ -177,6 +187,9 @@ class model {
     /// The number of iterations needed to fit this model.
     std::optional<std::vector<unsigned long long>> num_iters_{};
 
+    /// The used MPI communicator.
+    mpi::communicator comm_{};
+
     /**
      * @brief The learned weights for each support vector.
      * @details For one vs. all the vector contains a single matrix representing all weights.
@@ -213,10 +226,16 @@ model<U>::model(parameter params, data_set<label_type> data, const classificatio
     classification_strategy_{ classification_strategy },
     data_{ std::move(data) },
     num_support_vectors_{ data_.num_data_points() },
-    num_features_{ data_.num_features() } { }
+    num_features_{ data_.num_features() },
+    comm_{ data_.communicator() } { }
+
+template <typename U>
+model<U>::model(const std::string &filename) :
+    model{ mpi::communicator{}, filename } { }
 
 template <typename U>
-model<U>::model(const std::string &filename) {
+model<U>::model(mpi::communicator comm, const std::string &filename) :
+    comm_{ std::move(comm) } {
     const std::chrono::time_point start_time = std::chrono::steady_clock::now();
 
     // open the file
@@ -271,6 +290,7 @@ model<U>::model(const std::string &filename) {
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Read {} support vectors with {} features and {} classes using {} classification in {} using the libsvm model parser from file '{}'.\n\n",
                 detail::tracking::tracking_entry{ "model_read", "num_support_vectors", num_support_vectors_ },
                 detail::tracking::tracking_entry{ "model_read", "num_features", num_features_ },
@@ -290,11 +310,14 @@ void model<U>::save(const std::string &filename) const {
 
     const std::chrono::time_point start_time = std::chrono::steady_clock::now();
 
-    // save model file header and support vectors
-    detail::io::write_libsvm_model_data(filename, params_, classification_strategy_, *rho_ptr_, *alpha_ptr_, *index_sets_ptr_, data_);
+    if (comm_.is_main_rank()) {
+        // save model file header and support vectors
+        detail::io::write_libsvm_model_data(filename, comm_, params_, classification_strategy_, *rho_ptr_, *alpha_ptr_, *index_sets_ptr_, data_);
+    }
 
     const std::chrono::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Write {} support vectors with {} features and {} classes using {} classification in {} to the libsvm model file '{}'.\n",
                 detail::tracking::tracking_entry{ "model_write", "num_support_vectors", num_support_vectors_ },
                 detail::tracking::tracking_entry{ "model_write", "num_features", num_features_ },
diff --git a/src/main_predict.cpp b/src/main_predict.cpp
index 3d47ad53f..f87e5c3f3 100644
--- a/src/main_predict.cpp
+++ b/src/main_predict.cpp
@@ -22,7 +22,7 @@
     #include "hws/system_hardware_sampler.hpp"  // hws::system_hardware_sampler
 #endif
 
-#include "fmt/format.h"  // fmt::print
+#include "fmt/format.h"  // fmt::print, fmt::format
 #include "fmt/os.h"      // fmt::ostream, fmt::output_file
 #include "fmt/ranges.h"  // fmt::join
 
@@ -34,6 +34,7 @@
 #include <functional>  // std::mem_fn
 #include <iostream>    // std::cerr, std::endl
 #include <memory>      // std::unique_ptr, std::make_unique
+#include <string>      // std::string
 #include <utility>     // std::pair
 #include <variant>     // std::visit
 #include <vector>      // std::vector
@@ -41,9 +42,11 @@
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
-    // create std::unique_ptr containing a plssvm::scope_guard
-    // -> used to automatically handle necessary environment teardown operations
-    std::unique_ptr<plssvm::environment::scope_guard> environment_guard{};
+    // create environment scoped guard
+    const plssvm::environment::scope_guard environment_guard{};
+    // create a PLSSVM communicator -> use MPI_COMM_WORLD for our executables
+    // if MPI is not supported, does nothing
+    const plssvm::mpi::communicator comm{};
 
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
@@ -56,17 +59,22 @@ int main(int argc, char *argv[]) {
 #endif
 
         // parse SVM parameter from command line
-        const plssvm::detail::cmd::parser_predict cmd_parser{ argc, argv };
+        const plssvm::detail::cmd::parser_predict cmd_parser{ comm, argc, argv };
+
+        // add MPI related tracking entries
+        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "mpi", "", comm }));
 
         // send warning if the build type is release and assertions are enabled
         if constexpr (std::string_view{ PLSSVM_BUILD_TYPE } == "Release" && PLSSVM_IS_DEFINED(PLSSVM_ENABLE_ASSERTS)) {
             plssvm::detail::log(plssvm::verbosity_level::full | plssvm::verbosity_level::warning,
+                                comm,
                                 "WARNING: The build type is set to Release, but assertions are enabled. "
                                 "This may result in a noticeable performance degradation in parts of PLSSVM!\n");
         }
 
         // output used parameter
         plssvm::detail::log(plssvm::verbosity_level::full,
+                            comm,
                             "\ntask: prediction\n{}\n",
                             plssvm::detail::tracking::tracking_entry{ "parameter", "", cmd_parser });
 
@@ -76,39 +84,28 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-            // check whether HPX is used as backend (it is either requested directly or as automatic backend)
-            const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) };
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
-            // initialize environments if necessary
-            std::vector<plssvm::backend_type> backends_to_initialize{};
-            if (use_hpx_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::hpx);
-            }
-            if (use_kokkos_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
-            }
-            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
-
             // create default csvm
             const std::unique_ptr<plssvm::csvm> svm = [&]() {
                 if (use_sycl_as_backend) {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type);
                 } else if (use_kokkos_as_backend) {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
                 } else {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target);
                 }
             }();
 
             // create model
-            const plssvm::model<label_type> model{ cmd_parser.model_filename };
+            const plssvm::model<label_type> model{ comm, cmd_parser.model_filename };
 
             // output parameter used to learn the model
             {
                 const plssvm::parameter params = model.get_params();
                 plssvm::detail::log(plssvm::verbosity_level::full,
+                                    comm,
                                     "Parameter used to train the model:\n"
                                     "  kernel_type: {} -> {}\n",
                                     params.kernel_type,
@@ -118,6 +115,7 @@ int main(int argc, char *argv[]) {
                         break;
                     case plssvm::kernel_function_type::polynomial:
                         plssvm::detail::log(plssvm::verbosity_level::full,
+                                            comm,
                                             "  degree: {}\n"
                                             "  gamma: {}\n"
                                             "  coef0: {}\n",
@@ -128,10 +126,11 @@ int main(int argc, char *argv[]) {
                     case plssvm::kernel_function_type::rbf:
                     case plssvm::kernel_function_type::laplacian:
                     case plssvm::kernel_function_type::chi_squared:
-                        plssvm::detail::log(plssvm::verbosity_level::full, "  gamma: {}\n", plssvm::get_gamma_string(params.gamma));
+                        plssvm::detail::log(plssvm::verbosity_level::full, comm, "  gamma: {}\n", plssvm::get_gamma_string(params.gamma));
                         break;
                     case plssvm::kernel_function_type::sigmoid:
                         plssvm::detail::log(plssvm::verbosity_level::full,
+                                            comm,
                                             "  gamma: {}\n"
                                             "  coef0: {}\n",
                                             plssvm::get_gamma_string(params.gamma),
@@ -147,11 +146,15 @@ int main(int argc, char *argv[]) {
             {
                 const std::chrono::time_point write_start_time = std::chrono::steady_clock::now();
 
-                fmt::ostream out = fmt::output_file(cmd_parser.predict_filename);
-                out.print("{}", fmt::join(predicted_labels, "\n"));
+                // only write predict file on the main MPI rank
+                if (comm.is_main_rank()) {
+                    fmt::ostream out = fmt::output_file(cmd_parser.predict_filename);
+                    out.print("{}", fmt::join(predicted_labels, "\n"));
+                }
 
                 const std::chrono::time_point write_end_time = std::chrono::steady_clock::now();
                 plssvm::detail::log(plssvm::verbosity_level::full | plssvm::verbosity_level::timing,
+                                    comm,
                                     "Write {} predictions in {} to the file '{}'.\n",
                                     plssvm::detail::tracking::tracking_entry{ "predictions_write", "num_predictions", predicted_labels.size() },
                                     plssvm::detail::tracking::tracking_entry{ "predictions_write", "time", std::chrono::duration_cast<std::chrono::milliseconds>(write_end_time - write_start_time) },
@@ -165,15 +168,15 @@ int main(int argc, char *argv[]) {
                 const plssvm::classification_report report{ correct_labels, predicted_labels };
 
                 // print complete report
-                plssvm::detail::log(plssvm::verbosity_level::full, "\n{}\n", report);
+                plssvm::detail::log(plssvm::verbosity_level::full, comm, "\n{}\n", report);
                 // print only accuracy for LIBSVM conformity
-                plssvm::detail::log(plssvm::verbosity_level::libsvm, "{} (classification)\n", report.accuracy());
+                plssvm::detail::log(plssvm::verbosity_level::libsvm, comm, "{} (classification)\n", report.accuracy());
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "accuracy", "achieved_accuracy", report.accuracy().achieved_accuracy }));
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "accuracy", "num_correct", report.accuracy().num_correct }));
                 PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "accuracy", "num_total", report.accuracy().num_total }));
             }
         };
-        std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
+        std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(comm, cmd_parser));
 
         // stop CPU hardware sampler and dump results if available
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
@@ -183,16 +186,25 @@ int main(int argc, char *argv[]) {
 
         const std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now();
         plssvm::detail::log(plssvm::verbosity_level::full | plssvm::verbosity_level::timing,
+                            comm,
                             "\nTotal runtime: {}\n",
                             plssvm::detail::tracking::tracking_entry{ "", "total_time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) });
 
-        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE(cmd_parser.performance_tracking_filename);
+        // TODO: really change file name? what to output on the command line?
+        std::string performance_tracking_filename{ cmd_parser.performance_tracking_filename };
+#if defined(PLSSVM_HAS_MPI_ENABLED)
+        if (!performance_tracking_filename.empty()) {
+            // only append rank name to the file name if a file name has been provided
+            performance_tracking_filename += fmt::format(".{}", comm.rank());
+        }
+#endif
+        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE(performance_tracking_filename);
 
     } catch (const plssvm::exception &e) {
-        std::cerr << e.what_with_loc() << std::endl;
+        std::cerr << fmt::format("An exception occurred on MPI rank {}!: {}", comm.rank(), e.what_with_loc()) << std::endl;
         return EXIT_FAILURE;
     } catch (const std::exception &e) {
-        std::cerr << e.what() << std::endl;
+        std::cerr << fmt::format("An exception occurred on MPI rank {}!: {}", comm.rank(), e.what()) << std::endl;
         return EXIT_FAILURE;
     }
 
diff --git a/src/main_train.cpp b/src/main_train.cpp
index 2e2a39905..c28a8d328 100644
--- a/src/main_train.cpp
+++ b/src/main_train.cpp
@@ -21,6 +21,10 @@
     #include "hws/system_hardware_sampler.hpp"  // hws::system_hardware_sampler
 #endif
 
+#include "plssvm/mpi/detail/version.hpp"
+
+#include "fmt/format.h"  // fmt::format
+
 #include <algorithm>    // std::for_each
 #include <chrono>       // std::chrono::{steady_clock, duration, milliseconds}, std::chrono_literals namespace
 #include <cstddef>      // std::size_t
@@ -29,6 +33,7 @@
 #include <functional>   // std::mem_fn
 #include <iostream>     // std::cerr, std::endl
 #include <memory>       // std::unique_ptr, std::make_unique
+#include <string>       // std::string
 #include <type_traits>  // std::remove_reference_t
 #include <utility>      // std::pair
 #include <variant>      // std::visit
@@ -37,9 +42,11 @@
 using namespace std::chrono_literals;
 
 int main(int argc, char *argv[]) {
-    // create std::unique_ptr containing a plssvm::scope_guard
-    // -> used to automatically handle necessary environment teardown operations
-    std::unique_ptr<plssvm::environment::scope_guard> environment_guard{};
+    // create environment scoped guard
+    const plssvm::environment::scope_guard environment_guard{};
+    // create a PLSSVM communicator -> use MPI_COMM_WORLD for our executables
+    // if MPI is not supported, does nothing
+    const plssvm::mpi::communicator comm{};
 
     try {
         const std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
@@ -52,17 +59,22 @@ int main(int argc, char *argv[]) {
 #endif
 
         // parse SVM parameter from command line
-        plssvm::detail::cmd::parser_train cmd_parser{ argc, argv };
+        const plssvm::detail::cmd::parser_train cmd_parser{ comm, argc, argv };
+
+        // add MPI related tracking entries
+        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "mpi", "", comm }));
 
         // send warning if the build type is release and assertions are enabled
         if constexpr (std::string_view{ PLSSVM_BUILD_TYPE } == "Release" && PLSSVM_IS_DEFINED(PLSSVM_ENABLE_ASSERTS)) {
             plssvm::detail::log(plssvm::verbosity_level::full | plssvm::verbosity_level::warning,
+                                comm,
                                 "WARNING: The build type is set to Release, but assertions are enabled. "
                                 "This may result in a noticeable performance degradation in parts of PLSSVM!\n");
         }
 
         // output used parameter
         plssvm::detail::log(plssvm::verbosity_level::full,
+                            comm,
                             "\ntask: training\n{}\n\n\n",
                             plssvm::detail::tracking::tracking_entry{ "parameter", "", cmd_parser });
 
@@ -72,29 +84,17 @@ int main(int argc, char *argv[]) {
 
             // check whether SYCL is used as backend (it is either requested directly or as automatic backend)
             const bool use_sycl_as_backend{ cmd_parser.backend == plssvm::backend_type::sycl || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::sycl) };
-            // check whether HPX is used as backend (it is either requested directly or as automatic backend)
-            const bool use_hpx_as_backend{ cmd_parser.backend == plssvm::backend_type::hpx || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::hpx) };
             // check whether Kokkos is used as backend (it is either requested directly or as automatic backend)
             const bool use_kokkos_as_backend{ cmd_parser.backend == plssvm::backend_type::kokkos || (cmd_parser.backend == plssvm::backend_type::automatic && plssvm::determine_default_backend() == plssvm::backend_type::kokkos) };
 
-            // initialize environments if necessary
-            std::vector<plssvm::backend_type> backends_to_initialize{};
-            if (use_hpx_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::hpx);
-            }
-            if (use_kokkos_as_backend) {
-                backends_to_initialize.push_back(plssvm::backend_type::kokkos);
-            }
-            environment_guard = std::make_unique<plssvm::environment::scope_guard>(backends_to_initialize);
-
             // create SVM
             const std::unique_ptr<plssvm::csvm> svm = [&]() {
                 if (use_sycl_as_backend) {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::sycl_implementation_type = cmd_parser.sycl_implementation_type, plssvm::sycl_kernel_invocation_type = cmd_parser.sycl_kernel_invocation_type);
                 } else if (use_kokkos_as_backend) {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params, plssvm::kokkos_execution_space = cmd_parser.kokkos_execution_space);
                 } else {
-                    return plssvm::make_csvm(cmd_parser.backend, cmd_parser.target, cmd_parser.csvm_params);
+                    return plssvm::make_csvm(cmd_parser.backend, comm, cmd_parser.target, cmd_parser.csvm_params);
                 }
             }();
 
@@ -110,10 +110,11 @@ int main(int argc, char *argv[]) {
                                plssvm::max_iter = cmd_parser.max_iter,
                                plssvm::classification = cmd_parser.classification,
                                plssvm::solver = cmd_parser.solver);
+
             // save model to file
             model.save(cmd_parser.model_filename);
         };
-        std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(cmd_parser));
+        std::visit(data_set_visitor, plssvm::detail::cmd::data_set_factory(comm, cmd_parser));
 
         // stop CPU hardware sampler and dump results if available
 #if defined(PLSSVM_HARDWARE_SAMPLING_ENABLED)
@@ -121,18 +122,30 @@ int main(int argc, char *argv[]) {
         PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_HWS_ENTRY(sampler);
 #endif
 
+        // wait until all MPI processes reach this point
+        comm.barrier();
+
         const std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now();
         plssvm::detail::log(plssvm::verbosity_level::full,
+                            comm,
                             "\nTotal runtime: {}\n",
                             plssvm::detail::tracking::tracking_entry{ "", "total_time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) });
 
-        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE(cmd_parser.performance_tracking_filename);
+        // TODO: really change file name? what to output on the command line?
+        std::string performance_tracking_filename{ cmd_parser.performance_tracking_filename };
+#if defined(PLSSVM_HAS_MPI_ENABLED)
+        if (!performance_tracking_filename.empty()) {
+            // only append the MPI rank to the file name if a file name has been provided
+            performance_tracking_filename += fmt::format(".{}", comm.rank());
+        }
+#endif
+        PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_SAVE(performance_tracking_filename);
 
     } catch (const plssvm::exception &e) {
-        std::cerr << e.what_with_loc() << std::endl;
+        std::cerr << fmt::format("An exception occurred on MPI rank {}!: {}", comm.rank(), e.what_with_loc()) << std::endl;
         return EXIT_FAILURE;
     } catch (const std::exception &e) {
-        std::cerr << e.what() << std::endl;
+        std::cerr << fmt::format("An exception occurred on MPI rank {}!: {}", comm.rank(), e.what()) << std::endl;
         return EXIT_FAILURE;
     }
 
diff --git a/src/plssvm/backends/CUDA/csvm.cu b/src/plssvm/backends/CUDA/csvm.cu
index 9eebc97e3..ba29de3d7 100644
--- a/src/plssvm/backends/CUDA/csvm.cu
+++ b/src/plssvm/backends/CUDA/csvm.cu
@@ -26,6 +26,7 @@
 #include "plssvm/exceptions/exceptions.hpp"                                         // plssvm::exception
 #include "plssvm/gamma.hpp"                                                         // plssvm::gamma_type
 #include "plssvm/kernel_function_types.hpp"                                         // plssvm::kernel_function_type
+#include "plssvm/mpi/communicator.hpp"                                              // plssvm::mpi::communicator
 #include "plssvm/parameter.hpp"                                                     // plssvm::parameter
 #include "plssvm/shape.hpp"                                                         // plssvm::shape
 #include "plssvm/target_platforms.hpp"                                              // plssvm::target_platform
@@ -43,16 +44,23 @@
 #include <iostream>   // std::cout, std::endl
 #include <numeric>    // std::iota
 #include <string>     // std::string
+#include <utility>    // std::move
 #include <variant>    // std::get
 #include <vector>     // std:vector
 
 namespace plssvm::cuda {
 
 csvm::csvm(parameter params) :
-    csvm{ plssvm::target_platform::automatic, params } { }
+    csvm{ mpi::communicator{}, plssvm::target_platform::automatic, params } { }  // delegate: default-constructed communicator, automatic target platform
+
+csvm::csvm(mpi::communicator comm, parameter params) :
+    csvm{ std::move(comm), plssvm::target_platform::automatic, params } { }  // delegate: caller-provided communicator, automatic target platform
 
 csvm::csvm(target_platform target, parameter params) :
-    base_type{ params } {
+    csvm{ mpi::communicator{}, target, params } { }  // delegate: default-constructed communicator, caller-provided target platform
+
+csvm::csvm(mpi::communicator comm, target_platform target, parameter params) :
+    base_type{ std::move(comm), params } {  // all other constructors end up here: communicator and parameters go to the base class
     this->init(target);
 }
 
@@ -78,7 +86,10 @@ void csvm::init(const target_platform target) {
 #endif
     }
 
+    // TODO: how to handle device output on multiple MPI ranks?!
+
     plssvm::detail::log(verbosity_level::full,
+                        comm_,
                         "\nUsing CUDA ({}) as backend.\n",
                         plssvm::detail::tracking::tracking_entry{ "dependencies", "cuda_runtime_version", detail::get_runtime_version() });
     PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "backend", "backend", plssvm::backend_type::cuda }));
@@ -98,6 +109,7 @@ void csvm::init(const target_platform target) {
 
     // print found CUDA devices
     plssvm::detail::log(verbosity_level::full,
+                        comm_,
                         "Found {} CUDA device(s):\n",
                         plssvm::detail::tracking::tracking_entry{ "backend", "num_devices", devices_.size() });
     std::vector<std::string> device_names;
@@ -106,6 +118,7 @@ void csvm::init(const target_platform target) {
         cudaDeviceProp prop{};
         PLSSVM_CUDA_ERROR_CHECK(cudaGetDeviceProperties(&prop, device))
         plssvm::detail::log(verbosity_level::full,
+                            comm_,
                             "  [{}, {}, {}.{}]\n",
                             device,
                             prop.name,
@@ -115,6 +128,7 @@ void csvm::init(const target_platform target) {
     }
     PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((plssvm::detail::tracking::tracking_entry{ "backend", "device", device_names }));
     plssvm::detail::log(verbosity_level::full | verbosity_level::timing,
+                        comm_,
                         "\n");
 }
 
diff --git a/src/plssvm/csvm.cpp b/src/plssvm/csvm.cpp
index b2e1edfda..5018cf115 100644
--- a/src/plssvm/csvm.cpp
+++ b/src/plssvm/csvm.cpp
@@ -144,6 +144,7 @@ std::pair<soa_matrix<real_type>, std::vector<unsigned long long>> csvm::conjugat
 
         const std::size_t max_residual_difference_idx = rhs_idx_max_residual_difference();
         detail::log(verbosity_level::full | verbosity_level::timing,
+                    comm_,
                     "Start Iteration {} (max: {}) with {}/{} converged rhs (max residual {} with target residual {} for rhs {}). ",
                     iter + 1,
                     max_cg_iter,
@@ -189,6 +190,7 @@ std::pair<soa_matrix<real_type>, std::vector<unsigned long long>> csvm::conjugat
         const std::chrono::steady_clock::time_point iteration_end_time = std::chrono::steady_clock::now();
         const std::chrono::duration iteration_duration = std::chrono::duration_cast<std::chrono::milliseconds>(iteration_end_time - iteration_start_time);
         detail::log(verbosity_level::full | verbosity_level::timing,
+                    comm_,
                     "Done in {}.\n",
                     iteration_duration);
         total_iteration_time += iteration_duration;
@@ -199,6 +201,7 @@ std::pair<soa_matrix<real_type>, std::vector<unsigned long long>> csvm::conjugat
     }
     const std::size_t max_residual_difference_idx = rhs_idx_max_residual_difference();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Finished after {}/{} iterations with {}/{} converged rhs (max residual {} with target residual {} for rhs {}) and an average iteration time of {}.\n",
                 detail::tracking::tracking_entry{ "cg", "iterations", iter },
                 detail::tracking::tracking_entry{ "cg", "max_iterations", max_cg_iter },
@@ -213,6 +216,7 @@ std::pair<soa_matrix<real_type>, std::vector<unsigned long long>> csvm::conjugat
     PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "cg", "target_residuals", eps * eps * delta0 }));
     PLSSVM_DETAIL_TRACKING_PERFORMANCE_TRACKER_ADD_TRACKING_ENTRY((detail::tracking::tracking_entry{ "cg", "epsilon", eps }));
     detail::log(verbosity_level::libsvm,
+                comm_,
                 "optimization finished, #iter = {}\n",
                 iter);
 
@@ -271,6 +275,7 @@ std::pair<std::vector<real_type>, real_type> csvm::perform_dimensional_reduction
     const real_type QA_cost = kernel_function(A, num_rows_reduced, A, num_rows_reduced, params) + real_type{ 1.0 } / params.cost;
     const std::chrono::steady_clock::time_point dimension_reduction_end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Performed dimensional reduction in {}.\n",
                 detail::tracking::tracking_entry{ "cg", "dimensional_reduction", std::chrono::duration_cast<std::chrono::milliseconds>(dimension_reduction_end_time - dimension_reduction_start_time) });
 
@@ -296,6 +301,7 @@ aos_matrix<real_type> csvm::run_predict_values(const parameter &params, const so
 
     const std::chrono::steady_clock::time_point end_time = std::chrono::steady_clock::now();
     detail::log(verbosity_level::full | verbosity_level::timing,
+                comm_,
                 "Predicted the values of {} predict points using {} support vectors with {} features each in {}.\n",
                 predict_points.num_rows(),
                 support_vectors.num_rows(),
diff --git a/src/plssvm/detail/cmd/parser_predict.cpp b/src/plssvm/detail/cmd/parser_predict.cpp
index 656d9a76d..31a764626 100644
--- a/src/plssvm/detail/cmd/parser_predict.cpp
+++ b/src/plssvm/detail/cmd/parser_predict.cpp
@@ -14,6 +14,8 @@
 #include "plssvm/constants.hpp"                                    // plssvm::real_type
 #include "plssvm/detail/assert.hpp"                                // PLSSVM_ASSERT
 #include "plssvm/detail/logging_without_performance_tracking.hpp"  // plssvm::detail::log_untracked
+#include "plssvm/mpi/communicator.hpp"                             // plssvm::mpi::communicator
+#include "plssvm/mpi/environment.hpp"                              // plssvm::mpi::is_active, plssvm::mpi::finalize
 #include "plssvm/target_platforms.hpp"                             // plssvm::list_available_target_platforms
 #include "plssvm/verbosity_levels.hpp"                             // plssvm::verbosity, plssvm::verbosity_level
 #include "plssvm/version/version.hpp"                              // plssvm::version::detail::get_version_info
@@ -24,6 +25,7 @@
 #include "fmt/ranges.h"  // fmt::join
 
 #include <cstdlib>      // std::exit, EXIT_SUCCESS, EXIT_FAILURE
+#include <cstdlib>      // std::atexit
 #include <exception>    // std::exception
 #include <filesystem>   // std::filesystem::path
 #include <iostream>     // std::cout, std::cerr, std::endl
@@ -32,11 +34,18 @@
 
 namespace plssvm::detail::cmd {
 
-parser_predict::parser_predict(int argc, char **argv) {
+parser_predict::parser_predict(const mpi::communicator &comm, int argc, char **argv) {
     // check for basic argc and argv correctness
     PLSSVM_ASSERT(argc >= 1, fmt::format("At least one argument is always given (the executable name), but argc is {}!", argc));
     PLSSVM_ASSERT(argv != nullptr, "At least one argument is always given (the executable name), but argv is a nullptr!");
 
+    // register a std::atexit handler since our parser may directly call std::exit
+    std::atexit([]() {
+        if (mpi::is_active()) {
+            mpi::finalize();
+        }
+    });
+
     // setup command line parser with all available options
     cxxopts::Options options("plssvm-predict", "LS-SVM with multiple (GPU-)backends");
     options
@@ -74,27 +83,35 @@ parser_predict::parser_predict(int argc, char **argv) {
         options.parse_positional({ "test", "model", "output" });
         result = options.parse(argc, argv);
     } catch (const std::exception &e) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: {}\n", e.what()) << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: {}\n", e.what()) << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
 
     // print help message and exit
     if (result.count("help")) {
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_SUCCESS);
     }
 
     // print version info
     if (result.count("version")) {
-        std::cout << version::detail::get_version_info("plssvm-predict") << std::endl;
+        if (comm.is_main_rank()) {
+            std::cout << version::detail::get_version_info("plssvm-predict") << std::endl;
+        }
         std::exit(EXIT_SUCCESS);
     }
 
     // check if the number of positional arguments is not too large
     if (!result.unmatched().empty()) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: only up to three positional options may be given, but {} (\"{}\") additional option(s) where provided!", result.unmatched().size(), fmt::join(result.unmatched(), " ")) << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: only up to three positional options may be given, but {} (\"{}\") additional option(s) where provided!", result.unmatched().size(), fmt::join(result.unmatched(), " ")) << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
 
@@ -116,6 +133,7 @@ parser_predict::parser_predict(int argc, char **argv) {
         // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
         if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
                                   sycl_implementation_type);
         }
@@ -134,6 +152,7 @@ parser_predict::parser_predict(int argc, char **argv) {
         // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend
         if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n",
                                   kokkos_execution_space);
         }
@@ -151,6 +170,7 @@ parser_predict::parser_predict(int argc, char **argv) {
         const verbosity_level verb = result["verbosity"].as<verbosity_level>();
         if (quiet && verb != verbosity_level::quiet) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set the -q/--quiet flag, but the provided verbosity level isn't \"quiet\"; setting --verbosity={} to --verbosity=quiet\n",
                                   verb);
             verbosity = verbosity_level::quiet;
@@ -163,16 +183,20 @@ parser_predict::parser_predict(int argc, char **argv) {
 
     // parse test data filename
     if (!result.count("test")) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing test file!\n") << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing test file!\n") << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
     input_filename = result["test"].as<decltype(input_filename)>();
 
     // parse model filename
     if (!result.count("model")) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing model file!\n") << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing model file!\n") << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
     model_filename = result["model"].as<decltype(model_filename)>();
diff --git a/src/plssvm/detail/cmd/parser_train.cpp b/src/plssvm/detail/cmd/parser_train.cpp
index 31d5b8719..ef9dc1f21 100644
--- a/src/plssvm/detail/cmd/parser_train.cpp
+++ b/src/plssvm/detail/cmd/parser_train.cpp
@@ -19,9 +19,11 @@
 #include "plssvm/detail/utility.hpp"                               // plssvm::detail::to_underlying
 #include "plssvm/gamma.hpp"                                        // plssvm::get_gamma_string
 #include "plssvm/kernel_function_types.hpp"                        // plssvm::kernel_type_to_math_string
-#include "plssvm/target_platforms.hpp"                             // plssvm::list_available_target_platforms
-#include "plssvm/verbosity_levels.hpp"                             // plssvm::verbosity, plssvm::verbosity_level
-#include "plssvm/version/version.hpp"                              // plssvm::version::detail::get_version_info
+#include "plssvm/mpi/communicator.hpp"                             // plssvm::mpi::communicator
+#include "plssvm/mpi/environment.hpp"                              // plssvm::mpi::is_active, plssvm::mpi::finalize
+#include "plssvm/target_platforms.hpp"                             // plssvm::list_available_target_platforms
+#include "plssvm/verbosity_levels.hpp"                             // plssvm::verbosity, plssvm::verbosity_level
+#include "plssvm/version/version.hpp"                              // plssvm::version::detail::get_version_info
 
 #include "cxxopts.hpp"   // cxxopts::Options, cxxopts::value,cxxopts::ParseResult
 #include "fmt/color.h"   // fmt::fg, fmt::color::red
@@ -29,6 +31,7 @@
 #include "fmt/ranges.h"  // fmt::join
 
 #include <cstdlib>      // std::exit, EXIT_SUCCESS, EXIT_FAILURE
+#include <cstdlib>      // std::atexit
 #include <exception>    // std::exception
 #include <filesystem>   // std::filesystem::path
 #include <iostream>     // std::cout, std::cerr, std::endl
@@ -39,11 +42,18 @@
 
 namespace plssvm::detail::cmd {
 
-parser_train::parser_train(int argc, char **argv) {
+parser_train::parser_train(const mpi::communicator &comm, int argc, char **argv) {
     // check for basic argc and argv correctness
     PLSSVM_ASSERT(argc >= 1, fmt::format("At least one argument is always given (the executable name), but argc is {}!", argc));
     PLSSVM_ASSERT(argv != nullptr, "At least one argument is always given (the executable name), but argv is a nullptr!");
 
+    // register a std::atexit handler since our parser may directly call std::exit
+    std::atexit([]() {
+        if (mpi::is_active()) {
+            mpi::finalize();
+        }
+    });
+
     // create the help message for the kernel function type
     const auto kernel_type_to_help_entry = [](const kernel_function_type kernel) {
         return fmt::format("\t {} -- {}: {}\n", detail::to_underlying(kernel), kernel, kernel_function_type_to_math_string(kernel));
@@ -99,27 +109,35 @@ parser_train::parser_train(int argc, char **argv) {
         options.parse_positional({ "input", "model" });
         result = options.parse(argc, argv);
     } catch (const std::exception &e) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: {}\n", e.what()) << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: {}\n", e.what()) << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
 
     // print help message and exit
     if (result.count("help")) {
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_SUCCESS);
     }
 
     // print version info
     if (result.count("version")) {
-        std::cout << version::detail::get_version_info("plssvm-train") << std::endl;
+        if (comm.is_main_rank()) {
+            std::cout << version::detail::get_version_info("plssvm-train") << std::endl;
+        }
         std::exit(EXIT_SUCCESS);
     }
 
     // check if the number of positional arguments is not too large
     if (!result.unmatched().empty()) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: only up to two positional options may be given, but {} (\"{}\") additional option(s) where provided!\n", result.unmatched().size(), fmt::join(result.unmatched(), " ")) << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: only up to two positional options may be given, but {} (\"{}\") additional option(s) where provided!\n", result.unmatched().size(), fmt::join(result.unmatched(), " ")) << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
 
@@ -138,8 +156,10 @@ parser_train::parser_train(int argc, char **argv) {
         const decltype(csvm_params.gamma) gamma_input = result["gamma"].as<decltype(csvm_params.gamma)>();
         // check if the provided gamma is legal iff a real_type has been provided
         if (std::holds_alternative<real_type>(gamma_input) && std::get<real_type>(gamma_input) <= real_type{ 0.0 }) {
-            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: gamma must be greater than 0.0, but is {}!\n", std::get<real_type>(gamma_input)) << std::endl;
-            std::cout << options.help() << std::endl;
+            if (comm.is_main_rank()) {
+                std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: gamma must be greater than 0.0, but is {}!\n", std::get<real_type>(gamma_input)) << std::endl;
+                std::cout << options.help() << std::endl;
+            }
             std::exit(EXIT_FAILURE);
         }
         // provided gamma was legal -> override default value
@@ -166,8 +186,10 @@ parser_train::parser_train(int argc, char **argv) {
         const auto max_iter_input = result["max_iter"].as<long long int>();
         // check if the provided max_iter is legal
         if (max_iter_input <= decltype(max_iter_input){ 0 }) {
-            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: max_iter must be greater than 0, but is {}!\n", max_iter_input) << std::endl;
-            std::cout << options.help() << std::endl;
+            if (comm.is_main_rank()) {
+                std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: max_iter must be greater than 0, but is {}!\n", max_iter_input) << std::endl;
+                std::cout << options.help() << std::endl;
+            }
             std::exit(EXIT_FAILURE);
         }
         // provided max_iter was legal -> override default value
@@ -200,6 +222,7 @@ parser_train::parser_train(int argc, char **argv) {
         // warn if kernel invocation type is explicitly set but SYCL isn't the current (automatic) backend
         if (!sycl_backend_is_used && sycl_kernel_invocation_type != sycl::kernel_invocation_type::automatic) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set a SYCL kernel invocation type but the current backend isn't SYCL; ignoring --sycl_kernel_invocation_type={}\n",
                                   sycl_kernel_invocation_type);
         }
@@ -210,6 +233,7 @@ parser_train::parser_train(int argc, char **argv) {
         // warn if a SYCL implementation type is explicitly set but SYCL isn't the current (automatic) backend
         if (!sycl_backend_is_used && sycl_implementation_type != sycl::implementation_type::automatic) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set a SYCL implementation type but the current backend isn't SYCL; ignoring --sycl_implementation_type={}\n",
                                   sycl_implementation_type);
         }
@@ -228,6 +252,7 @@ parser_train::parser_train(int argc, char **argv) {
         // warn if the kokkos execution space is explicitly set but Kokkos isn't the current (automatic) backend
         if (!kokkos_backend_is_used && kokkos_execution_space != kokkos::execution_space::automatic) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set a Kokkos execution space but the current backend isn't Kokkos; ignoring --kokkos_execution_space={}\n",
                                   kokkos_execution_space);
         }
@@ -245,6 +270,7 @@ parser_train::parser_train(int argc, char **argv) {
         const verbosity_level verb = result["verbosity"].as<verbosity_level>();
         if (quiet && verb != verbosity_level::quiet) {
             detail::log_untracked(verbosity_level::full | verbosity_level::warning,
+                                  comm,
                                   "WARNING: explicitly set the -q/--quiet flag, but the provided verbosity level isn't \"quiet\"; setting --verbosity={} to --verbosity=quiet\n",
                                   verb);
             verbosity = verbosity_level::quiet;
@@ -257,8 +283,10 @@ parser_train::parser_train(int argc, char **argv) {
 
     // parse input data filename
     if (!result.count("input")) {
-        std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing input file!\n") << std::endl;
-        std::cout << options.help() << std::endl;
+        if (comm.is_main_rank()) {
+            std::cerr << fmt::format(fmt::fg(fmt::color::red), "ERROR: missing input file!\n") << std::endl;
+            std::cout << options.help() << std::endl;
+        }
         std::exit(EXIT_FAILURE);
     }
     input_filename = result["input"].as<decltype(input_filename)>();