From 5a2d420d3a5994e30dcb953167c901df86cbcc1b Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Fri, 26 Apr 2024 23:45:51 +0000 Subject: [PATCH 1/6] Revert "NVFUSER_DISTRIBUTED instead of USE_DISTRIBUTED (#1711)" This reverts commit 9ae6c767212a12e7b28be5627744afd5965296d6. --- CMakeLists.txt | 8 -------- csrc/multidevice/communication.cpp | 4 ++-- csrc/multidevice/communication.h | 2 +- csrc/multidevice/communicator.cpp | 8 ++++---- csrc/multidevice/communicator.h | 2 +- csrc/multidevice/utils.cpp | 2 +- setup.py | 17 +---------------- tools/gen_nvfuser_version.py | 16 ---------------- 8 files changed, 10 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ede7cb1219..001c3ad0950 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,13 +15,6 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party") option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF) option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF) -include(CMakeDependentOption) -cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF) -if (NVFUSER_DISTRIBUTED) - add_compile_definitions(NVFUSER_DISTRIBUTED) -endif() -message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}") - # We try to update which C++ standard we use together in lockstep across all # built libraries, and these variables control which that is. Generally we are # on C++20, but we still support a version of CUDA (11) that does not recognize @@ -769,7 +762,6 @@ message(STATUS "******** Nvfuser configuration summary ********") message(STATUS " UCC_FOUND: ${UCC_FOUND}") message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}") message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}") -message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}") message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}") if(NVFUSER_STANDALONE_BUILD_WITH_UCC) diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 4f4db1711ab..0882e9af335 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -6,7 +6,7 @@ */ // clang-format on #include -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL) +#if defined(USE_DISTRIBUTED) && defined(USE_C10D_NCCL) #include #endif #include @@ -229,7 +229,7 @@ c10::intrusive_ptr Reduce::post( c10d::ReduceOptions options = { .reduceOp = params_.redOp, .rootRank = root_relative_index_}; auto team_backend = comm.getBackendForTeam(params_.team, backend); -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL) +#if defined(USE_DISTRIBUTED) && defined(USE_C10D_NCCL) auto nccl_backend = dynamic_cast(team_backend.get()); if (nccl_backend) { #if NVF_TORCH_VERSION_NO_LESS(2, 3, 0) diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index 5c2e59f887d..77f7a4b6de4 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -9,7 +9,7 @@ #include #include -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED #include #else #include diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index b113edd61d6..67766ce85d4 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -10,7 +10,7 @@ #include #include -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED #include #ifdef USE_C10D_GLOO #include @@ -132,7 +132,7 @@ inline std::string getTeamKey(const Team& team, CommunicatorBackend backend) { }); } -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED // creates 
and return a process group backend c10::intrusive_ptr createBackend( CommunicatorBackend backend, @@ -187,7 +187,7 @@ Communicator::Communicator( return; } -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED c10d::TCPStoreOptions store_opts; { char hostname[HOST_NAME_MAX]; // NOLINT (modernize-avoid-c-arrays) @@ -222,7 +222,7 @@ c10::intrusive_ptr Communicator::getBackendForTeam( // check if backend associated with the team is present in the cache if (backends_.find(team_key) == backends_.end()) { // create the backend and cache it -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED // check that the caller's rank belongs to the requested team auto rank_it = std::find(team.begin(), team.end(), deviceId()); NVF_ERROR( diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 9666ec10cb9..3a8fc465a01 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -13,7 +13,7 @@ #include #include -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED #include #include #include diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index 037862e41e0..12a0f4b9a7c 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -21,7 +21,7 @@ namespace nvfuser { NVF_API bool distributedEnabled() { -#ifdef NVFUSER_DISTRIBUTED +#ifdef USE_DISTRIBUTED return true; #else return false; diff --git a/setup.py b/setup.py index 5f5e47a8e03..3521035db1a 100644 --- a/setup.py +++ b/setup.py @@ -26,9 +26,6 @@ # --build-with-ucc # Build nvfuser with UCC support. You may need to specify environment variables of UCC_HOME, UCC_DIR, UCX_HOME, UCX_DIR. # -# --build-without-distributed -# Build nvfuser without multidevice support -# # --debug # Building nvfuser in debug mode # @@ -74,7 +71,6 @@ NO_NINJA = False BUILD_WITH_UCC = False BUILD_WITH_ASAN = False -BUILD_WITHOUT_DISTRIBUTED = False OVERWRITE_VERSION = False VERSION_TAG = None BUILD_TYPE = "Release" @@ -106,9 +102,6 @@ if arg == "--build-with-asan": BUILD_WITH_ASAN = True continue - if arg == "--build-without-distributed": - BUILD_WITHOUT_DISTRIBUTED = True - continue if arg == "--debug": BUILD_TYPE = "Debug" continue @@ -289,10 +282,7 @@ def cmake(install_prefix: str = "./nvfuser"): if not os.path.exists(cmake_build_dir): os.makedirs(cmake_build_dir) - from tools.gen_nvfuser_version import ( - get_pytorch_cmake_prefix, - get_pytorch_use_distributed, - ) + from tools.gen_nvfuser_version import get_pytorch_cmake_prefix # this is used to suppress import error. 
# so we can get the right pytorch prefix for cmake @@ -306,8 +296,6 @@ def cmake(install_prefix: str = "./nvfuser"): logger.setLevel(logger_level) - pytorch_use_distributed = get_pytorch_use_distributed() - # generate cmake directory cmd_str = [ get_cmake_bin(), @@ -315,7 +303,6 @@ def cmake(install_prefix: str = "./nvfuser"): "-DCMAKE_BUILD_TYPE=" + BUILD_TYPE, f"-DCMAKE_INSTALL_PREFIX={install_prefix}", f"-DNVFUSER_CPP_STANDARD={CPP_STANDARD}", - f"-DUSE_DISTRIBUTED={pytorch_use_distributed}", "-B", cmake_build_dir, ] @@ -333,8 +320,6 @@ def cmake(install_prefix: str = "./nvfuser"): cmd_str.append("-DBUILD_NVFUSER_BENCHMARK=ON") if BUILD_WITH_ASAN: cmd_str.append("-DNVFUSER_BUILD_WITH_ASAN=ON") - if BUILD_WITHOUT_DISTRIBUTED: - cmd_str.append("-DNVFUSER_DISTRIBUTED=OFF") cmd_str.append(".") print(f"Configuring CMake with {' '.join(cmd_str)}") diff --git a/tools/gen_nvfuser_version.py b/tools/gen_nvfuser_version.py index 789aa96d37a..7537ff3ad4a 100644 --- a/tools/gen_nvfuser_version.py +++ b/tools/gen_nvfuser_version.py @@ -45,22 +45,6 @@ def get_pytorch_cmake_prefix(): return stdout_msg.decode("utf-8").rstrip("\n") -def get_pytorch_use_distributed(): - from subprocess import Popen, PIPE - - # need to do this in a separate process so we are not going to delete nvfuser library while it's loaded by torch - process_torch_prefix = Popen( - [ - sys.executable, - "-c", - "import torch; print(torch._C._has_distributed())", - ], - stdout=PIPE, - ) - stdout_msg, error_msg = process_torch_prefix.communicate() - return stdout_msg.decode("utf-8").rstrip("\n") - - if __name__ == "__main__": version_file = nvfuser_root / "nvfuser" / "version.py" with open(version_file, "w") as f: From a5dd89b7f12fe168b1bc9ac0ed0c30381f98cebc Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sat, 27 Apr 2024 02:04:43 +0000 Subject: [PATCH 2/6] Remove USE_DISTRIBUTED. --- csrc/multidevice/c10d_mock.h | 134 ------------------------ csrc/multidevice/communication.cpp | 4 +- csrc/multidevice/communication.h | 4 - csrc/multidevice/communicator.cpp | 10 -- csrc/multidevice/communicator.h | 31 +++--- csrc/multidevice/utils.cpp | 8 -- csrc/multidevice/utils.h | 3 - tests/cpp/test_multidevice_pipeline.cpp | 5 +- tests/cpp/test_resharding.cpp | 3 - 9 files changed, 16 insertions(+), 186 deletions(-) delete mode 100644 csrc/multidevice/c10d_mock.h diff --git a/csrc/multidevice/c10d_mock.h b/csrc/multidevice/c10d_mock.h deleted file mode 100644 index 58572507216..00000000000 --- a/csrc/multidevice/c10d_mock.h +++ /dev/null @@ -1,134 +0,0 @@ -// clang-format off -/* - * SPDX-FileCopyrightText: Copyright (c) 2024-present NVIDIA CORPORATION & AFFILIATES. - * All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - */ -// clang-format on -#pragma once - -#include -#include -#include - -namespace c10d { -class Work : public torch::CustomClassHolder { - public: - void wait() {} -}; - -struct ReduceOp : torch::CustomClassHolder { - enum RedOpType { - SUM, - AVG, - PRODUCT, - MIN, - MAX, - BAND, - BOR, - BXOR, - UNUSED, - }; - - ReduceOp() = default; - ReduceOp(RedOpType op) : op_(op) {} - - RedOpType op_ = UNUSED; -}; - -struct ReduceScatterOptions { - ReduceOp reduceOp = ReduceOp::UNUSED; -}; - -struct ScatterOptions { - int64_t rootRank = 0; -}; - -struct AllgatherOptions {}; - -struct GatherOptions { - int64_t rootRank = 0; -}; - -struct BroadcastOptions { - int64_t rootRank = 0; -}; - -struct AllreduceOptions { - ReduceOp reduceOp = ReduceOp::UNUSED; -}; - -struct ReduceOptions { - ReduceOp reduceOp = ReduceOp::UNUSED; - int64_t rootRank = 0; -}; - -class Backend : public torch::CustomClassHolder { - public: - c10::intrusive_ptr barrier() { - return c10::make_intrusive(); - } - - c10::intrusive_ptr send( - std::vector& tensors, - int dstRank, - int tag) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr recv( - std::vector& tensors, - int srcRank, - int tag) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& opts = AllgatherOptions()) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr gather( - std::vector>& outputTensors, - std::vector& inputTensors, - const GatherOptions& opts = GatherOptions()) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr reduce_scatter( - std::vector& outputTensors, - std::vector>& inputTensors, - const ReduceScatterOptions& opts = ReduceScatterOptions()) { - return c10::make_intrusive(); - } - c10::intrusive_ptr scatter( - std::vector& outputTensors, - std::vector>& inputTensors, - const ScatterOptions& opts = ScatterOptions()) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr broadcast( - std::vector& tensors, - const BroadcastOptions& opts = BroadcastOptions()) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr allreduce( - std::vector& tensors, - const AllreduceOptions& opts = AllreduceOptions()) { - return c10::make_intrusive(); - } - - c10::intrusive_ptr reduce( - std::vector& tensors, - const ReduceOptions& opts = ReduceOptions()) { - return c10::make_intrusive(); - } -}; - -class TCPStore : public torch::CustomClassHolder {}; - -} // namespace c10d diff --git a/csrc/multidevice/communication.cpp b/csrc/multidevice/communication.cpp index 0882e9af335..d512cd4dcb3 100644 --- a/csrc/multidevice/communication.cpp +++ b/csrc/multidevice/communication.cpp @@ -6,7 +6,7 @@ */ // clang-format on #include -#if defined(USE_DISTRIBUTED) && defined(USE_C10D_NCCL) +#if defined(USE_C10D_NCCL) #include #endif #include @@ -229,7 +229,7 @@ c10::intrusive_ptr Reduce::post( c10d::ReduceOptions options = { .reduceOp = params_.redOp, .rootRank = root_relative_index_}; auto team_backend = comm.getBackendForTeam(params_.team, backend); -#if defined(USE_DISTRIBUTED) && defined(USE_C10D_NCCL) +#if defined(USE_C10D_NCCL) auto nccl_backend = dynamic_cast(team_backend.get()); if (nccl_backend) { #if NVF_TORCH_VERSION_NO_LESS(2, 3, 0) diff --git a/csrc/multidevice/communication.h b/csrc/multidevice/communication.h index 77f7a4b6de4..762a35b97b0 100644 --- a/csrc/multidevice/communication.h +++ b/csrc/multidevice/communication.h @@ -9,11 +9,7 @@ #include #include -#ifdef USE_DISTRIBUTED #include 
-#else -#include -#endif #include #include diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 67766ce85d4..e0162c8bc8e 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -10,7 +10,6 @@ #include #include -#ifdef USE_DISTRIBUTED #include #ifdef USE_C10D_GLOO #include @@ -21,7 +20,6 @@ #if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC) #include #endif -#endif namespace nvfuser { @@ -132,7 +130,6 @@ inline std::string getTeamKey(const Team& team, CommunicatorBackend backend) { }); } -#ifdef USE_DISTRIBUTED // creates and return a process group backend c10::intrusive_ptr createBackend( CommunicatorBackend backend, @@ -164,7 +161,6 @@ c10::intrusive_ptr createBackend( #endif NVF_ERROR(false, "no distributed backend available"); } -#endif } // namespace Communicator::Communicator( @@ -187,7 +183,6 @@ Communicator::Communicator( return; } -#ifdef USE_DISTRIBUTED c10d::TCPStoreOptions store_opts; { char hostname[HOST_NAME_MAX]; // NOLINT (modernize-avoid-c-arrays) @@ -203,7 +198,6 @@ Communicator::Communicator( c10d::TCPStoreOptions::kDefaultPort; // 29500 store_opts.port = master_port_ ? master_port_ : comm_master_port_default; store_ = c10::make_intrusive(master_addr_, store_opts); -#endif #if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC) ucc_available_ = true; @@ -222,7 +216,6 @@ c10::intrusive_ptr Communicator::getBackendForTeam( // check if backend associated with the team is present in the cache if (backends_.find(team_key) == backends_.end()) { // create the backend and cache it -#ifdef USE_DISTRIBUTED // check that the caller's rank belongs to the requested team auto rank_it = std::find(team.begin(), team.end(), deviceId()); NVF_ERROR( @@ -237,9 +230,6 @@ c10::intrusive_ptr Communicator::getBackendForTeam( c10::make_intrusive(team_key, store_), team_rank, static_cast(team.size())); -#else - backends_[team_key] = c10::make_intrusive(); -#endif } return backends_.at(team_key); } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 3a8fc465a01..33969f25129 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -10,32 +10,25 @@ #include #include #include - -#include -#include -#ifdef USE_DISTRIBUTED #include #include #include -#else -#include -#endif + +#include +#include #include namespace nvfuser { -/* - This file implements the class Communicator which sets up the inter-process - Backend. This class contains inter-process information, such as the rank, the - world size, as well as the Process Group that can be called to perform - inter-process communications. - - Each process is associated with a unique deviceId and device. The actual MPI - rank remains private to the class and should not be used by the user. The - communicator class holds privately the mappings ranks <-> device IDs <-> - device. - -*/ +// This file implements the class Communicator which sets up the inter-process +// Backend. This class contains inter-process information, such as the rank, the +// world size, as well as the Process Group that can be called to perform +// inter-process communications. +// +// Each process is associated with a unique deviceId and device. The actual MPI +// rank remains private to the class and should not be used by the user. The +// communicator class holds privately the mappings ranks <-> device IDs <-> +// device. 
using RankType = DeviceIdxType; diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index 12a0f4b9a7c..df315b2045f 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -20,14 +20,6 @@ namespace nvfuser { -NVF_API bool distributedEnabled() { -#ifdef USE_DISTRIBUTED - return true; -#else - return false; -#endif -} - namespace { std::unordered_set getShardedIterDomains(TensorView* tv) { diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 7926c1873df..8f0e1a70e43 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -14,9 +14,6 @@ namespace nvfuser { -// Returns true iff nvFuser was compiled with distributed APIs enabled. -NVF_API bool distributedEnabled(); - // Returns whether a TensorView has a non-reduction axis parallelized Didx // Checks that the other non-reduction axis are not parallelized on Didx NVF_API bool isSharded(TensorView*); diff --git a/tests/cpp/test_multidevice_pipeline.cpp b/tests/cpp/test_multidevice_pipeline.cpp index 27d857bf6dc..85f7f74c257 100644 --- a/tests/cpp/test_multidevice_pipeline.cpp +++ b/tests/cpp/test_multidevice_pipeline.cpp @@ -43,10 +43,9 @@ using namespace torch::jit::fuser::cuda; using namespace at::indexing; // To run the following tests on several devices, pytorch must be installed with -// the flag USE_DISTRIBUTED=1 and nccl support. With that, nvFuser is built by -// default with NVFUSER_DISTRIBUTED defined. Then, on a node with at least 6 +// the flag USE_DISTRIBUTED=1 and nccl support. Then, on a node with at least 6 // GPUs, run the test using mpirun: `mpirun -np 6 build/test_multidevice -// --gtest_filter=PipelineTwoStages*`. +// --gtest_filter=PipelineTestTwoStages*`. TEST_F(PipelineTest, Pipeline) { const std::vector input_shape1 = {6, 7}; diff --git a/tests/cpp/test_resharding.cpp b/tests/cpp/test_resharding.cpp index 0ef340493b1..5a6f7422ba7 100644 --- a/tests/cpp/test_resharding.cpp +++ b/tests/cpp/test_resharding.cpp @@ -225,9 +225,6 @@ TEST_F(ReshardingTest, Detection) { } TEST_P(ReshardingTest, Insert) { - if (!distributedEnabled()) { // Test only works with distributed - GTEST_SKIP() << "Requires distributed API"; - } auto [mesh0, mesh1, From 048b30a6a3813831dbff1286d143b828520bdf68 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 29 Apr 2024 17:42:39 +0000 Subject: [PATCH 3/6] Require USE_DISTRIBUTED to be on. --- setup.py | 7 ++++++- tools/gen_nvfuser_version.py | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3521035db1a..ff4198d6924 100644 --- a/setup.py +++ b/setup.py @@ -282,7 +282,10 @@ def cmake(install_prefix: str = "./nvfuser"): if not os.path.exists(cmake_build_dir): os.makedirs(cmake_build_dir) - from tools.gen_nvfuser_version import get_pytorch_cmake_prefix + from tools.gen_nvfuser_version import ( + get_pytorch_cmake_prefix, + get_pytorch_use_distributed, + ) # this is used to suppress import error. # so we can get the right pytorch prefix for cmake @@ -296,6 +299,8 @@ def cmake(install_prefix: str = "./nvfuser"): logger.setLevel(logger_level) + assert get_pytorch_use_distributed(), "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on." 
+ # generate cmake directory cmd_str = [ get_cmake_bin(), diff --git a/tools/gen_nvfuser_version.py b/tools/gen_nvfuser_version.py index 7537ff3ad4a..b115e4cb51a 100644 --- a/tools/gen_nvfuser_version.py +++ b/tools/gen_nvfuser_version.py @@ -45,6 +45,23 @@ def get_pytorch_cmake_prefix(): return stdout_msg.decode("utf-8").rstrip("\n") +def get_pytorch_use_distributed() -> bool: + from subprocess import Popen, PIPE + + # need to do this in a separate process so we are not going to delete nvfuser library while it's loaded by torch + process_torch_prefix = Popen( + [ + sys.executable, + "-c", + "import torch; print(torch._C._has_distributed())", + ], + stdout=PIPE, + ) + stdout_msg, _ = process_torch_prefix.communicate() + stdout_msg = stdout_msg.decode("utf-8").rstrip("\n") + return bool(stdout_msg) + + if __name__ == "__main__": version_file = nvfuser_root / "nvfuser" / "version.py" with open(version_file, "w") as f: From 3d4f96000f754081e474160c53d405cac93bdb36 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 29 Apr 2024 17:44:26 +0000 Subject: [PATCH 4/6] lintrunner. --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ff4198d6924..e430c85ee66 100644 --- a/setup.py +++ b/setup.py @@ -299,7 +299,9 @@ def cmake(install_prefix: str = "./nvfuser"): logger.setLevel(logger_level) - assert get_pytorch_use_distributed(), "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on." + assert ( + get_pytorch_use_distributed() + ), "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on." # generate cmake directory cmd_str = [ From ef0d78c3ef4f2fe2b59a389aae5ba2fe3b57a5c2 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 29 Apr 2024 13:49:04 -0700 Subject: [PATCH 5/6] Fix literal evaluation. --- tools/gen_nvfuser_version.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/gen_nvfuser_version.py b/tools/gen_nvfuser_version.py index b115e4cb51a..c4072939d46 100644 --- a/tools/gen_nvfuser_version.py +++ b/tools/gen_nvfuser_version.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause +import ast import subprocess import sys from pathlib import Path @@ -30,36 +31,32 @@ def get_version() -> str: def get_pytorch_cmake_prefix(): - from subprocess import Popen, PIPE - # need to do this in a separate process so we are not going to delete nvfuser library while it's loaded by torch - process_torch_prefix = Popen( + process_torch_prefix = subprocess.Popen( [ sys.executable, "-c", "import torch.utils; print(torch.utils.cmake_prefix_path)", ], - stdout=PIPE, + stdout=subprocess.PIPE, ) stdout_msg, error_msg = process_torch_prefix.communicate() return stdout_msg.decode("utf-8").rstrip("\n") def get_pytorch_use_distributed() -> bool: - from subprocess import Popen, PIPE - # need to do this in a separate process so we are not going to delete nvfuser library while it's loaded by torch - process_torch_prefix = Popen( + process_torch_prefix = subprocess.Popen( [ sys.executable, "-c", "import torch; print(torch._C._has_distributed())", ], - stdout=PIPE, + stdout=subprocess.PIPE, ) stdout_msg, _ = process_torch_prefix.communicate() stdout_msg = stdout_msg.decode("utf-8").rstrip("\n") - return bool(stdout_msg) + return ast.literal_eval(stdout_msg) if __name__ == "__main__": From 68704b79e8c87f36dc1a36a42c81f58d6320aeec Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 29 Apr 2024 14:47:25 -0700 Subject: [PATCH 6/6] RuntimeError instead of assert. --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e430c85ee66..612914e5cc6 100644 --- a/setup.py +++ b/setup.py @@ -299,9 +299,10 @@ def cmake(install_prefix: str = "./nvfuser"): logger.setLevel(logger_level) - assert ( - get_pytorch_use_distributed() - ), "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on." + if not get_pytorch_use_distributed(): + raise RuntimeError( + "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on." + ) # generate cmake directory cmd_str = [
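
Note on PATCH 5 and PATCH 6 (illustrative, not part of the patches themselves): the
helper spawns a child Python process that prints the result of
torch._C._has_distributed(), so the parent reads it back as text -- "True" or
"False" for a boolean result. Calling bool() on that text returns True in both
cases, since any non-empty string is truthy, which is the bug PATCH 5 fixes by
parsing the text with ast.literal_eval() instead. PATCH 6 then swaps the assert
for an explicit RuntimeError, so the check still fires when Python runs with -O,
which strips assert statements. A minimal standalone sketch of the difference
(plain Python, assuming the captured output is the string "False"):

    import ast

    captured = "False"                  # what the parent would read from the child's stdout
    print(bool(captured))               # True  -- non-empty strings are always truthy
    print(ast.literal_eval(captured))   # False -- parsed back into the Python literal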