Remove NVFUSER_DISTRIBUTED. (#2155)
@xwang233 discussed this in the nvFuser-MultiGPU chatroom. At this
moment, supporting a non-distributed build of PyTorch isn't worth the cost
of additional CI and of maintaining knobs like `NVFUSER_DISTRIBUTED` or
`USE_DISTRIBUTED`.

Feel free to revert this PR if a non-distributed build becomes important.
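
With the knob gone, nvFuser unconditionally assumes a PyTorch build with distributed support. As a quick sanity check before building, you can confirm that the installed wheel was compiled with `USE_DISTRIBUTED`; a minimal sketch using the public `torch.distributed` API:

```python
# Minimal sketch: confirm the installed PyTorch has distributed support,
# which nvFuser's multidevice code now requires unconditionally.
import torch
import torch.distributed as dist

# is_available() returns False when PyTorch was compiled without USE_DISTRIBUTED.
assert dist.is_available(), "PyTorch was built without USE_DISTRIBUTED"
print(f"PyTorch {torch.__version__}: distributed support present")
```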
wujingyue authored and zasdfgbnm committed May 2, 2024
1 parent dc8558d commit 046b9ed
Showing 12 changed files with 28 additions and 215 deletions.
8 changes: 0 additions & 8 deletions CMakeLists.txt
@@ -15,13 +15,6 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")
option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)
cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
if (NVFUSER_DISTRIBUTED)
add_compile_definitions(NVFUSER_DISTRIBUTED)
endif()
message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")

# We try to update which C++ standard we use together in lockstep across all
# built libraries, and these variables control which that is. Generally we are
# on C++20, but we still support a version of CUDA (11) that does not recognize
@@ -769,7 +762,6 @@ message(STATUS "******** Nvfuser configuration summary ********")
message(STATUS " UCC_FOUND: ${UCC_FOUND}")
message(STATUS " NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
message(STATUS " NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
message(STATUS " NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
message(STATUS " NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")

if(NVFUSER_STANDALONE_BUILD_WITH_UCC)
134 changes: 0 additions & 134 deletions csrc/multidevice/c10d_mock.h

This file was deleted.

4 changes: 2 additions & 2 deletions csrc/multidevice/communication.cpp
@@ -6,7 +6,7 @@
*/
// clang-format on
#include <multidevice/communication.h>
#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
#if defined(USE_C10D_NCCL)
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#endif
#include <utils.h>
@@ -229,7 +229,7 @@ c10::intrusive_ptr<c10d::Work> Reduce::post(
c10d::ReduceOptions options = {
.reduceOp = params_.redOp, .rootRank = root_relative_index_};
auto team_backend = comm.getBackendForTeam(params_.team, backend);
#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
#if defined(USE_C10D_NCCL)
auto nccl_backend = dynamic_cast<c10d::ProcessGroupNCCL*>(team_backend.get());
if (nccl_backend) {
#if NVF_TORCH_VERSION_NO_LESS(2, 3, 0)
4 changes: 0 additions & 4 deletions csrc/multidevice/communication.h
@@ -9,11 +9,7 @@

#include <multidevice/communicator.h>
#include <multidevice/multidevice.h>
#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/Types.hpp>
#else
#include <multidevice/c10d_mock.h>
#endif
#include <type.h>
#include <visibility.h>

10 changes: 0 additions & 10 deletions csrc/multidevice/communicator.cpp
@@ -10,7 +10,6 @@
#include <netdb.h>
#include <map>

#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
#ifdef USE_C10D_GLOO
#include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
@@ -21,7 +20,6 @@
#if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
#include <torch/csrc/distributed/c10d/ProcessGroupUCC.hpp>
#endif
#endif

namespace nvfuser {

@@ -132,7 +130,6 @@ inline std::string getTeamKey(const Team& team, CommunicatorBackend backend) {
});
}

#ifdef NVFUSER_DISTRIBUTED
// creates and return a process group backend
c10::intrusive_ptr<c10d::Backend> createBackend(
CommunicatorBackend backend,
@@ -164,7 +161,6 @@ c10::intrusive_ptr<c10d::Backend> createBackend(
#endif
NVF_ERROR(false, "no distributed backend available");
}
#endif
} // namespace

Communicator::Communicator(
@@ -187,7 +183,6 @@ Communicator::Communicator(
return;
}

#ifdef NVFUSER_DISTRIBUTED
c10d::TCPStoreOptions store_opts;
{
char hostname[HOST_NAME_MAX]; // NOLINT (modernize-avoid-c-arrays)
@@ -203,7 +198,6 @@
c10d::TCPStoreOptions::kDefaultPort; // 29500
store_opts.port = master_port_ ? master_port_ : comm_master_port_default;
store_ = c10::make_intrusive<c10d::TCPStore>(master_addr_, store_opts);
#endif

#if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
ucc_available_ = true;
@@ -222,7 +216,6 @@ c10::intrusive_ptr<c10d::Backend> Communicator::getBackendForTeam(
// check if backend associated with the team is present in the cache
if (backends_.find(team_key) ==
backends_.end()) { // create the backend and cache it
#ifdef NVFUSER_DISTRIBUTED
// check that the caller's rank belongs to the requested team
auto rank_it = std::find(team.begin(), team.end(), deviceId());
NVF_ERROR(
@@ -237,9 +230,6 @@ c10::intrusive_ptr<c10d::Backend> Communicator::getBackendForTeam(
c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
team_rank,
static_cast<int64_t>(team.size()));
#else
backends_[team_key] = c10::make_intrusive<c10d::Backend>();
#endif
}
return backends_.at(team_key);
}
31 changes: 12 additions & 19 deletions csrc/multidevice/communicator.h
@@ -10,32 +10,25 @@
#include <ATen/core/TensorBody.h>
#include <ATen/core/ivalue.h>
#include <c10/util/intrusive_ptr.h>

#include <exceptions.h>
#include <multidevice/multidevice.h>
#ifdef NVFUSER_DISTRIBUTED
#include <torch/csrc/distributed/c10d/Backend.hpp>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
#include <torch/csrc/distributed/c10d/Work.hpp>
#else
#include <multidevice/c10d_mock.h>
#endif

#include <exceptions.h>
#include <multidevice/multidevice.h>
#include <visibility.h>

namespace nvfuser {

/*
This file implements the class Communicator which sets up the inter-process
Backend. This class contains inter-process information, such as the rank, the
world size, as well as the Process Group that can be called to perform
inter-process communications.
Each process is associated with a unique deviceId and device. The actual MPI
rank remains private to the class and should not be used by the user. The
communicator class holds privately the mappings ranks <-> device IDs <->
device.
*/
// This file implements the class Communicator which sets up the inter-process
// Backend. This class contains inter-process information, such as the rank, the
// world size, as well as the Process Group that can be called to perform
// inter-process communications.
//
// Each process is associated with a unique deviceId and device. The actual MPI
// rank remains private to the class and should not be used by the user. The
// communicator class holds privately the mappings ranks <-> device IDs <->
// device.

using RankType = DeviceIdxType;

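
For context, the Communicator described in the header comment above now always goes through c10d. A rough analogue of its rendezvous pattern (rank 0 hosts a TCPStore, keys are scoped per team with a PrefixStore, and a backend is created on top of the scoped store), sketched with the public torch.distributed Python API and hypothetical host/port values rather than nvFuser's internals:

```python
# Rough analogue (not nvFuser code) of the c10d rendezvous Communicator relies on.
import torch.distributed as dist

def make_team_backend(rank: int, world_size: int, master_addr: str = "127.0.0.1"):
    # Rank 0 hosts the TCPStore; 29500 mirrors c10d's default port noted above.
    store = dist.TCPStore(master_addr, 29500, world_size, is_master=(rank == 0))
    # Scope keys per team, like the PrefixStore in Communicator::getBackendForTeam.
    team_store = dist.PrefixStore("team0", store)
    dist.init_process_group(
        backend="gloo", store=team_store, rank=rank, world_size=world_size
    )
    return dist.group.WORLD
```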
8 changes: 0 additions & 8 deletions csrc/multidevice/utils.cpp
@@ -20,14 +20,6 @@

namespace nvfuser {

NVF_API bool distributedEnabled() {
#ifdef NVFUSER_DISTRIBUTED
return true;
#else
return false;
#endif
}

namespace {

std::unordered_set<IterDomain*> getShardedIterDomains(TensorView* tv) {
3 changes: 0 additions & 3 deletions csrc/multidevice/utils.h
@@ -15,9 +15,6 @@

namespace nvfuser {

// Returns true iff nvFuser was compiled with distributed APIs enabled.
NVF_API bool distributedEnabled();

// Returns whether a TensorView has a non-reduction axis parallelized Didx
// Checks that the other non-reduction axis are not parallelized on Didx
NVF_API bool isSharded(TensorView*);
15 changes: 4 additions & 11 deletions setup.py
@@ -26,9 +26,6 @@
# --build-with-ucc
# Build nvfuser with UCC support. You may need to specify environment variables of UCC_HOME, UCC_DIR, UCX_HOME, UCX_DIR.
#
# --build-without-distributed
# Build nvfuser without multidevice support
#
# --debug
# Building nvfuser in debug mode
#
@@ -74,7 +71,6 @@
NO_NINJA = False
BUILD_WITH_UCC = False
BUILD_WITH_ASAN = False
BUILD_WITHOUT_DISTRIBUTED = False
OVERWRITE_VERSION = False
VERSION_TAG = None
BUILD_TYPE = "Release"
@@ -106,9 +102,6 @@
if arg == "--build-with-asan":
BUILD_WITH_ASAN = True
continue
if arg == "--build-without-distributed":
BUILD_WITHOUT_DISTRIBUTED = True
continue
if arg == "--debug":
BUILD_TYPE = "Debug"
continue
@@ -306,7 +299,10 @@ def cmake(install_prefix: str = "./nvfuser"):

logger.setLevel(logger_level)

pytorch_use_distributed = get_pytorch_use_distributed()
if not get_pytorch_use_distributed():
raise RuntimeError(
"nvFuser requires PyTorch to be built with USE_DISTRIBUTED on."
)

# generate cmake directory
cmd_str = [
@@ -315,7 +311,6 @@ def cmake(install_prefix: str = "./nvfuser"):
"-DCMAKE_BUILD_TYPE=" + BUILD_TYPE,
f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
f"-DNVFUSER_CPP_STANDARD={CPP_STANDARD}",
f"-DUSE_DISTRIBUTED={pytorch_use_distributed}",
"-B",
cmake_build_dir,
]
@@ -333,8 +328,6 @@ def cmake(install_prefix: str = "./nvfuser"):
cmd_str.append("-DBUILD_NVFUSER_BENCHMARK=ON")
if BUILD_WITH_ASAN:
cmd_str.append("-DNVFUSER_BUILD_WITH_ASAN=ON")
if BUILD_WITHOUT_DISTRIBUTED:
cmd_str.append("-DNVFUSER_DISTRIBUTED=OFF")
cmd_str.append(".")

print(f"Configuring CMake with {' '.join(cmd_str)}")
3 changes: 1 addition & 2 deletions tests/cpp/test_multidevice_pipeline.cpp
@@ -43,8 +43,7 @@ using namespace torch::jit::fuser::cuda;
using namespace at::indexing;

// To run the following tests on several devices, pytorch must be installed with
// the flag USE_DISTRIBUTED=1 and nccl support. With that, nvFuser is built by
// default with NVFUSER_DISTRIBUTED defined. Then, on a node with at least 6
// the flag USE_DISTRIBUTED=1 and nccl support. Then, on a node with at least 6
// GPUs, run the test using mpirun: `mpirun -np 6 build/test_multidevice
// --gtest_filter=PipelineTestTwoStages*`.

3 changes: 0 additions & 3 deletions tests/cpp/test_resharding.cpp
@@ -320,9 +320,6 @@ TEST_F(ReshardingTest, InsertShardedAxisReordering) {
}

TEST_P(ReshardingTest, Insert) {
if (!distributedEnabled()) { // Test only works with distributed
GTEST_SKIP() << "Requires distributed API";
}
auto
[mesh0,
mesh1,