Remove NVFUSER_DISTRIBUTED. #2155

Merged: 7 commits, Apr 30, 2024.
Changes from 5 commits.
CMakeLists.txt (8 changes: 0 additions & 8 deletions)

@@ -15,13 +15,6 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")
 option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
 option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)
 
-include(CMakeDependentOption)
-cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
-if (NVFUSER_DISTRIBUTED)
-  add_compile_definitions(NVFUSER_DISTRIBUTED)
-endif()
-message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")
-
 # We try to update which C++ standard we use together in lockstep across all
 # built libraries, and these variables control which that is. Generally we are
 # on C++20, but we still support a version of CUDA (11) that does not recognize

@@ -769,7 +762,6 @@ message(STATUS "******** Nvfuser configuration summary ********")
 message(STATUS "  UCC_FOUND: ${UCC_FOUND}")
 message(STATUS "  NVFUSER_STANDALONE_BUILD_WITH_UCC : ${NVFUSER_STANDALONE_BUILD_WITH_UCC}")
 message(STATUS "  NVFUSER_BUILD_WITH_ASAN : ${NVFUSER_BUILD_WITH_ASAN}")
-message(STATUS "  NVFUSER_DISTRIBUTED : ${NVFUSER_DISTRIBUTED}")
 message(STATUS "  NVFUSER_CPP_STANDARD : ${NVFUSER_CPP_STANDARD}")
 
 if(NVFUSER_STANDALONE_BUILD_WITH_UCC)

csrc/multidevice/c10d_mock.h (134 changes: 0 additions & 134 deletions)

This file was deleted.

csrc/multidevice/communication.cpp (4 changes: 2 additions & 2 deletions)

@@ -6,7 +6,7 @@
  */
 // clang-format on
 #include <multidevice/communication.h>
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
+#if defined(USE_C10D_NCCL)
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
 #endif
 #include <utils.h>

@@ -229,7 +229,7 @@ c10::intrusive_ptr<c10d::Work> Reduce::post(
   c10d::ReduceOptions options = {
       .reduceOp = params_.redOp, .rootRank = root_relative_index_};
   auto team_backend = comm.getBackendForTeam(params_.team, backend);
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
+#if defined(USE_C10D_NCCL)
   auto nccl_backend = dynamic_cast<c10d::ProcessGroupNCCL*>(team_backend.get());
   if (nccl_backend) {
 #if NVF_TORCH_VERSION_NO_LESS(2, 3, 0)

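Note: the net effect of this change is that the NCCL-specific fast path now depends only on whether PyTorch's c10d was built with NCCL. Below is a condensed C++ sketch of the dispatch pattern; the function name and the fallback comment are illustrative, not copied from the source.

#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/distributed/c10d/Backend.hpp>
#if defined(USE_C10D_NCCL)
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#endif

// postToBackend is a hypothetical name; Reduce::post above follows this shape.
void postToBackend(const c10::intrusive_ptr<c10d::Backend>& team_backend) {
#if defined(USE_C10D_NCCL)
  // The dynamic_cast probes the concrete backend type at runtime: it yields
  // nullptr for Gloo/UCC process groups, so they skip the NCCL-only branch.
  if (auto* nccl = dynamic_cast<c10d::ProcessGroupNCCL*>(team_backend.get())) {
    // NCCL-specific handling (e.g. the NVF_TORCH_VERSION_NO_LESS(2, 3, 0)
    // path in the real code) goes here.
    (void)nccl;
  }
#endif
  // Generic c10d path continues here for all backends.
}
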
csrc/multidevice/communication.h (4 changes: 0 additions & 4 deletions)

@@ -9,11 +9,7 @@
 
 #include <multidevice/communicator.h>
 #include <multidevice/multidevice.h>
-#ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Types.hpp>
-#else
-#include <multidevice/c10d_mock.h>
-#endif
 #include <type.h>
 #include <visibility.h>

csrc/multidevice/communicator.cpp (10 changes: 0 additions & 10 deletions)

@@ -10,7 +10,6 @@
 #include <netdb.h>
 #include <map>
 
-#ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
 #ifdef USE_C10D_GLOO
 #include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>

@@ -21,7 +20,6 @@
 #if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
 #include <torch/csrc/distributed/c10d/ProcessGroupUCC.hpp>
 #endif
-#endif
 
 namespace nvfuser {
 

@@ -132,7 +130,6 @@ inline std::string getTeamKey(const Team& team, CommunicatorBackend backend) {
       });
 }
 
-#ifdef NVFUSER_DISTRIBUTED
 // creates and return a process group backend
 c10::intrusive_ptr<c10d::Backend> createBackend(
     CommunicatorBackend backend,

@@ -164,7 +161,6 @@ c10::intrusive_ptr<c10d::Backend> createBackend(
 #endif
   NVF_ERROR(false, "no distributed backend available");
 }
-#endif
 } // namespace
 
 Communicator::Communicator(

@@ -187,7 +183,6 @@ Communicator::Communicator(
     return;
   }
 
-#ifdef NVFUSER_DISTRIBUTED
   c10d::TCPStoreOptions store_opts;
   {
     char hostname[HOST_NAME_MAX]; // NOLINT (modernize-avoid-c-arrays)

@@ -203,7 +198,6 @@
       c10d::TCPStoreOptions::kDefaultPort; // 29500
   store_opts.port = master_port_ ? master_port_ : comm_master_port_default;
   store_ = c10::make_intrusive<c10d::TCPStore>(master_addr_, store_opts);
-#endif
 
 #if defined(USE_C10D_UCC) && defined(NVFUSER_BUILD_WITH_UCC)
   ucc_available_ = true;

@@ -222,7 +216,6 @@ c10::intrusive_ptr<c10d::Backend> Communicator::getBackendForTeam(
   // check if backend associated with the team is present in the cache
   if (backends_.find(team_key) ==
       backends_.end()) { // create the backend and cache it
-#ifdef NVFUSER_DISTRIBUTED
     // check that the caller's rank belongs to the requested team
     auto rank_it = std::find(team.begin(), team.end(), deviceId());
     NVF_ERROR(

@@ -237,9 +230,6 @@
         c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
         team_rank,
         static_cast<int64_t>(team.size()));
-#else
-    backends_[team_key] = c10::make_intrusive<c10d::Backend>();
-#endif
   }
   return backends_.at(team_key);
 }

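Aside: the rendezvous flow that survives this cleanup can be summarized in a short sketch. The host name "localhost", the rank-0-as-server choice, and the team key "0,1,2" are illustrative assumptions; the real code takes the address and port from the environment and derives team keys in getTeamKey.

#include <c10/util/intrusive_ptr.h>
#include <torch/csrc/distributed/c10d/PrefixStore.hpp>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>

// Sketch: every rank connects to one TCPStore; each team then gets its own
// key namespace through a PrefixStore, so per-team backends don't collide.
c10::intrusive_ptr<c10d::Store> makeTeamStore(int rank) {
  c10d::TCPStoreOptions opts;
  opts.isServer = (rank == 0); // assumption: rank 0 hosts the store
  opts.port = c10d::TCPStoreOptions::kDefaultPort; // 29500
  auto store = c10::make_intrusive<c10d::TCPStore>("localhost", opts);
  return c10::make_intrusive<c10d::PrefixStore>("0,1,2", store);
}
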
csrc/multidevice/communicator.h (31 changes: 12 additions & 19 deletions)

@@ -10,32 +10,25 @@
 #include <ATen/core/TensorBody.h>
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
-
-#include <exceptions.h>
-#include <multidevice/multidevice.h>
-#ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Backend.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
-#else
-#include <multidevice/c10d_mock.h>
-#endif
+
+#include <exceptions.h>
+#include <multidevice/multidevice.h>
 #include <visibility.h>
 
 namespace nvfuser {
 
-/*
-This file implements the class Communicator which sets up the inter-process
-Backend. This class contains inter-process information, such as the rank, the
-world size, as well as the Process Group that can be called to perform
-inter-process communications.
-
-Each process is associated with a unique deviceId and device. The actual MPI
-rank remains private to the class and should not be used by the user. The
-communicator class holds privately the mappings ranks <-> device IDs <->
-device.
-
-*/
+// This file implements the class Communicator which sets up the inter-process
+// Backend. This class contains inter-process information, such as the rank, the
+// world size, as well as the Process Group that can be called to perform
+// inter-process communications.
+//
+// Each process is associated with a unique deviceId and device. The actual MPI
+// rank remains private to the class and should not be used by the user. The
+// communicator class holds privately the mappings ranks <-> device IDs <->
+// device.
 
 using RankType = DeviceIdxType;

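Taken together with communicator.cpp above, the class comment implies a usage shape roughly like the sketch below. Everything here beyond getBackendForTeam and deviceId is an assumption: the Team initializer, the nccl enumerator name, and the surrounding function are not taken from the PR.

#include <multidevice/communicator.h>

// Hedged sketch of typical use, not verbatim nvFuser code.
void reduceOnTeam(nvfuser::Communicator& comm) {
  nvfuser::Team team = {0, 1, 2}; // assumed: device indices forming the team
  // getBackendForTeam caches per-team backends: the first call creates the
  // c10d backend (after checking the caller belongs to the team); later
  // calls return the cached one, keyed by getTeamKey(team, backend).
  auto backend =
      comm.getBackendForTeam(team, nvfuser::CommunicatorBackend::nccl);
  // backend->allreduce(...), etc., would follow in real code.
}
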
csrc/multidevice/utils.cpp (8 changes: 0 additions & 8 deletions)

@@ -20,14 +20,6 @@
 
 namespace nvfuser {
 
-NVF_API bool distributedEnabled() {
-#ifdef NVFUSER_DISTRIBUTED
-  return true;
-#else
-  return false;
-#endif
-}
-
 namespace {
 
 std::unordered_set<IterDomain*> getShardedIterDomains(TensorView* tv) {

csrc/multidevice/utils.h (3 changes: 0 additions & 3 deletions)

@@ -14,9 +14,6 @@
 
 namespace nvfuser {
 
-// Returns true iff nvFuser was compiled with distributed APIs enabled.
-NVF_API bool distributedEnabled();
-
 // Returns whether a TensorView has a non-reduction axis parallelized Didx
 // Checks that the other non-reduction axis are not parallelized on Didx
 NVF_API bool isSharded(TensorView*);

setup.py (14 changes: 3 additions & 11 deletions)

@@ -26,9 +26,6 @@
 # --build-with-ucc
 #     Build nvfuser with UCC support. You may need to specify environment variables of UCC_HOME, UCC_DIR, UCX_HOME, UCX_DIR.
 #
-# --build-without-distributed
-#     Build nvfuser without multidevice support
-#
 # --debug
 #     Building nvfuser in debug mode
 #

@@ -74,7 +71,6 @@
 NO_NINJA = False
 BUILD_WITH_UCC = False
 BUILD_WITH_ASAN = False
-BUILD_WITHOUT_DISTRIBUTED = False
 OVERWRITE_VERSION = False
 VERSION_TAG = None
 BUILD_TYPE = "Release"

@@ -106,9 +102,6 @@
     if arg == "--build-with-asan":
         BUILD_WITH_ASAN = True
         continue
-    if arg == "--build-without-distributed":
-        BUILD_WITHOUT_DISTRIBUTED = True
-        continue
     if arg == "--debug":
         BUILD_TYPE = "Debug"
         continue

@@ -306,7 +299,9 @@ def cmake(install_prefix: str = "./nvfuser"):
 
     logger.setLevel(logger_level)
 
-    pytorch_use_distributed = get_pytorch_use_distributed()
+    assert (
+        get_pytorch_use_distributed()
+    ), "nvFuser requires PyTorch to be built with USE_DISTRIBUTED on."
 
     # generate cmake directory
     cmd_str = [

@@ -315,7 +310,6 @@
         "-DCMAKE_BUILD_TYPE=" + BUILD_TYPE,
         f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
         f"-DNVFUSER_CPP_STANDARD={CPP_STANDARD}",
-        f"-DUSE_DISTRIBUTED={pytorch_use_distributed}",
         "-B",
         cmake_build_dir,
     ]

@@ -333,8 +327,6 @@
         cmd_str.append("-DBUILD_NVFUSER_BENCHMARK=ON")
     if BUILD_WITH_ASAN:
         cmd_str.append("-DNVFUSER_BUILD_WITH_ASAN=ON")
-    if BUILD_WITHOUT_DISTRIBUTED:
-        cmd_str.append("-DNVFUSER_DISTRIBUTED=OFF")
     cmd_str.append(".")
 
     print(f"Configuring CMake with {' '.join(cmd_str)}")

tests/cpp/test_multidevice_pipeline.cpp (5 changes: 2 additions & 3 deletions)

@@ -43,10 +43,9 @@
 using namespace torch::jit::fuser::cuda;
 using namespace at::indexing;
 
 // To run the following tests on several devices, pytorch must be installed with
-// the flag USE_DISTRIBUTED=1 and nccl support. With that, nvFuser is built by
-// default with NVFUSER_DISTRIBUTED defined. Then, on a node with at least 6
+// the flag USE_DISTRIBUTED=1 and nccl support. Then, on a node with at least 6
 // GPUs, run the test using mpirun: `mpirun -np 6 build/test_multidevice
-// --gtest_filter=PipelineTwoStages*`.
+// --gtest_filter=PipelineTestTwoStages*`.
 
 TEST_F(PipelineTest, Pipeline) {
   const std::vector<int64_t> input_shape1 = {6, 7};