diff --git a/3rdparty/mscclpp/include/common.h b/3rdparty/mscclpp/include/common.h deleted file mode 100644 index ccde5a3ef4..0000000000 --- a/3rdparty/mscclpp/include/common.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef MSCCL_COMMON_HPP_ -#define MSCCL_COMMON_HPP_ - -#if defined(__HIP_PLATFORM_AMD__) -#define WARP_SIZE 64 -#define __syncwarp() __builtin_amdgcn_wave_barrier() -#else -#define WARP_SIZE 32 -#endif - -constexpr int NRANKS_PER_NODE = 8; -constexpr int SCRATCH_SIZE = 1024 * 1024 * 70; // 35 thread-blocks * 8 ranks * 256KB = 70MB - -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - return a + b; -} - -template <> -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template -__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -__forceinline__ __device__ int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template -__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -#endif // MSCCL_COMMON_HPP_ diff --git a/3rdparty/mscclpp/include/msccl.cuh b/3rdparty/mscclpp/include/msccl.cuh deleted file mode 100644 index 93612126dc..0000000000 --- a/3rdparty/mscclpp/include/msccl.cuh +++ /dev/null @@ -1,323 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include -#include -#include -#include -#include -#include - -#include "common.h" -#include "msccl.h" - -#define MSCCL_API extern "C" __attribute__((visibility("default"))) - -#define CUDACHECK(cmd) \ - do { \ - cudaError_t e = cmd; \ - if (e != cudaSuccess) { \ - printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define NUM_CHANNELS_PER_CONNECTION 64 - -struct channelKey { - const void* sendbuff; - const void* recvbuff; - size_t bytes; - bool operator==(const channelKey& other) const { - return sendbuff == other.sendbuff && recvbuff == other.recvbuff && bytes == other.bytes; - } -}; - -namespace std { -template <> -struct hash { - std::size_t operator()(const channelKey& k) const { - return std::hash()(k.sendbuff) ^ std::hash()(k.recvbuff) ^ std::hash()(k.bytes); - } -}; -} // namespace std - -struct ChannelInfo { - std::vector smChannels; - std::vector smOutChannels; - std::shared_ptr> smChannelDeviceHandles; - std::shared_ptr> smOutChannelDeviceHandles; -}; - -struct mscclComm { - std::shared_ptr comm; - std::vector> connections; - std::vector> smSemaphores; - - std::unordered_map channelInfos; - std::shared_ptr scratchBuff; - std::vector remoteScratchRegMemories; -}; - -static size_t mscclTypeSize(mscclDataType_t type) { - switch (type) { - case mscclInt8: - case mscclUint8: - return 1; - case mscclFloat16: - return 2; - case mscclInt32: - case mscclUint32: - return 4; - case mscclInt64: - case mscclUint64: - return 8; - case mscclFloat32: - return 4; - case mscclFloat64: - return 8; -#if defined(__CUDA_BF16_TYPES_EXIST__) - case mscclBfloat16: - return 2; -#endif // defined(__CUDA_BF16_TYPES_EXIST__) -#if defined(__CUDA_FP8_TYPES_EXIST__) - case mscclFp8E4M3: - case mscclFp8E5M2: - return 1; -#endif // defined(__CUDA_FP8_TYPES_EXIST__) - case mscclNumTypes: - return 0; - } - return 0; -} - -static mscclpp::Transport getTransport(int, int) { return mscclpp::Transport::CudaIpc; } - -static std::vector setupRemoteMemories(std::shared_ptr comm, int rank, - void* buff, size_t bytes, - mscclpp::TransportFlags transport) { - std::vector remoteMemories; - mscclpp::RegisteredMemory memory = comm->registerMemory(buff, bytes, transport); - std::vector> remoteRegMemoryFutures; - for (int i = 0; i < comm->bootstrap()->getNranks(); i++) { - if (i == rank) continue; - remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(i, 0)); - comm->sendMemoryOnSetup(memory, i, 0); - } - comm->setup(); - std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(), std::back_inserter(remoteMemories), - [](const auto& future) { return future.get(); }); - return remoteMemories; -} - -static std::vector setupSmChannels(mscclComm_t comm, - const std::vector& remoteMemories, - void* src) { - std::vector channels; - std::vector>& smSemaphores = comm->smSemaphores; - size_t nConnections = comm->connections.size(); - for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { - for (size_t cid = 0; cid < nConnections; ++cid) { - if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); - } - } - } - return channels; -} - -static std::shared_ptr> setupSmChannelDeviceHandles( - const std::vector& smChannels) { - std::vector> smChannelDeviceHandles; - std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::allocSharedCuda>(smChannelDeviceHandles.size()); - mscclpp::memcpyCuda>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), cudaMemcpyHostToDevice); - return ptr; -} - -MSCCL_API mscclResult_t mscclGetVersion(int* version) { - if (version == nullptr) return mscclInvalidArgument; - *version = MSCCLPP_VERSION; - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId) { - if (uniqueId == nullptr) return mscclInvalidArgument; - if (MSCCLPP_UNIQUE_ID_BYTES != MSCCL_UNIQUE_ID_BYTES) return mscclInternalError; - mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId(); - memcpy(uniqueId, &id, sizeof(mscclUniqueId)); - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommInitRankConfig(mscclComm_t*, int, mscclUniqueId, int, - mscclConfig_t*) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank) { - if (comm == nullptr) return mscclInvalidArgument; - if (nranks < 0 || rank < 0 || rank >= nranks) return mscclInvalidArgument; - std::shared_ptr bootstrap = std::make_shared(rank, nranks); - mscclpp::UniqueId id; - memcpy(id.data(), &commId, sizeof(mscclUniqueId)); - bootstrap->initialize(id); - std::shared_ptr mscclppComm = std::make_shared(bootstrap); - std::vector>> connectionFutures; - - for (int i = 0; i < mscclppComm->bootstrap()->getNranks(); i++) { - if (i == rank) continue; - mscclpp::Transport transport = getTransport(rank, i); - connectionFutures.push_back(mscclppComm->connectOnSetup(i, 0, transport)); - } - mscclppComm->setup(); - - std::vector> connections; - std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections), - [](const auto& future) { return future.get(); }); - - std::vector> smSemaphores; - for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { - for (size_t cid = 0; cid < connections.size(); ++cid) { - if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace_back( - std::make_shared(*(mscclppComm), connections[cid])); - } - } - } - mscclppComm->setup(); - - mscclComm* commPtr = new mscclComm(); - commPtr->comm = mscclppComm; - commPtr->connections = std::move(connections); - commPtr->smSemaphores = std::move(smSemaphores); - commPtr->scratchBuff = mscclpp::allocExtSharedCuda(SCRATCH_SIZE); - commPtr->remoteScratchRegMemories = - setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc); - - *comm = commPtr; - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommInitAll(mscclComm_t*, int, const int*) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclCommFinalize(mscclComm_t comm) { - comm->comm->bootstrap()->barrier(); - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommDestroy(mscclComm_t comm) { - if (comm == nullptr) return mscclInvalidArgument; - delete comm; - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommAbort(mscclComm_t) { return mscclSuccess; } - -MSCCL_API mscclResult_t mscclCommSplit(mscclComm_t, int, int, mscclComm_t*, mscclConfig_t*) { - return mscclInternalError; -} - -MSCCL_API const char* mscclGetErrorString(mscclResult_t result) { - switch (result) { - case mscclSuccess: - return "no error"; - case mscclUnhandledCudaError: - return "unhandled cuda error (run with MSCCL_DEBUG=INFO for details)"; - case mscclSystemError: - return "unhandled system error (run with MSCCL_DEBUG=INFO for details)"; - case mscclInternalError: - return "internal error - please report this issue to the MSCCL developers"; - case mscclInvalidArgument: - return "invalid argument (run with MSCCL_DEBUG=WARN for details)"; - case mscclInvalidUsage: - return "invalid usage (run with MSCCL_DEBUG=WARN for details)"; - case mscclRemoteError: - return "remote process exited or there was a network error"; - case mscclInProgress: - return "MSCCL operation in progress"; - default: - return "unknown result code"; - } -} - -MSCCL_API const char* mscclGetLastError(mscclComm_t) { return nullptr; } - -MSCCL_API mscclResult_t mscclCommGetAsyncError(mscclComm_t, mscclResult_t* asyncError) { - if (asyncError == nullptr) return mscclInvalidArgument; - *asyncError = mscclSuccess; - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommCount(const mscclComm_t comm, int* count) { - if (comm == nullptr || count == nullptr) return mscclInvalidArgument; - *count = comm->comm->bootstrap()->getNranks(); - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device) { - if (comm == nullptr || device == nullptr) return mscclInvalidArgument; - *device = comm->comm->bootstrap()->getRank(); - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank) { - if (comm == nullptr || rank == nullptr) return mscclInvalidArgument; - *rank = comm->comm->bootstrap()->getRank(); - return mscclSuccess; -} - -MSCCL_API mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - mscclDataType_t datatype, mscclComm_t comm, - cudaStream_t stream) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t*, void*, mscclDataType_t, - mscclScalarResidence_t, mscclComm_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclRedOpDestroy(mscclRedOp_t, mscclComm_t) { return mscclInternalError; } - -MSCCL_API mscclResult_t mscclReduce(const void*, void*, size_t, mscclDataType_t, mscclRedOp_t, int, - mscclComm_t, cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclBcast(void*, size_t, mscclDataType_t, int, mscclComm_t, cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclBroadcast(const void*, void*, size_t, mscclDataType_t, int, - mscclComm_t, cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclReduceScatter(const void*, void*, size_t, mscclDataType_t, - mscclRedOp_t, mscclComm_t, cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclSend(const void*, size_t, mscclDataType_t, int, mscclComm_t, - cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclRecv(void*, size_t, mscclDataType_t, int, mscclComm_t, cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclAllToAll(const void*, void*, size_t, mscclDataType_t, mscclComm_t, - cudaStream_t) { - return mscclInternalError; -} - -MSCCL_API mscclResult_t mscclGroupStart() { return mscclSuccess; } - -MSCCL_API mscclResult_t mscclGroupEnd() { return mscclSuccess; } diff --git a/3rdparty/mscclpp/include/msccl.h b/3rdparty/mscclpp/include/msccl.h deleted file mode 100644 index 12e4e7222b..0000000000 --- a/3rdparty/mscclpp/include/msccl.h +++ /dev/null @@ -1,494 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef MSCCL_H_ -#define MSCCL_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#include -/* Opaque handle to communicator */ -typedef struct mscclComm* mscclComm_t; -#define MSCCL_COMM_NULL NULL - -#define MSCCL_UNIQUE_ID_BYTES 128 -typedef struct { - char internal[MSCCL_UNIQUE_ID_BYTES]; -} mscclUniqueId; - -/* Error type */ -typedef enum { - mscclSuccess = 0, - mscclUnhandledCudaError = 1, - mscclSystemError = 2, - mscclInternalError = 3, - mscclInvalidArgument = 4, - mscclInvalidUsage = 5, - mscclRemoteError = 6, - mscclInProgress = 7, - mscclNumResults = 8 -} mscclResult_t; - -#define MSCCL_CONFIG_UNDEF_INT INT_MIN -#define MSCCL_CONFIG_UNDEF_PTR NULL -#define MSCCL_SPLIT_NOCOLOR -1 - -/* Communicator configuration. Users can assign value to attributes to specify the - * behavior of a communicator. */ -typedef struct mscclConfig_v21700 { - /* attributes that users should never touch. */ - size_t size; - unsigned int magic; - unsigned int version; - /* attributes that users are able to customize. */ - int blocking; - int cgaClusterSize; - int minCTAs; - int maxCTAs; - const char* netName; - int splitShare; -} mscclConfig_t; - -/* Config initializer must be assigned to initialize config structure when it is created. - * Not initialized config will result in MSCCL error. */ -#define MSCCL_CONFIG_INITIALIZER \ - { \ - sizeof(mscclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - MSCCL_VERSION(MSCCL_MAJOR, MSCCL_MINOR, MSCCL_PATCH), /* version */ \ - MSCCL_CONFIG_UNDEF_INT, /* blocking */ \ - MSCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \ - MSCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ - MSCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ - MSCCL_CONFIG_UNDEF_PTR, /* netName */ \ - MSCCL_CONFIG_UNDEF_INT /* splitShare */ \ - } - -/* Return the MSCCL_VERSION_CODE of the MSCCL library in the supplied integer. - * This integer is coded with the MAJOR, MINOR and PATCH level of the - * MSCCL library - */ -mscclResult_t mscclGetVersion(int* version); -mscclResult_t pmscclGetVersion(int* version); - -/* Generates an Id to be used in mscclCommInitRank. mscclGetUniqueId should be - * called once and the Id should be distributed to all ranks in the - * communicator before calling mscclCommInitRank. */ -mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId); -mscclResult_t pmscclGetUniqueId(mscclUniqueId* uniqueId); - -/* Create a new communicator (multi thread/process version) with a configuration - * set by users. */ -mscclResult_t mscclCommInitRankConfig(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank, - mscclConfig_t* config); -mscclResult_t pmscclCommInitRankConfig(mscclComm_t* comm, int nranks, mscclUniqueId commId, - int rank, mscclConfig_t* config); - -/* Creates a new communicator (multi thread/process version). - * rank must be between 0 and nranks-1 and unique within a communicator clique. - * Each rank is associated to a CUDA device, which has to be set before calling - * mscclCommInitRank. - * mscclCommInitRank implicitly syncronizes with other ranks, so it must be - * called by different threads/processes or use mscclGroupStart/mscclGroupEnd. */ -mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank); -mscclResult_t pmscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank); - -/* Creates a clique of communicators (single process version). - * This is a convenience function to create a single-process communicator clique. - * Returns an array of ndev newly initialized communicators in comm. - * comm should be pre-allocated with size at least ndev*sizeof(mscclComm_t). - * If devlist is NULL, the first ndev CUDA devices are used. - * Order of devlist defines user-order of processors within the communicator. */ -mscclResult_t mscclCommInitAll(mscclComm_t* comm, int ndev, const int* devlist); -mscclResult_t pmscclCommInitAll(mscclComm_t* comm, int ndev, const int* devlist); - -/* Finalize a communicator. mscclCommFinalize flushes all issued communications, - * and marks communicator state as mscclInProgress. The state will change to mscclSuccess - * when the communicator is globally quiescent and related resources are freed; then, - * calling mscclCommDestroy can locally free the rest of the resources (e.g. communicator - * itself) without blocking. */ -mscclResult_t mscclCommFinalize(mscclComm_t comm); -mscclResult_t pmscclCommFinalize(mscclComm_t comm); - -/* Frees local resources associated with communicator object. */ -mscclResult_t mscclCommDestroy(mscclComm_t comm); -mscclResult_t pmscclCommDestroy(mscclComm_t comm); - -/* Frees resources associated with communicator object and aborts any operations - * that might still be running on the device. */ -mscclResult_t mscclCommAbort(mscclComm_t comm); -mscclResult_t pmscclCommAbort(mscclComm_t comm); - -/* Creates one or more communicators from an existing one. - * Ranks with the same color will end up in the same communicator. - * Within the new communicator, key will be used to order ranks. - * MSCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group - * and will therefore return a NULL communicator. - * If config is NULL, the new communicator will inherit the original communicator's - * configuration*/ -mscclResult_t mscclCommSplit(mscclComm_t comm, int color, int key, mscclComm_t* newcomm, - mscclConfig_t* config); -mscclResult_t pmscclCommSplit(mscclComm_t comm, int color, int key, mscclComm_t* newcomm, - mscclConfig_t* config); - -/* Returns a string for each error code. */ -const char* mscclGetErrorString(mscclResult_t result); -const char* pmscclGetErrorString(mscclResult_t result); - -/* Returns a human-readable message of the last error that occurred. - * comm is currently unused and can be set to NULL - */ -const char* mscclGetLastError(mscclComm_t comm); -const char* pmscclGetLastError(mscclComm_t comm); - -/* Checks whether the comm has encountered any asynchronous errors */ -mscclResult_t mscclCommGetAsyncError(mscclComm_t comm, mscclResult_t* asyncError); -mscclResult_t pmscclCommGetAsyncError(mscclComm_t comm, mscclResult_t* asyncError); - -/* Gets the number of ranks in the communicator clique. */ -mscclResult_t mscclCommCount(const mscclComm_t comm, int* count); -mscclResult_t pmscclCommCount(const mscclComm_t comm, int* count); - -/* Returns the cuda device number associated with the communicator. */ -mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device); -mscclResult_t pmscclCommCuDevice(const mscclComm_t comm, int* device); - -/* Returns the user-ordered "rank" associated with the communicator. */ -mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank); -mscclResult_t pmscclCommUserRank(const mscclComm_t comm, int* rank); - -/* Reduction operation selector */ -typedef enum { mscclNumOps_dummy = 5 } mscclRedOp_dummy_t; -typedef enum { - mscclSum = 0, - mscclProd = 1, - mscclMax = 2, - mscclMin = 3, - mscclAvg = 4, - /* mscclNumOps: The number of built-in mscclRedOp_t values. Also - * serves as the least possible value for dynamic mscclRedOp_t's - * as constructed by mscclRedOpCreate*** functions. */ - mscclNumOps = 5, - /* mscclMaxRedOp: The largest valid value for mscclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(mscclRedOp_t) when compared to previous MSCCL versions to - * maintain ABI compatibility. */ - mscclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(mscclRedOp_dummy_t)) -} mscclRedOp_t; - -/* Data types */ -typedef enum { - mscclInt8 = 0, - mscclChar = 0, - mscclUint8 = 1, - mscclInt32 = 2, - mscclInt = 2, - mscclUint32 = 3, - mscclInt64 = 4, - mscclUint64 = 5, - mscclFloat16 = 6, - mscclHalf = 6, - mscclFloat32 = 7, - mscclFloat = 7, - mscclFloat64 = 8, - mscclDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__) - mscclBfloat16 = 9, - mscclFp8E4M3 = 10, - mscclFp8E5M2 = 11, - mscclNumTypes = 12 -#elif defined(__CUDA_BF16_TYPES_EXIST__) - mscclBfloat16 = 9, - mscclNumTypes = 10 -#else - mscclNumTypes = 9 -#endif -} mscclDataType_t; - -/* mscclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ -typedef enum { - /* mscclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. */ - mscclScalarDevice = 0, - - /* mscclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the mscclRedOpCreate***() function returns. */ - mscclScalarHostImmediate = 1 -} mscclScalarResidence_t; - -/* - * mscclRedOpCreatePreMulSum - * - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ -mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar, mscclDataType_t datatype, - mscclScalarResidence_t residence, mscclComm_t comm); -mscclResult_t pmscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar, mscclDataType_t datatype, - mscclScalarResidence_t residence, mscclComm_t comm); - -/* - * mscclRedOpDestroy - * - * Destroys the reduction operator *op*. The operator must have been created by - * mscclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last MSCCL function which is given that operator returns. - */ -mscclResult_t mscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm); -mscclResult_t pmscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm); - -/* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ - -/* - * Reduce - * - * Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. - * recvbuff may be NULL on all calls except for root device. - * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -mscclResult_t mscclReduce(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclRedOp_t op, int root, mscclComm_t comm, - cudaStream_t stream); -mscclResult_t pmscclReduce(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclRedOp_t op, int root, mscclComm_t comm, - cudaStream_t stream); - -/* - * (deprecated) Broadcast (in-place) - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitly in place. - */ -mscclResult_t mscclBcast(void* buff, size_t count, mscclDataType_t datatype, int root, - mscclComm_t comm, cudaStream_t stream); -mscclResult_t pmscclBcast(void* buff, size_t count, mscclDataType_t datatype, int root, - mscclComm_t comm, cudaStream_t stream); - -/* - * Broadcast - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -mscclResult_t mscclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, int root, mscclComm_t comm, - cudaStream_t stream); -mscclResult_t pmscclBroadcast(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, int root, mscclComm_t comm, - cudaStream_t stream); - -/* - * All-Reduce - * - * Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm, - cudaStream_t stream); -mscclResult_t pmscclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm, - cudaStream_t stream); - -/* - * Reduce-Scatter - * - * Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ -mscclResult_t mscclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm, - cudaStream_t stream); -mscclResult_t pmscclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, - mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm, - cudaStream_t stream); - -/* - * All-Gather - * - * Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. - */ -mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream); -mscclResult_t pmscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream); - -/* - * Send - * - * Send data from sendbuff to rank peer. - * - * Rank peer needs to call mscclRecv with the same datatype and the same count from this - * rank. - * - * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv operations - * need to progress concurrently to complete, they must be fused within a mscclGroupStart/ - * mscclGroupEnd section. - */ -mscclResult_t mscclSend(const void* sendbuff, size_t count, mscclDataType_t datatype, int peer, - mscclComm_t comm, cudaStream_t stream); -mscclResult_t pmscclSend(const void* sendbuff, size_t count, mscclDataType_t datatype, int peer, - mscclComm_t comm, cudaStream_t stream); - -/* - * Receive - * - * Receive data from rank peer into recvbuff. - * - * Rank peer needs to call mscclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv operations - * need to progress concurrently to complete, they must be fused within a mscclGroupStart/ - * mscclGroupEnd section. - */ -mscclResult_t pmscclRecv(void* recvbuff, size_t count, mscclDataType_t datatype, int peer, - mscclComm_t comm, cudaStream_t stream); -mscclResult_t mscclRecv(void* recvbuff, size_t count, mscclDataType_t datatype, int peer, - mscclComm_t comm, cudaStream_t stream); - -/* All-To-All - * - * Device (i) send (j)th block of data to device (j) and be placed as (i)th - * block. Each block for sending/receiving has count elements, which means - * that recvbuff and sendbuff should have a size of nranks*count elements. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -mscclResult_t mscclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream); -mscclResult_t pmscclAllToAll(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream); -/*! @brief Opaque handle to MSCCL algorithm */ -typedef int mscclAlgoHandle_t; - -/*! @brief MSCCL Load Algorithm - * - * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return - * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL - * scheduler instead of end users. - */ -mscclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, - int rank); -mscclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle, - int rank); - -/*! @brief MSCCL Run Algorithm - * - * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter - * list merges all possible parameters required by different operations as this - * is a general-purposed API. This API is expected to be called by MSCCL - * scheduler instead of end users. - */ -mscclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, mscclDataType_t dataType, int root, int peer, - mscclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, mscclComm_t comm, - cudaStream_t stream); -mscclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[], - void* recvBuff, const size_t recvCounts[], const size_t rDisPls[], - size_t count, mscclDataType_t dataType, int root, int peer, - mscclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, mscclComm_t comm, - cudaStream_t stream); - -/*! @brief MSCCL Load Algorithm - * - * @details Unload MSCCL algorithm previous loaded using its handle. This API - * is expected to be called by MSCCL scheduler instead of end users. - */ -mscclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); -mscclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle); - -/* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since MSCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping MSCCL calls as being part of the same collective operation is done - * using mscclGroupStart and mscclGroupEnd. mscclGroupStart will enqueue all - * collective calls until the mscclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, mscclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. - * - * Both collective communication and mscclCommInitRank can be used in conjunction - * of mscclGroupStart/mscclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ - -/* - * Group Start - * - * Start a group call. All calls to MSCCL until mscclGroupEnd will be fused into - * a single MSCCL operation. Nothing will be started on the CUDA stream until - * mscclGroupEnd. - */ -mscclResult_t mscclGroupStart(); -mscclResult_t pmscclGroupStart(); - -/* - * Group End - * - * End a group call. Start a fused MSCCL operation consisting of all calls since - * mscclGroupStart. Operations on the CUDA stream depending on the MSCCL operations - * need to be called after mscclGroupEnd. - */ -mscclResult_t mscclGroupEnd(); -mscclResult_t pmscclGroupEnd(); - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a5523a0d1..4ecc6f0a6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -559,7 +559,6 @@ include(cmake/modules/contrib/ExampleTargetHooks.cmake) include(cmake/modules/contrib/Random.cmake) include(cmake/modules/contrib/Posit.cmake) include(cmake/modules/contrib/MicroStandaloneRuntime.cmake) -include(cmake/modules/contrib/MSCCLPP.cmake) include(cmake/modules/contrib/Sort.cmake) include(cmake/modules/contrib/NNPack.cmake) include(cmake/modules/contrib/LibTorch.cmake) @@ -935,8 +934,8 @@ endif() if(USE_CUDA AND USE_NCCL) find_library(LIBRT rt) - target_link_libraries(tvm PRIVATE nccl msccl ${LIBRT}) - target_link_libraries(tvm_runtime PRIVATE nccl msccl ${LIBRT}) + target_link_libraries(tvm PRIVATE nccl ${LIBRT}) + target_link_libraries(tvm_runtime PRIVATE nccl ${LIBRT}) endif() if(USE_ROCM AND USE_RCCL) diff --git a/cmake/modules/contrib/MSCCLPP.cmake b/cmake/modules/contrib/MSCCLPP.cmake deleted file mode 100644 index 5f7dd19890..0000000000 --- a/cmake/modules/contrib/MSCCLPP.cmake +++ /dev/null @@ -1,50 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if(USE_CUDA AND USE_NCCL) - include(FetchContent) - FetchContent_Declare( - mscclpp - GIT_REPOSITORY https://github.com/csullivan/mscclpp.git - GIT_TAG feature/2024-03-19/msccl-nccl-equivalents - ) - set(USE_CUDA ON) - set(BYPASS_PEERMEM_CHECK ON) - set(BUILD_PYTHON_BINDINGS OFF) - set(BUILD_TESTS OFF) - FetchContent_MakeAvailable(mscclpp) - - tvm_file_glob(GLOB MSCCL_SRCS - ${PROJECT_SOURCE_DIR}/src/runtime/contrib/mscclpp/*.cu - ) - - add_library(msccl SHARED ${MSCCL_SRCS}) - target_link_libraries(msccl PUBLIC mscclpp) - target_compile_definitions(msccl PRIVATE DMLC_USE_LOGGING_LIBRARY=) - target_include_directories(msccl PUBLIC - $ - $ - $ - ) - - install(TARGETS mscclpp_obj - EXPORT ${PROJECT_NAME}Targets - FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include) - install(TARGETS mscclpp EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) - install(TARGETS msccl EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) - -endif() diff --git a/src/runtime/contrib/mscclpp/allreduce.cu b/src/runtime/contrib/mscclpp/allreduce.cu deleted file mode 100644 index 7ead504340..0000000000 --- a/src/runtime/contrib/mscclpp/allreduce.cu +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include - -#include "msccl.cuh" - -namespace tvm { -namespace runtime { - -template -cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff, - mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, - int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream); - -MSCCL_API mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm, - cudaStream_t stream) { - size_t bytes = count * mscclTypeSize(datatype); - if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr || - op != mscclSum || bytes > (1 << 24)) { - return mscclInvalidArgument; - } - - int rank = comm->comm->bootstrap()->getRank(); - channelKey key{sendbuff, recvbuff, bytes}; - mscclpp::DeviceHandle* smChannels = nullptr; - mscclpp::DeviceHandle* smOutChannels = nullptr; - - auto it = comm->channelInfos.find(key); - if (it == comm->channelInfos.end()) { - // setup smChannels (src: sendbuff, dst: remote scratch buff) - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast(sendbuff)); - ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr}; - it = comm->channelInfos.emplace(key, channelInfo).first; - - // TODO(csullivan): Consider supporting allreduce for larger transfers - // setup smOutChannels (src: recvbuff, dst: remote recvbuff) - // if (bytes > (1 << 24)) { - // std::vector remoteMemories = - // setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc); - // std::vector outChannels = setupSmChannels(comm, remoteMemories, - // recvbuff); it->second.smOutChannels = outChannels; it->second.smOutChannelDeviceHandles = - // setupSmChannelDeviceHandles(outChannels); - // } - } - - smChannels = it->second.smChannelDeviceHandles.get(); - smOutChannels = it->second.smOutChannelDeviceHandles.get(); - - switch (datatype) { - case mscclFloat16: - CUDACHECK(allreduce(reinterpret_cast(sendbuff), - reinterpret_cast(comm->scratchBuff.get()), - reinterpret_cast(recvbuff), smChannels, smOutChannels, rank, - NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); - break; - case mscclFloat32: - CUDACHECK(allreduce(reinterpret_cast(sendbuff), - reinterpret_cast(comm->scratchBuff.get()), - reinterpret_cast(recvbuff), smChannels, smOutChannels, - comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); - break; - case mscclInt32: - case mscclUint32: - CUDACHECK(allreduce(reinterpret_cast(sendbuff), - reinterpret_cast(comm->scratchBuff.get()), - reinterpret_cast(recvbuff), smChannels, smOutChannels, - comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); - break; - default: - return mscclInvalidArgument; - } - return mscclSuccess; -} - -template -__global__ void __launch_bounds__(1024, 1) - allreduce_simple(mscclpp::SmChannelDeviceHandle* smChans, const T* buff, T* scratch, - void* resultBuff, int rank, int worldSize, size_t nelems, - const uint32_t flag) { - nelems = nelems / (sizeof(int) / sizeof(T)); - - const int nPeers = worldSize - 1; - const size_t nPkts = nelems / 2; - const int nelemsPerRank = nelems / worldSize; - const int nPktsPerRank = nelemsPerRank / 2; - const int nBlocksPerPeer = gridDim.x / nPeers; - const int localBlockIdx = blockIdx.x % nBlocksPerPeer; - const int peerIdx = blockIdx.x / nBlocksPerPeer; - const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; - const int tid = threadIdx.x + localBlockIdx * blockDim.x; - - size_t scratchOffset = rank * nPktsPerRank * sizeof(mscclpp::LLPacket); - size_t resultOffset = 2 * nPkts * sizeof(mscclpp::LLPacket); - size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int); - const uint2* src = reinterpret_cast(reinterpret_cast(buff) + - rank * nelemsPerRank * sizeof(int)); - uint2* dst = reinterpret_cast(reinterpret_cast(resultBuff) + - rank * nelemsPerRank * sizeof(int)); - - // Step 1. Write to scratch buffer which exposes memory to peers via cuda IPC memory - smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); - - // Step 2. Get data from scratch buffer, reduce data, and write result back to peer scratch - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; - idx += blockDim.x * gridDim.x) { - uint2 data = make_uint2(0, 0); - for (int index = 0; index < nPeers; index++) { - const int remoteRank = index < rank ? index : index + 1; - mscclpp::LLPacket* dstPkt = - reinterpret_cast(scratch) + remoteRank * nPktsPerRank; - uint2 val = dstPkt[idx].read(flag); - data = add_vectors(val, data); - } - data = add_vectors(data, src[idx]); - dst[idx] = data; - - mscclpp::LLPacket packet; - packet.data1 = data.x; - packet.flag1 = flag; - packet.data2 = data.y; - packet.flag2 = flag; - size_t offset = resultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank); - for (int index = 0; index < nPeers; index++) { - smChans[index].write(offset, packet); - } - } - - // Step 3. Update local GPU's final result from peer scratch buffers - mscclpp::LLPacket* dstPkt = - reinterpret_cast(reinterpret_cast(scratch) + resultOffset); - const int dstOffset = remoteRank * nPktsPerRank; - uint2* result = reinterpret_cast(reinterpret_cast(resultBuff) + - remoteRank * nelemsPerRank * sizeof(int)); - for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank; - idx += blockDim.x * nBlocksPerPeer) { - uint2 data = dstPkt[idx + dstOffset].read(flag); - result[idx].x = data.x; - result[idx].y = data.y; - } -} - -template -cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff, - mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, int rank, - int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { - static uint32_t flag = 1; - size_t num_bytes = sizeof(T) * nelems; - ICHECK(num_bytes <= (1 << 24)) << "mscclpp allreduce expects bytes transfered < " << (1 << 24) - << " but got num_bytes = " << num_bytes << " bytes"; - allreduce_simple<<<105, 1024, 0, stream>>>(smChannels, buff, scratch, resultBuff, rank, worldSize, - nelems, flag++); - - return cudaGetLastError(); -} - -} // namespace runtime -} // namespace tvm