diff --git a/3rdparty/mscclpp/include/common.h b/3rdparty/mscclpp/include/common.h
deleted file mode 100644
index ccde5a3ef4..0000000000
--- a/3rdparty/mscclpp/include/common.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef MSCCL_COMMON_HPP_
-#define MSCCL_COMMON_HPP_
-
-#if defined(__HIP_PLATFORM_AMD__)
-#define WARP_SIZE 64
-#define __syncwarp() __builtin_amdgcn_wave_barrier()
-#else
-#define WARP_SIZE 32
-#endif
-
-constexpr int NRANKS_PER_NODE = 8;
-constexpr int SCRATCH_SIZE = 1024 * 1024 * 70;  // 35 thread-blocks * 8 ranks * 256KB = 70MB
-
-template <typename To, typename From>
-__forceinline__ __device__ To bit_cast(const From& src) {
-  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");
-
-  union {
-    From f;
-    To t;
-  } u;
-  u.f = src;
-  return u.t;
-}
-
-template <typename T>
-__forceinline__ __device__ T add_elements(T a, T b) {
-  return a + b;
-}
-
-template <>
-__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) {
-  return __hadd2(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) {
-  int4 ret;
-  ret.w = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.w), bit_cast<T, int>(b.w)));
-  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
-  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
-  ret.z = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.z), bit_cast<T, int>(b.z)));
-  return ret;
-}
-
-template <typename T>
-__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) {
-  uint2 ret;
-  ret.x = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.x), bit_cast<T, int>(b.x)));
-  ret.y = bit_cast<int, T>(add_elements(bit_cast<T, int>(a.y), bit_cast<T, int>(b.y)));
-  return ret;
-}
-
-template <typename T>
-__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ int add_vectors_helper(int a, int b) {
-  return bit_cast<int, T>(add_elements(bit_cast<T, int>(a), bit_cast<T, int>(b)));
-}
-
-template <typename T>
-__forceinline__ __device__ int add_vectors(int a, int b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ int add_vectors<__half>(int a, int b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-template <typename T>
-__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
-  return bit_cast<uint32_t, T>(add_elements(bit_cast<T, uint32_t>(a), bit_cast<T, uint32_t>(b)));
-}
-
-template <typename T>
-__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) {
-  return add_vectors_helper<T>(a, b);
-}
-
-template <>
-__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
-  return add_vectors_helper<__half2>(a, b);
-}
-
-#endif  // MSCCL_COMMON_HPP_
diff --git a/3rdparty/mscclpp/include/msccl.cuh b/3rdparty/mscclpp/include/msccl.cuh
deleted file mode 100644
index 93612126dc..0000000000
--- a/3rdparty/mscclpp/include/msccl.cuh
+++ /dev/null
@@ -1,323 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <algorithm>
-#include <mscclpp/concurrency_device.hpp>
-#include <mscclpp/core.hpp>
-#include <mscclpp/sm_channel.hpp>
-#include <mscclpp/sm_channel_device.hpp>
-#include <unordered_map>
-#include <vector>
-
-#include "common.h"
-#include "msccl.h"
-
-#define MSCCL_API extern "C" __attribute__((visibility("default")))
-
-#define CUDACHECK(cmd)                                                                      \
-  do {                                                                                      \
-    cudaError_t e = cmd;                                                                    \
-    if (e != cudaSuccess) {                                                                 \
-      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
-      exit(EXIT_FAILURE);                                                                   \
-    }                                                                                       \
-  } while (0)
-
-#define NUM_CHANNELS_PER_CONNECTION 64
-
-struct channelKey {
-  const void* sendbuff;
-  const void* recvbuff;
-  size_t bytes;
-  bool operator==(const channelKey& other) const {
-    return sendbuff == other.sendbuff && recvbuff == other.recvbuff && bytes == other.bytes;
-  }
-};
-
-namespace std {
-template <>
-struct hash<channelKey> {
-  std::size_t operator()(const channelKey& k) const {
-    return std::hash<const void*>()(k.sendbuff) ^ std::hash<const void*>()(k.recvbuff) ^ std::hash<size_t>()(k.bytes);
-  }
-};
-}  // namespace std
-
-struct ChannelInfo {
-  std::vector<mscclpp::SmChannel> smChannels;
-  std::vector<mscclpp::SmChannel> smOutChannels;
-  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
-  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> smOutChannelDeviceHandles;
-};
-
-struct mscclComm {
-  std::shared_ptr<mscclpp::Communicator> comm;
-  std::vector<std::shared_ptr<mscclpp::Connection>> connections;
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
-
-  std::unordered_map<channelKey, ChannelInfo> channelInfos;
-  std::shared_ptr<char> scratchBuff;
-  std::vector<mscclpp::RegisteredMemory> remoteScratchRegMemories;
-};
-
-static size_t mscclTypeSize(mscclDataType_t type) {
-  switch (type) {
-    case mscclInt8:
-    case mscclUint8:
-      return 1;
-    case mscclFloat16:
-      return 2;
-    case mscclInt32:
-    case mscclUint32:
-      return 4;
-    case mscclInt64:
-    case mscclUint64:
-      return 8;
-    case mscclFloat32:
-      return 4;
-    case mscclFloat64:
-      return 8;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
-    case mscclBfloat16:
-      return 2;
-#endif  // defined(__CUDA_BF16_TYPES_EXIST__)
-#if defined(__CUDA_FP8_TYPES_EXIST__)
-    case mscclFp8E4M3:
-    case mscclFp8E5M2:
-      return 1;
-#endif  // defined(__CUDA_FP8_TYPES_EXIST__)
-    case mscclNumTypes:
-      return 0;
-  }
-  return 0;
-}
-
-static mscclpp::Transport getTransport(int, int) { return mscclpp::Transport::CudaIpc; }
-
-static std::vector<mscclpp::RegisteredMemory> setupRemoteMemories(std::shared_ptr<mscclpp::Communicator> comm, int rank,
-                                                                  void* buff, size_t bytes,
-                                                                  mscclpp::TransportFlags transport) {
-  std::vector<mscclpp::RegisteredMemory> remoteMemories;
-  mscclpp::RegisteredMemory memory = comm->registerMemory(buff, bytes, transport);
-  std::vector<mscclpp::NonblockingFuture<mscclpp::RegisteredMemory>> remoteRegMemoryFutures;
-  for (int i = 0; i < comm->bootstrap()->getNranks(); i++) {
-    if (i == rank) continue;
-    remoteRegMemoryFutures.push_back(comm->recvMemoryOnSetup(i, 0));
-    comm->sendMemoryOnSetup(memory, i, 0);
-  }
-  comm->setup();
-  std::transform(remoteRegMemoryFutures.begin(), remoteRegMemoryFutures.end(), std::back_inserter(remoteMemories),
-                 [](const auto& future) { return future.get(); });
-  return remoteMemories;
-}
-
-static std::vector<mscclpp::SmChannel> setupSmChannels(mscclComm_t comm,
-                                                       const std::vector<mscclpp::RegisteredMemory>& remoteMemories,
-                                                       void* src) {
-  std::vector<mscclpp::SmChannel> channels;
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>>& smSemaphores = comm->smSemaphores;
-  size_t nConnections = comm->connections.size();
-  for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) {
-    for (size_t cid = 0; cid < nConnections; ++cid) {
-      if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-        channels.emplace_back(smSemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr);
-      }
-    }
-  }
-  return channels;
-}
-
-static std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> setupSmChannelDeviceHandles(
-    const std::vector<mscclpp::SmChannel>& smChannels) {
-  std::vector<mscclpp::DeviceHandle<mscclpp::SmChannel>> smChannelDeviceHandles;
-  std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles),
-                 [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
-  std::shared_ptr<mscclpp::DeviceHandle<mscclpp::SmChannel>> ptr =
-      mscclpp::allocSharedCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(smChannelDeviceHandles.size());
-  mscclpp::memcpyCuda<mscclpp::DeviceHandle<mscclpp::SmChannel>>(ptr.get(), smChannelDeviceHandles.data(),
-                       smChannelDeviceHandles.size(), cudaMemcpyHostToDevice);
-  return ptr;
-}
-
-MSCCL_API mscclResult_t mscclGetVersion(int* version) {
-  if (version == nullptr) return mscclInvalidArgument;
-  *version = MSCCLPP_VERSION;
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId) {
-  if (uniqueId == nullptr) return mscclInvalidArgument;
-  if (MSCCLPP_UNIQUE_ID_BYTES != MSCCL_UNIQUE_ID_BYTES) return mscclInternalError;
-  mscclpp::UniqueId id = mscclpp::TcpBootstrap::createUniqueId();
-  memcpy(uniqueId, &id, sizeof(mscclUniqueId));
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommInitRankConfig(mscclComm_t*, int, mscclUniqueId, int,
-                                                mscclConfig_t*) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank) {
-  if (comm == nullptr) return mscclInvalidArgument;
-  if (nranks < 0 || rank < 0 || rank >= nranks) return mscclInvalidArgument;
-  std::shared_ptr<mscclpp::TcpBootstrap> bootstrap = std::make_shared<mscclpp::TcpBootstrap>(rank, nranks);
-  mscclpp::UniqueId id;
-  memcpy(id.data(), &commId, sizeof(mscclUniqueId));
-  bootstrap->initialize(id);
-  std::shared_ptr<mscclpp::Communicator> mscclppComm = std::make_shared<mscclpp::Communicator>(bootstrap);
-  std::vector<mscclpp::NonblockingFuture<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
-
-  for (int i = 0; i < mscclppComm->bootstrap()->getNranks(); i++) {
-    if (i == rank) continue;
-    mscclpp::Transport transport = getTransport(rank, i);
-    connectionFutures.push_back(mscclppComm->connectOnSetup(i, 0, transport));
-  }
-  mscclppComm->setup();
-
-  std::vector<std::shared_ptr<mscclpp::Connection>> connections;
-  std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections),
-                 [](const auto& future) { return future.get(); });
-
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
-  for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) {
-    for (size_t cid = 0; cid < connections.size(); ++cid) {
-      if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-        smSemaphores.emplace_back(
-            std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*(mscclppComm), connections[cid]));
-      }
-    }
-  }
-  mscclppComm->setup();
-
-  mscclComm* commPtr = new mscclComm();
-  commPtr->comm = mscclppComm;
-  commPtr->connections = std::move(connections);
-  commPtr->smSemaphores = std::move(smSemaphores);
-  commPtr->scratchBuff = mscclpp::allocExtSharedCuda<char>(SCRATCH_SIZE);
-  commPtr->remoteScratchRegMemories =
-      setupRemoteMemories(commPtr->comm, rank, commPtr->scratchBuff.get(), SCRATCH_SIZE, mscclpp::Transport::CudaIpc);
-
-  *comm = commPtr;
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommInitAll(mscclComm_t*, int, const int*) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclCommFinalize(mscclComm_t comm) {
-  comm->comm->bootstrap()->barrier();
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommDestroy(mscclComm_t comm) {
-  if (comm == nullptr) return mscclInvalidArgument;
-  delete comm;
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommAbort(mscclComm_t) { return mscclSuccess; }
-
-MSCCL_API mscclResult_t mscclCommSplit(mscclComm_t, int, int, mscclComm_t*, mscclConfig_t*) {
-  return mscclInternalError;
-}
-
-MSCCL_API const char* mscclGetErrorString(mscclResult_t result) {
-  switch (result) {
-    case mscclSuccess:
-      return "no error";
-    case mscclUnhandledCudaError:
-      return "unhandled cuda error (run with MSCCL_DEBUG=INFO for details)";
-    case mscclSystemError:
-      return "unhandled system error (run with MSCCL_DEBUG=INFO for details)";
-    case mscclInternalError:
-      return "internal error - please report this issue to the MSCCL developers";
-    case mscclInvalidArgument:
-      return "invalid argument (run with MSCCL_DEBUG=WARN for details)";
-    case mscclInvalidUsage:
-      return "invalid usage (run with MSCCL_DEBUG=WARN for details)";
-    case mscclRemoteError:
-      return "remote process exited or there was a network error";
-    case mscclInProgress:
-      return "MSCCL operation in progress";
-    default:
-      return "unknown result code";
-  }
-}
-
-MSCCL_API const char* mscclGetLastError(mscclComm_t) { return nullptr; }
-
-MSCCL_API mscclResult_t mscclCommGetAsyncError(mscclComm_t, mscclResult_t* asyncError) {
-  if (asyncError == nullptr) return mscclInvalidArgument;
-  *asyncError = mscclSuccess;
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommCount(const mscclComm_t comm, int* count) {
-  if (comm == nullptr || count == nullptr) return mscclInvalidArgument;
-  *count = comm->comm->bootstrap()->getNranks();
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device) {
-  if (comm == nullptr || device == nullptr) return mscclInvalidArgument;
-  *device = comm->comm->bootstrap()->getRank();
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank) {
-  if (comm == nullptr || rank == nullptr) return mscclInvalidArgument;
-  *rank = comm->comm->bootstrap()->getRank();
-  return mscclSuccess;
-}
-
-MSCCL_API mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                                       mscclDataType_t datatype, mscclComm_t comm,
-                                       cudaStream_t stream) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t*, void*, mscclDataType_t,
-                                                  mscclScalarResidence_t, mscclComm_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRedOpDestroy(mscclRedOp_t, mscclComm_t) { return mscclInternalError; }
-
-MSCCL_API mscclResult_t mscclReduce(const void*, void*, size_t, mscclDataType_t, mscclRedOp_t, int,
-                                    mscclComm_t, cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclBcast(void*, size_t, mscclDataType_t, int, mscclComm_t, cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclBroadcast(const void*, void*, size_t, mscclDataType_t, int,
-                                       mscclComm_t, cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclReduceScatter(const void*, void*, size_t, mscclDataType_t,
-                                           mscclRedOp_t, mscclComm_t, cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclSend(const void*, size_t, mscclDataType_t, int, mscclComm_t,
-                                  cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclRecv(void*, size_t, mscclDataType_t, int, mscclComm_t, cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclAllToAll(const void*, void*, size_t, mscclDataType_t, mscclComm_t,
-                                      cudaStream_t) {
-  return mscclInternalError;
-}
-
-MSCCL_API mscclResult_t mscclGroupStart() { return mscclSuccess; }
-
-MSCCL_API mscclResult_t mscclGroupEnd() { return mscclSuccess; }
diff --git a/3rdparty/mscclpp/include/msccl.h b/3rdparty/mscclpp/include/msccl.h
deleted file mode 100644
index 12e4e7222b..0000000000
--- a/3rdparty/mscclpp/include/msccl.h
+++ /dev/null
@@ -1,494 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef MSCCL_H_
-#define MSCCL_H_
-
-#include <mscclpp/gpu.hpp>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <limits.h>
-/* Opaque handle to communicator */
-typedef struct mscclComm* mscclComm_t;
-#define MSCCL_COMM_NULL NULL
-
-#define MSCCL_UNIQUE_ID_BYTES 128
-typedef struct {
-  char internal[MSCCL_UNIQUE_ID_BYTES];
-} mscclUniqueId;
-
-/* Error type */
-typedef enum {
-  mscclSuccess = 0,
-  mscclUnhandledCudaError = 1,
-  mscclSystemError = 2,
-  mscclInternalError = 3,
-  mscclInvalidArgument = 4,
-  mscclInvalidUsage = 5,
-  mscclRemoteError = 6,
-  mscclInProgress = 7,
-  mscclNumResults = 8
-} mscclResult_t;
-
-#define MSCCL_CONFIG_UNDEF_INT INT_MIN
-#define MSCCL_CONFIG_UNDEF_PTR NULL
-#define MSCCL_SPLIT_NOCOLOR -1
-
-/* Communicator configuration. Users can assign value to attributes to specify the
- * behavior of a communicator. */
-typedef struct mscclConfig_v21700 {
-  /* attributes that users should never touch. */
-  size_t size;
-  unsigned int magic;
-  unsigned int version;
-  /* attributes that users are able to customize. */
-  int blocking;
-  int cgaClusterSize;
-  int minCTAs;
-  int maxCTAs;
-  const char* netName;
-  int splitShare;
-} mscclConfig_t;
-
-/* Config initializer must be assigned to initialize config structure when it is created.
- * Not initialized config will result in MSCCL error. */
-#define MSCCL_CONFIG_INITIALIZER                                                   \
-  {                                                                                \
-    sizeof(mscclConfig_t),                                    /* size */           \
-        0xcafebeef,                                           /* magic */          \
-        MSCCL_VERSION(MSCCL_MAJOR, MSCCL_MINOR, MSCCL_PATCH), /* version */        \
-        MSCCL_CONFIG_UNDEF_INT,                               /* blocking */       \
-        MSCCL_CONFIG_UNDEF_INT,                               /* cgaClusterSize */ \
-        MSCCL_CONFIG_UNDEF_INT,                               /* minCTAs */        \
-        MSCCL_CONFIG_UNDEF_INT,                               /* maxCTAs */        \
-        MSCCL_CONFIG_UNDEF_PTR,                               /* netName */        \
-        MSCCL_CONFIG_UNDEF_INT                                /* splitShare */     \
-  }
-
-/* Return the MSCCL_VERSION_CODE of the MSCCL library in the supplied integer.
- * This integer is coded with the MAJOR, MINOR and PATCH level of the
- * MSCCL library
- */
-mscclResult_t mscclGetVersion(int* version);
-mscclResult_t pmscclGetVersion(int* version);
-
-/* Generates an Id to be used in mscclCommInitRank. mscclGetUniqueId should be
- * called once and the Id should be distributed to all ranks in the
- * communicator before calling mscclCommInitRank. */
-mscclResult_t mscclGetUniqueId(mscclUniqueId* uniqueId);
-mscclResult_t pmscclGetUniqueId(mscclUniqueId* uniqueId);
-
-/* Create a new communicator (multi thread/process version) with a configuration
- * set by users. */
-mscclResult_t mscclCommInitRankConfig(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank,
-                                      mscclConfig_t* config);
-mscclResult_t pmscclCommInitRankConfig(mscclComm_t* comm, int nranks, mscclUniqueId commId,
-                                       int rank, mscclConfig_t* config);
-
-/* Creates a new communicator (multi thread/process version).
- * rank must be between 0 and nranks-1 and unique within a communicator clique.
- * Each rank is associated to a CUDA device, which has to be set before calling
- * mscclCommInitRank.
- * mscclCommInitRank implicitly syncronizes with other ranks, so it must be
- * called by different threads/processes or use mscclGroupStart/mscclGroupEnd. */
-mscclResult_t mscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank);
-mscclResult_t pmscclCommInitRank(mscclComm_t* comm, int nranks, mscclUniqueId commId, int rank);
-
-/* Creates a clique of communicators (single process version).
- * This is a convenience function to create a single-process communicator clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(mscclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator. */
-mscclResult_t mscclCommInitAll(mscclComm_t* comm, int ndev, const int* devlist);
-mscclResult_t pmscclCommInitAll(mscclComm_t* comm, int ndev, const int* devlist);
-
-/* Finalize a communicator. mscclCommFinalize flushes all issued communications,
- * and marks communicator state as mscclInProgress. The state will change to mscclSuccess
- * when the communicator is globally quiescent and related resources are freed; then,
- * calling mscclCommDestroy can locally free the rest of the resources (e.g. communicator
- * itself) without blocking. */
-mscclResult_t mscclCommFinalize(mscclComm_t comm);
-mscclResult_t pmscclCommFinalize(mscclComm_t comm);
-
-/* Frees local resources associated with communicator object. */
-mscclResult_t mscclCommDestroy(mscclComm_t comm);
-mscclResult_t pmscclCommDestroy(mscclComm_t comm);
-
-/* Frees resources associated with communicator object and aborts any operations
- * that might still be running on the device. */
-mscclResult_t mscclCommAbort(mscclComm_t comm);
-mscclResult_t pmscclCommAbort(mscclComm_t comm);
-
-/* Creates one or more communicators from an existing one.
- * Ranks with the same color will end up in the same communicator.
- * Within the new communicator, key will be used to order ranks.
- * MSCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
- * and will therefore return a NULL communicator.
- * If config is NULL, the new communicator will inherit the original communicator's
- * configuration*/
-mscclResult_t mscclCommSplit(mscclComm_t comm, int color, int key, mscclComm_t* newcomm,
-                             mscclConfig_t* config);
-mscclResult_t pmscclCommSplit(mscclComm_t comm, int color, int key, mscclComm_t* newcomm,
-                              mscclConfig_t* config);
-
-/* Returns a string for each error code. */
-const char* mscclGetErrorString(mscclResult_t result);
-const char* pmscclGetErrorString(mscclResult_t result);
-
-/* Returns a human-readable message of the last error that occurred.
- * comm is currently unused and can be set to NULL
- */
-const char* mscclGetLastError(mscclComm_t comm);
-const char* pmscclGetLastError(mscclComm_t comm);
-
-/* Checks whether the comm has encountered any asynchronous errors */
-mscclResult_t mscclCommGetAsyncError(mscclComm_t comm, mscclResult_t* asyncError);
-mscclResult_t pmscclCommGetAsyncError(mscclComm_t comm, mscclResult_t* asyncError);
-
-/* Gets the number of ranks in the communicator clique. */
-mscclResult_t mscclCommCount(const mscclComm_t comm, int* count);
-mscclResult_t pmscclCommCount(const mscclComm_t comm, int* count);
-
-/* Returns the cuda device number associated with the communicator. */
-mscclResult_t mscclCommCuDevice(const mscclComm_t comm, int* device);
-mscclResult_t pmscclCommCuDevice(const mscclComm_t comm, int* device);
-
-/* Returns the user-ordered "rank" associated with the communicator. */
-mscclResult_t mscclCommUserRank(const mscclComm_t comm, int* rank);
-mscclResult_t pmscclCommUserRank(const mscclComm_t comm, int* rank);
-
-/* Reduction operation selector */
-typedef enum { mscclNumOps_dummy = 5 } mscclRedOp_dummy_t;
-typedef enum {
-  mscclSum = 0,
-  mscclProd = 1,
-  mscclMax = 2,
-  mscclMin = 3,
-  mscclAvg = 4,
-  /* mscclNumOps: The number of built-in mscclRedOp_t values. Also
-   * serves as the least possible value for dynamic mscclRedOp_t's
-   * as constructed by mscclRedOpCreate*** functions. */
-  mscclNumOps = 5,
-  /* mscclMaxRedOp: The largest valid value for mscclRedOp_t.
-   * It is defined to be the largest signed value (since compilers
-   * are permitted to use signed enums) that won't grow
-   * sizeof(mscclRedOp_t) when compared to previous MSCCL versions to
-   * maintain ABI compatibility. */
-  mscclMaxRedOp = 0x7fffffff >> (32 - 8 * sizeof(mscclRedOp_dummy_t))
-} mscclRedOp_t;
-
-/* Data types */
-typedef enum {
-  mscclInt8 = 0,
-  mscclChar = 0,
-  mscclUint8 = 1,
-  mscclInt32 = 2,
-  mscclInt = 2,
-  mscclUint32 = 3,
-  mscclInt64 = 4,
-  mscclUint64 = 5,
-  mscclFloat16 = 6,
-  mscclHalf = 6,
-  mscclFloat32 = 7,
-  mscclFloat = 7,
-  mscclFloat64 = 8,
-  mscclDouble = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__)
-  mscclBfloat16 = 9,
-  mscclFp8E4M3 = 10,
-  mscclFp8E5M2 = 11,
-  mscclNumTypes = 12
-#elif defined(__CUDA_BF16_TYPES_EXIST__)
-  mscclBfloat16 = 9,
-  mscclNumTypes = 10
-#else
-  mscclNumTypes = 9
-#endif
-} mscclDataType_t;
-
-/* mscclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
-typedef enum {
-  /* mscclScalarDevice: The scalar is in device-visible memory and will be
-   * dereferenced while the collective is running. */
-  mscclScalarDevice = 0,
-
-  /* mscclScalarHostImmediate: The scalar is in host-visible memory and will be
-   * dereferenced before the mscclRedOpCreate***() function returns. */
-  mscclScalarHostImmediate = 1
-} mscclScalarResidence_t;
-
-/*
- * mscclRedOpCreatePreMulSum
- *
- * Creates a new reduction operator which pre-multiplies input values by a given
- * scalar locally before reducing them with peer values via summation. For use
- * only with collectives launched against *comm* and *datatype*. The
- * *residence* argument indicates how/when the memory pointed to by *scalar*
- * will be dereferenced. Upon return, the newly created operator's handle
- * is stored in *op*.
- */
-mscclResult_t mscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar, mscclDataType_t datatype,
-                                        mscclScalarResidence_t residence, mscclComm_t comm);
-mscclResult_t pmscclRedOpCreatePreMulSum(mscclRedOp_t* op, void* scalar, mscclDataType_t datatype,
-                                         mscclScalarResidence_t residence, mscclComm_t comm);
-
-/*
- * mscclRedOpDestroy
- *
- * Destroys the reduction operator *op*. The operator must have been created by
- * mscclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
- * destroyed as soon as the last MSCCL function which is given that operator returns.
- */
-mscclResult_t mscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm);
-mscclResult_t pmscclRedOpDestroy(mscclRedOp_t op, mscclComm_t comm);
-
-/*
- * Collective communication operations
- *
- * Collective communication operations must be called separately for each
- * communicator in a communicator clique.
- *
- * They return when operations have been enqueued on the CUDA stream.
- *
- * Since they may perform inter-CPU synchronization, each call has to be done
- * from a different thread or process, or need to use Group Semantics (see
- * below).
- */
-
-/*
- * Reduce
- *
- * Reduces data arrays of length count in sendbuff into recvbuff using op
- * operation.
- * recvbuff may be NULL on all calls except for root device.
- * root is the rank (not the CUDA device) where data will reside after the
- * operation is complete.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclReduce(const void* sendbuff, void* recvbuff, size_t count,
-                          mscclDataType_t datatype, mscclRedOp_t op, int root, mscclComm_t comm,
-                          cudaStream_t stream);
-mscclResult_t pmscclReduce(const void* sendbuff, void* recvbuff, size_t count,
-                           mscclDataType_t datatype, mscclRedOp_t op, int root, mscclComm_t comm,
-                           cudaStream_t stream);
-
-/*
- * (deprecated) Broadcast (in-place)
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * This operation is implicitly in place.
- */
-mscclResult_t mscclBcast(void* buff, size_t count, mscclDataType_t datatype, int root,
-                         mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclBcast(void* buff, size_t count, mscclDataType_t datatype, int root,
-                          mscclComm_t comm, cudaStream_t stream);
-
-/*
- * Broadcast
- *
- * Copies count values from root to all other devices.
- * root is the rank (not the CUDA device) where data resides before the
- * operation is started.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
-                             mscclDataType_t datatype, int root, mscclComm_t comm,
-                             cudaStream_t stream);
-mscclResult_t pmscclBroadcast(const void* sendbuff, void* recvbuff, size_t count,
-                              mscclDataType_t datatype, int root, mscclComm_t comm,
-                              cudaStream_t stream);
-
-/*
- * All-Reduce
- *
- * Reduces data arrays of length count in sendbuff using op operation, and
- * leaves identical copies of result on each recvbuff.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-                             mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm,
-                             cudaStream_t stream);
-mscclResult_t pmscclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-                              mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm,
-                              cudaStream_t stream);
-
-/*
- * Reduce-Scatter
- *
- * Reduces data in sendbuff using op operation and leaves reduced result
- * scattered over the devices so that recvbuff on rank i will contain the i-th
- * block of the result.
- * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
- * should have a size of at least nranks*recvcount elements.
- *
- * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
- */
-mscclResult_t mscclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-                                 mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm,
-                                 cudaStream_t stream);
-mscclResult_t pmscclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-                                  mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm,
-                                  cudaStream_t stream);
-
-/*
- * All-Gather
- *
- * Each device gathers sendcount values from other GPUs into recvbuff,
- * receiving data from rank i at offset i*sendcount.
- * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
- * should have a size of at least nranks*sendcount elements.
- *
- * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
- */
-mscclResult_t mscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                             mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                              mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream);
-
-/*
- * Send
- *
- * Send data from sendbuff to rank peer.
- *
- * Rank peer needs to call mscclRecv with the same datatype and the same count from this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv operations
- * need to progress concurrently to complete, they must be fused within a mscclGroupStart/
- * mscclGroupEnd section.
- */
-mscclResult_t mscclSend(const void* sendbuff, size_t count, mscclDataType_t datatype, int peer,
-                        mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclSend(const void* sendbuff, size_t count, mscclDataType_t datatype, int peer,
-                         mscclComm_t comm, cudaStream_t stream);
-
-/*
- * Receive
- *
- * Receive data from rank peer into recvbuff.
- *
- * Rank peer needs to call mscclSend with the same datatype and the same count to this
- * rank.
- *
- * This operation is blocking for the GPU. If multiple mscclSend and mscclRecv operations
- * need to progress concurrently to complete, they must be fused within a mscclGroupStart/
- * mscclGroupEnd section.
- */
-mscclResult_t pmscclRecv(void* recvbuff, size_t count, mscclDataType_t datatype, int peer,
-                         mscclComm_t comm, cudaStream_t stream);
-mscclResult_t mscclRecv(void* recvbuff, size_t count, mscclDataType_t datatype, int peer,
-                        mscclComm_t comm, cudaStream_t stream);
-
-/* All-To-All
- *
- * Device (i) send (j)th block of data to device (j) and be placed as (i)th
- * block. Each block for sending/receiving has count elements, which means
- * that recvbuff and sendbuff should have a size of nranks*count elements.
- *
- * In-place operation will happen if sendbuff == recvbuff.
- */
-mscclResult_t mscclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
-                            mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream);
-mscclResult_t pmscclAllToAll(const void* sendbuff, void* recvbuff, size_t count,
-                             mscclDataType_t datatype, mscclComm_t comm, cudaStream_t stream);
-/*! @brief Opaque handle to MSCCL algorithm */
-typedef int mscclAlgoHandle_t;
-
-/*! @brief MSCCL Load Algorithm
- *
- * @details Load MSCCL algorithm file specified in mscclAlgoFilePath and return
- * its handle via mscclAlgoHandle. This API is expected to be called by MSCCL
- * scheduler instead of end users.
- */
-mscclResult_t mscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle,
-                            int rank);
-mscclResult_t pmscclLoadAlgo(const char* mscclAlgoFilePath, mscclAlgoHandle_t* mscclAlgoHandle,
-                             int rank);
-
-/*! @brief MSCCL Run Algorithm
- *
- * @details Run MSCCL algorithm specified by mscclAlgoHandle. The parameter
- * list merges all possible parameters required by different operations as this
- * is a general-purposed API. This API is expected to be called by MSCCL
- * scheduler instead of end users.
- */
-mscclResult_t mscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
-                           void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
-                           size_t count, mscclDataType_t dataType, int root, int peer,
-                           mscclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, mscclComm_t comm,
-                           cudaStream_t stream);
-mscclResult_t pmscclRunAlgo(const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
-                            void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
-                            size_t count, mscclDataType_t dataType, int root, int peer,
-                            mscclRedOp_t op, mscclAlgoHandle_t mscclAlgoHandle, mscclComm_t comm,
-                            cudaStream_t stream);
-
-/*! @brief MSCCL Load Algorithm
- *
- * @details Unload MSCCL algorithm previous loaded using its handle. This API
- * is expected to be called by MSCCL scheduler instead of end users.
- */
-mscclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
-mscclResult_t pmscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle);
-
-/*
- * Group semantics
- *
- * When managing multiple GPUs from a single thread, and since MSCCL collective
- * calls may perform inter-CPU synchronization, we need to "group" calls for
- * different ranks/devices into a single call.
- *
- * Grouping MSCCL calls as being part of the same collective operation is done
- * using mscclGroupStart and mscclGroupEnd. mscclGroupStart will enqueue all
- * collective calls until the mscclGroupEnd call, which will wait for all calls
- * to be complete. Note that for collective communication, mscclGroupEnd only
- * guarantees that the operations are enqueued on the streams, not that
- * the operation is effectively done.
- *
- * Both collective communication and mscclCommInitRank can be used in conjunction
- * of mscclGroupStart/mscclGroupEnd, but not together.
- *
- * Group semantics also allow to fuse multiple operations on the same device
- * to improve performance (for aggregated collective calls), or to permit
- * concurrent progress of multiple send/receive operations.
- */
-
-/*
- * Group Start
- *
- * Start a group call. All calls to MSCCL until mscclGroupEnd will be fused into
- * a single MSCCL operation. Nothing will be started on the CUDA stream until
- * mscclGroupEnd.
- */
-mscclResult_t mscclGroupStart();
-mscclResult_t pmscclGroupStart();
-
-/*
- * Group End
- *
- * End a group call. Start a fused MSCCL operation consisting of all calls since
- * mscclGroupStart. Operations on the CUDA stream depending on the MSCCL operations
- * need to be called after mscclGroupEnd.
- */
-mscclResult_t mscclGroupEnd();
-mscclResult_t pmscclGroupEnd();
-
-#ifdef __cplusplus
-}  // end extern "C"
-#endif
-
-#endif  // end include guard
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2a5523a0d1..4ecc6f0a6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -559,7 +559,6 @@ include(cmake/modules/contrib/ExampleTargetHooks.cmake)
 include(cmake/modules/contrib/Random.cmake)
 include(cmake/modules/contrib/Posit.cmake)
 include(cmake/modules/contrib/MicroStandaloneRuntime.cmake)
-include(cmake/modules/contrib/MSCCLPP.cmake)
 include(cmake/modules/contrib/Sort.cmake)
 include(cmake/modules/contrib/NNPack.cmake)
 include(cmake/modules/contrib/LibTorch.cmake)
@@ -935,8 +934,8 @@ endif()
 
 if(USE_CUDA AND USE_NCCL)
   find_library(LIBRT rt)
-  target_link_libraries(tvm PRIVATE nccl msccl ${LIBRT})
-  target_link_libraries(tvm_runtime PRIVATE nccl msccl ${LIBRT})
+  target_link_libraries(tvm PRIVATE nccl ${LIBRT})
+  target_link_libraries(tvm_runtime PRIVATE nccl ${LIBRT})
 endif()
 
 if(USE_ROCM AND USE_RCCL)
diff --git a/cmake/modules/contrib/MSCCLPP.cmake b/cmake/modules/contrib/MSCCLPP.cmake
deleted file mode 100644
index 5f7dd19890..0000000000
--- a/cmake/modules/contrib/MSCCLPP.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-if(USE_CUDA AND USE_NCCL)
-  include(FetchContent)
-  FetchContent_Declare(
-    mscclpp
-    GIT_REPOSITORY https://github.com/csullivan/mscclpp.git
-    GIT_TAG feature/2024-03-19/msccl-nccl-equivalents
-  )
-  set(USE_CUDA ON)
-  set(BYPASS_PEERMEM_CHECK ON)
-  set(BUILD_PYTHON_BINDINGS OFF)
-  set(BUILD_TESTS OFF)
-  FetchContent_MakeAvailable(mscclpp)
-
-  tvm_file_glob(GLOB MSCCL_SRCS
-    ${PROJECT_SOURCE_DIR}/src/runtime/contrib/mscclpp/*.cu
-  )
-
-  add_library(msccl SHARED ${MSCCL_SRCS})
-  target_link_libraries(msccl PUBLIC mscclpp)
-  target_compile_definitions(msccl PRIVATE DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
-  target_include_directories(msccl PUBLIC
-    $<BUILD_INTERFACE:${mscclpp_SOURCE_DIR}/include>
-    $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/3rdparty/mscclpp/include>
-    $<INSTALL_INTERFACE:include/msccl>
-  )
-
-  install(TARGETS mscclpp_obj
-    EXPORT ${PROJECT_NAME}Targets
-    FILE_SET HEADERS DESTINATION ${INSTALL_PREFIX}/include)
-  install(TARGETS mscclpp EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX})
-  install(TARGETS msccl EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX})
-
-endif()
diff --git a/src/runtime/contrib/mscclpp/allreduce.cu b/src/runtime/contrib/mscclpp/allreduce.cu
deleted file mode 100644
index 7ead504340..0000000000
--- a/src/runtime/contrib/mscclpp/allreduce.cu
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/packed_func.h>
-#include <tvm/runtime/registry.h>
-
-#include "msccl.cuh"
-
-namespace tvm {
-namespace runtime {
-
-template <typename T>
-cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff,
-                      mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-                      mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, int rank,
-                      int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream);
-
-MSCCL_API mscclResult_t mscclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-                                       mscclDataType_t datatype, mscclRedOp_t op, mscclComm_t comm,
-                                       cudaStream_t stream) {
-  size_t bytes = count * mscclTypeSize(datatype);
-  if (sendbuff == nullptr || recvbuff == nullptr || bytes == 0 || comm == nullptr ||
-      op != mscclSum || bytes > (1 << 24)) {
-    return mscclInvalidArgument;
-  }
-
-  int rank = comm->comm->bootstrap()->getRank();
-  channelKey key{sendbuff, recvbuff, bytes};
-  mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels = nullptr;
-  mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels = nullptr;
-
-  auto it = comm->channelInfos.find(key);
-  if (it == comm->channelInfos.end()) {
-    // setup smChannels (src: sendbuff, dst: remote scratch buff)
-    std::vector<mscclpp::SmChannel> channels =
-        setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast<void*>(sendbuff));
-    ChannelInfo channelInfo{channels, {}, setupSmChannelDeviceHandles(channels), nullptr};
-    it = comm->channelInfos.emplace(key, channelInfo).first;
-
-    // TODO(csullivan): Consider supporting allreduce for larger transfers
-    // setup smOutChannels (src: recvbuff, dst: remote recvbuff)
-    // if (bytes > (1 << 24)) {
-    //   std::vector<mscclpp::RegisteredMemory> remoteMemories =
-    //       setupRemoteMemories(comm->comm, rank, recvbuff, bytes, mscclpp::Transport::CudaIpc);
-    //   std::vector<mscclpp::SmChannel> outChannels = setupSmChannels(comm, remoteMemories,
-    //   recvbuff); it->second.smOutChannels = outChannels; it->second.smOutChannelDeviceHandles =
-    //   setupSmChannelDeviceHandles(outChannels);
-    // }
-  }
-
-  smChannels = it->second.smChannelDeviceHandles.get();
-  smOutChannels = it->second.smOutChannelDeviceHandles.get();
-
-  switch (datatype) {
-    case mscclFloat16:
-      CUDACHECK(allreduce(reinterpret_cast<const half*>(sendbuff),
-                          reinterpret_cast<half*>(comm->scratchBuff.get()),
-                          reinterpret_cast<half*>(recvbuff), smChannels, smOutChannels, rank,
-                          NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream));
-      break;
-    case mscclFloat32:
-      CUDACHECK(allreduce(reinterpret_cast<const float*>(sendbuff),
-                          reinterpret_cast<float*>(comm->scratchBuff.get()),
-                          reinterpret_cast<float*>(recvbuff), smChannels, smOutChannels,
-                          comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE,
-                          comm->comm->bootstrap()->getNranks(), count, stream));
-      break;
-    case mscclInt32:
-    case mscclUint32:
-      CUDACHECK(allreduce(reinterpret_cast<const int*>(sendbuff),
-                          reinterpret_cast<int*>(comm->scratchBuff.get()),
-                          reinterpret_cast<int*>(recvbuff), smChannels, smOutChannels,
-                          comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE,
-                          comm->comm->bootstrap()->getNranks(), count, stream));
-      break;
-    default:
-      return mscclInvalidArgument;
-  }
-  return mscclSuccess;
-}
-
-template <typename T>
-__global__ void __launch_bounds__(1024, 1)
-    allreduce_simple(mscclpp::SmChannelDeviceHandle* smChans, const T* buff, T* scratch,
-                     void* resultBuff, int rank, int worldSize, size_t nelems,
-                     const uint32_t flag) {
-  nelems = nelems / (sizeof(int) / sizeof(T));
-
-  const int nPeers = worldSize - 1;
-  const size_t nPkts = nelems / 2;
-  const int nelemsPerRank = nelems / worldSize;
-  const int nPktsPerRank = nelemsPerRank / 2;
-  const int nBlocksPerPeer = gridDim.x / nPeers;
-  const int localBlockIdx = blockIdx.x % nBlocksPerPeer;
-  const int peerIdx = blockIdx.x / nBlocksPerPeer;
-  const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1;
-  mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx];
-  const int tid = threadIdx.x + localBlockIdx * blockDim.x;
-
-  size_t scratchOffset = rank * nPktsPerRank * sizeof(mscclpp::LLPacket);
-  size_t resultOffset = 2 * nPkts * sizeof(mscclpp::LLPacket);
-  size_t srcOffset = remoteRank * nelemsPerRank * sizeof(int);
-  const uint2* src = reinterpret_cast<const uint2*>(reinterpret_cast<const char*>(buff) +
-                                                    rank * nelemsPerRank * sizeof(int));
-  uint2* dst = reinterpret_cast<uint2*>(reinterpret_cast<char*>(resultBuff) +
-                                        rank * nelemsPerRank * sizeof(int));
-
-  // Step 1. Write to scratch buffer which exposes memory to peers via cuda IPC memory
-  smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
-                    blockDim.x * nBlocksPerPeer, flag);
-
-  // Step 2. Get data from scratch buffer, reduce data, and write result back to peer scratch
-  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank;
-       idx += blockDim.x * gridDim.x) {
-    uint2 data = make_uint2(0, 0);
-    for (int index = 0; index < nPeers; index++) {
-      const int remoteRank = index < rank ? index : index + 1;
-      mscclpp::LLPacket* dstPkt =
-          reinterpret_cast<mscclpp::LLPacket*>(scratch) + remoteRank * nPktsPerRank;
-      uint2 val = dstPkt[idx].read(flag);
-      data = add_vectors<T>(val, data);
-    }
-    data = add_vectors<T>(data, src[idx]);
-    dst[idx] = data;
-
-    mscclpp::LLPacket packet;
-    packet.data1 = data.x;
-    packet.flag1 = flag;
-    packet.data2 = data.y;
-    packet.flag2 = flag;
-    size_t offset = resultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
-    for (int index = 0; index < nPeers; index++) {
-      smChans[index].write(offset, packet);
-    }
-  }
-
-  // Step 3. Update local GPU's final result from peer scratch buffers
-  mscclpp::LLPacket* dstPkt =
-      reinterpret_cast<mscclpp::LLPacket*>(reinterpret_cast<char*>(scratch) + resultOffset);
-  const int dstOffset = remoteRank * nPktsPerRank;
-  uint2* result = reinterpret_cast<uint2*>(reinterpret_cast<char*>(resultBuff) +
-                                           remoteRank * nelemsPerRank * sizeof(int));
-  for (int idx = threadIdx.x + localBlockIdx * blockDim.x; idx < nPktsPerRank;
-       idx += blockDim.x * nBlocksPerPeer) {
-    uint2 data = dstPkt[idx + dstOffset].read(flag);
-    result[idx].x = data.x;
-    result[idx].y = data.y;
-  }
-}
-
-template <typename T>
-cudaError_t allreduce(const T* buff, T* scratch, T* resultBuff,
-                      mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
-                      mscclpp::DeviceHandle<mscclpp::SmChannel>* smOutChannels, int rank,
-                      int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) {
-  static uint32_t flag = 1;
-  size_t num_bytes = sizeof(T) * nelems;
-  ICHECK(num_bytes <= (1 << 24)) << "mscclpp allreduce expects bytes transfered < " << (1 << 24)
-                                 << " but got num_bytes = " << num_bytes << " bytes";
-  allreduce_simple<<<105, 1024, 0, stream>>>(smChannels, buff, scratch, resultBuff, rank, worldSize,
-                                             nelems, flag++);
-
-  return cudaGetLastError();
-}
-
-}  // namespace runtime
-}  // namespace tvm