Skip to content

Commit

Permalink
Merge branch 'apache:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
jikechao authored Oct 30, 2024
2 parents 46e7f0e + e3e27f5 commit beac1b2
Show file tree
Hide file tree
Showing 1,532 changed files with 107,416 additions and 23,056 deletions.
16 changes: 10 additions & 6 deletions .github/actions/setup/action.yml
Original file line number Diff line number Diff line change
@@ -1,35 +1,39 @@
runs:
using: "composite"
steps:
- uses: actions/cache@v1
- uses: actions/cache@v3
env:
CACHE_NUMBER: 0
CACHE_NUMBER: 1
with:
path: ~/conda_pkgs_dir
key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda/build-environment.yaml') }}
- uses: conda-incubator/setup-miniconda@v2
- uses: conda-incubator/setup-miniconda@v3
continue-on-error: true
id: conda1
with:
activate-environment: tvm-build
channel-priority: strict
environment-file: conda/build-environment.yaml
auto-activate-base: false
conda-solver: classic
use-only-tar-bz2: true
python-version: 3.7
python-version: 3.9
condarc-file: conda/condarc
- uses: conda-incubator/setup-miniconda@v2
- uses: conda-incubator/setup-miniconda@v3
if: steps.conda1.outcome == 'failure'
with:
activate-environment: tvm-build
channel-priority: strict
environment-file: conda/build-environment.yaml
auto-activate-base: false
conda-solver: classic
use-only-tar-bz2: true
python-version: 3.7
python-version: 3.9
condarc-file: conda/condarc
- name: Conda info
shell: pwsh
run: |
conda info
conda list
conda info --envs
conda list --name base
22 changes: 12 additions & 10 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,17 @@ jobs:
- name: Minimal Metal Compile-and-Run
shell: bash -l {0}
run: >-
python -m pytest -v -s 'tests/python/codegen/test_target_codegen_metal.py'
python -m pytest -v -s 'tests/python/codegen/test_target_codegen_gpu_common.py'
python -m pytest -v -s 'tests/python/codegen/test_gpu_codegen_allreduce.py::test_allreduce_sum[dims0-metal]'
- name: Test iOS RPC
shell: bash -l {0}
run: >-
python -m pip install tornado psutil cloudpickle &&
export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
export BUNDLE_ID=org.apache.tvmrpc &&
export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
python -m pytest -v tests/python/contrib/test_rpc_server_device.py
# - name: Test iOS RPC
# shell: bash -l {0}
# run: >-
# python -m pip install tornado psutil cloudpickle &&
# export PYTHONPATH=tests/python/contrib:${PYTHONPATH} &&
# export BUNDLE_ID=org.apache.tvmrpc &&
# export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app &&
# python -m pytest -v tests/python/contrib/test_rpc_server_device.py

Windows:
if: ${{ github.repository == 'apache/tvm' }}
Expand Down Expand Up @@ -173,7 +175,7 @@ jobs:
export PATH="${ANDROID_NDK_LATEST_HOME}:$PATH"
gradle clean build
- name: Upload android_rpc APK
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: android_rpc-debug.apk
path: ./apps/android_rpc/app/build/outputs/apk/debug/app-debug.apk
Expand All @@ -184,7 +186,7 @@ jobs:
export PATH="${ANDROID_NDK_LATEST_HOME}:$PATH"
gradle clean build
- name: Upload android_deploy APK
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: android_deploy-debug.apk
path: ./apps/android_deploy/app/build/outputs/apk/debug/app-debug.apk
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 1843 files
2 changes: 1 addition & 1 deletion 3rdparty/flashinfer
Submodule flashinfer updated 168 files
107 changes: 107 additions & 0 deletions 3rdparty/mscclpp/include/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCL_COMMON_HPP_
#define MSCCL_COMMON_HPP_

// Platform-dependent warp configuration.
// WARP_SIZE is 32 unless building for the HIP/AMD platform, where it is 64.
#if !defined(__HIP_PLATFORM_AMD__)
#define WARP_SIZE 32
#else
#define WARP_SIZE 64
// HIP/AMD builds map the zero-argument __syncwarp() onto the AMDGCN wave
// barrier builtin.
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#endif

// Number of ranks per node.
constexpr int NRANKS_PER_NODE = 8;
// Scratch size: 70MB = 35 thread-blocks * 8 ranks * 256KB.
constexpr int SCRATCH_SIZE = 70 * 1024 * 1024;

// Reinterprets the object representation of `src` as a value of type `To`.
// Both types must have the same size; this is enforced at compile time.
template <typename To, typename From>
__forceinline__ __device__ To bit_cast(const From& src) {
  static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast");

  // Write through one union member and read back through the other.
  union Punner {
    From source;
    To target;
  };
  Punner punner;
  punner.source = src;
  return punner.target;
}

// Generic element addition: returns `a + b` using T's operator+.
template <typename T>
__forceinline__ __device__ T add_elements(T a, T b) {
  T sum = a + b;
  return sum;
}

// __half2 specialization: adds the two operands with the __hadd2 intrinsic.
template <>
__forceinline__ __device__ __half2 add_elements<__half2>(__half2 a, __half2 b) {
  const __half2 sum = __hadd2(a, b);
  return sum;
}

// Lane-wise addition of two int4 values.
// Each 32-bit lane (x, y, z, w) is reinterpreted as a T, the two T values are
// added with add_elements, and the result is reinterpreted back to int.
template <typename T>
__forceinline__ __device__ int4 add_vectors_helper(int4 a, int4 b) {
  int4 sum;
  // The four lanes are independent of each other.
  sum.x = bit_cast<int>(add_elements<T>(bit_cast<T>(a.x), bit_cast<T>(b.x)));
  sum.y = bit_cast<int>(add_elements<T>(bit_cast<T>(a.y), bit_cast<T>(b.y)));
  sum.z = bit_cast<int>(add_elements<T>(bit_cast<T>(a.z), bit_cast<T>(b.z)));
  sum.w = bit_cast<int>(add_elements<T>(bit_cast<T>(a.w), bit_cast<T>(b.w)));
  return sum;
}

// Adds two int4 values lane by lane, interpreting every 32-bit lane as a T.
template <typename T>
__forceinline__ __device__ int4 add_vectors(int4 a, int4 b) {
  const int4 sum = add_vectors_helper<T>(a, b);
  return sum;
}

// __half specialization for int4: every 32-bit lane is handled as a __half2
// by forwarding to the helper instantiated with __half2.
template <>
__forceinline__ __device__ int4 add_vectors<__half>(int4 a, int4 b) {
  const int4 sum = add_vectors_helper<__half2>(a, b);
  return sum;
}

// Lane-wise addition of two uint2 values.
// Each 32-bit lane (x, y) is reinterpreted as a T, the two T values are added
// with add_elements, and the result is reinterpreted back to the lane type.
//
// The lanes of uint2 are `unsigned int`, so the bit casts use `unsigned int`
// directly instead of round-tripping through signed `int`. This avoids the
// implicit unsigned -> signed -> unsigned conversions and matches how the
// uint32_t overload of add_vectors_helper is written.
template <typename T>
__forceinline__ __device__ uint2 add_vectors_helper(uint2 a, uint2 b) {
  uint2 ret;
  ret.x = bit_cast<unsigned int, T>(
      add_elements(bit_cast<T, unsigned int>(a.x), bit_cast<T, unsigned int>(b.x)));
  ret.y = bit_cast<unsigned int, T>(
      add_elements(bit_cast<T, unsigned int>(a.y), bit_cast<T, unsigned int>(b.y)));
  return ret;
}

// Adds two uint2 values lane by lane, interpreting every 32-bit lane as a T.
template <typename T>
__forceinline__ __device__ uint2 add_vectors(uint2 a, uint2 b) {
  const uint2 sum = add_vectors_helper<T>(a, b);
  return sum;
}

// __half specialization for uint2: every 32-bit lane is handled as a __half2
// by forwarding to the helper instantiated with __half2.
template <>
__forceinline__ __device__ uint2 add_vectors<__half>(uint2 a, uint2 b) {
  const uint2 sum = add_vectors_helper<__half2>(a, b);
  return sum;
}

// Adds two 32-bit ints whose bits are reinterpreted as values of type T.
// The T-typed sum is reinterpreted back to int before being returned.
template <typename T>
__forceinline__ __device__ int add_vectors_helper(int a, int b) {
  const T lhs = bit_cast<T>(a);
  const T rhs = bit_cast<T>(b);
  return bit_cast<int>(add_elements<T>(lhs, rhs));
}

// Adds two ints, interpreting the 32 bits of each as a value of type T.
template <typename T>
__forceinline__ __device__ int add_vectors(int a, int b) {
  const int sum = add_vectors_helper<T>(a, b);
  return sum;
}

// __half specialization for int: the 32-bit value is handled as a __half2
// by forwarding to the helper instantiated with __half2.
template <>
__forceinline__ __device__ int add_vectors<__half>(int a, int b) {
  const int sum = add_vectors_helper<__half2>(a, b);
  return sum;
}

// Adds two uint32_t values whose bits are reinterpreted as values of type T.
// The T-typed sum is reinterpreted back to uint32_t before being returned.
template <typename T>
__forceinline__ __device__ uint32_t add_vectors_helper(uint32_t a, uint32_t b) {
  const T lhs = bit_cast<T>(a);
  const T rhs = bit_cast<T>(b);
  return bit_cast<uint32_t>(add_elements<T>(lhs, rhs));
}

// Adds two uint32_t values, interpreting the 32 bits of each as a value of
// type T.
template <typename T>
__forceinline__ __device__ uint32_t add_vectors(uint32_t a, uint32_t b) {
  const uint32_t sum = add_vectors_helper<T>(a, b);
  return sum;
}

// __half specialization for uint32_t: the 32-bit value is handled as a
// __half2 by forwarding to the helper instantiated with __half2.
template <>
__forceinline__ __device__ uint32_t add_vectors<__half>(uint32_t a, uint32_t b) {
  const uint32_t sum = add_vectors_helper<__half2>(a, b);
  return sum;
}

#endif // MSCCL_COMMON_HPP_
Loading

0 comments on commit beac1b2

Please sign in to comment.