包含以下内容:
- histogram_i32_kernel
- histogram_i32x4_kernel(int4向量化版本)
- PyTorch bindings
# 只测试 Ada 架构;若不指定 TORCH_CUDA_ARCH_LIST,默认会编译所有架构(Volta, Ampere, Ada, Hopper, ...),耗时较长
export TORCH_CUDA_ARCH_LIST=Ada
python3 histogram.py
输出:
--------------------------------------------------------------------------------
h_i32 0: 1000
h_i32 1: 1000
h_i32 2: 1000
h_i32 3: 1000
h_i32 4: 1000
h_i32 5: 1000
h_i32 6: 1000
h_i32 7: 1000
h_i32 8: 1000
h_i32 9: 1000
--------------------------------------------------------------------------------
h_i32x4 0: 1000
h_i32x4 1: 1000
h_i32x4 2: 1000
h_i32x4 3: 1000
h_i32x4 4: 1000
h_i32x4 5: 1000
h_i32x4 6: 1000
h_i32x4 7: 1000
h_i32x4 8: 1000
h_i32x4 9: 1000
--------------------------------------------------------------------------------