From b84f7ecc0a0613442beec96fee11fa38be57d12c Mon Sep 17 00:00:00 2001
From: GiggleLiu <cacate0129@gmail.com>
Date: Wed, 1 Jun 2022 12:30:36 -0400
Subject: [PATCH] rm benchmark script

---
 benchmark/permutedims.jl | 79 ----------------------------------------
 1 file changed, 79 deletions(-)
 delete mode 100644 benchmark/permutedims.jl

diff --git a/benchmark/permutedims.jl b/benchmark/permutedims.jl
deleted file mode 100644
index 6f79a32..0000000
--- a/benchmark/permutedims.jl
+++ /dev/null
@@ -1,79 +0,0 @@
-using CUDA, Random, LinearAlgebra
-using CUDA: AbstractGPUArray, @linearidx, gpu_call
-using Base.Cartesian: @nexprs
-using BenchmarkTools
-
-function LinearAlgebra.permutedims!(dest::AbstractGPUArray, src::AbstractGPUArray,
-                                    perm::NTuple{N}) where N
-    @assert length(src) < typemax(Int32)
-    Base.checkdims_perm(dest, src, perm)
-
-    # get the new strides of destination tensor
-    dest_strides = ntuple(k->k==1 ? 1 : prod(i->size(dest, i), 1:k-1), N)
-    dest_strides_perm = ntuple(i->Int32(dest_strides[findfirst(==(i), perm)]), N)
-    size_src = Int32.(size(src))
-
-    function permutedims_kernel(ctx, dest, src, size_src, dest_strides_perm)
-        # find the cartesian index in source tensor
-        LI = @linearidx src
-        I = @inbounds CartesianIndices(size_src)[LI]
-
-        # the corresponding linear index in the destination tensor
-        dest_index = map_index(I.I, dest_strides_perm)
-        @inbounds dest[dest_index] = src[LI]
-        return
-    end
-    gpu_call(permutedims_kernel, vec(dest), vec(src), size_src, dest_strides_perm)
-    return dest
-end
-
-# get linear index from cartesian indices and strides.
-@inline @generated function map_index(I::NTuple{N,T}, dest_strides::NTuple{N,T}) where {N,T}
-    Expr(:call, :+, one(T), [:(@inbounds (I[$i]-one(T)) * dest_strides[$i]) for i in 1:N]...)
-end
-
-@inline @generated function permute_linearindex(size::NTuple{N,T}, l::T, strides::NTuple{N,T}) where {N,T}
-    quote
-        l -= one(T)
-        res = one(T)
-        @nexprs $(N-1) i->begin
-            @inbounds l, s = divrem(l, size[i])
-            @inbounds res += s * strides[i]
-        end
-        return @inbounds res + strides[N] * l
-    end
-end
-function permutedims2!(dest::AbstractGPUArray, src::AbstractGPUArray,
-                                    perm::NTuple{N}) where N
-    @assert length(src) < typemax(Int32)
-    Base.checkdims_perm(dest, src, perm)
-    dest_strides = ntuple(k->k==1 ? 1 : prod(i->size(dest, i), 1:k-1), N)
-    dest_strides_perm = ntuple(i->Int32(dest_strides[findfirst(==(i), perm)]), N)
-    size_src = Int32.(size(src))
-    LEN = Int32(length(src))
-    function permutedims_kernel(ctx, dest, src, size_src, dest_strides_perm, LEN)
-        LI = (blockIdx().x-one(Int32)) * blockDim().x + threadIdx().x
-        LI > LEN && return
-        dest_index = permute_linearindex(size_src, LI, dest_strides_perm)
-        @inbounds dest[dest_index] = src[LI]
-        return
-    end
-    gpu_call(permutedims_kernel, vec(dest), vec(src), size_src, dest_strides_perm, LEN)
-    return dest
-end
-
-let D = 24
-    a = CUDA.randn(fill(2, D)...);
-    b = CUDA.randn(fill(2, D)...);
-    pm = (randperm(D)...,);
-    @benchmark CUDA.@sync permutedims2!($b, $a, $pm)
-    #CUDA.@sync permutedims!(b, a, pm)
-    #CUDA.@sync permutedims!(b, a, pm)
-end
-
-let D = 4
-    a = CUDA.randn(fill(2, D)...);
-    b = CUDA.randn(fill(2, D)...);
-    pm = (randperm(D)...,);
-    CUDA.@device_code_llvm permutedims2!(b, a, pm)
-end