under-Peter · GiggleLiu · Jun 1, 2022 · Apr 19, 2022 · Apr 19, 2022 · Apr 20, 2022
diff --git a/Project.toml b/Project.toml
@@ -18,7 +18,7 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 [compat]
 AbstractTrees = "0.3"
 BatchedRoutines = "0.2"
-CUDA = "3.5"
+CUDA = "3.10"
 ChainRulesCore = "1"
 Combinatorics = "1.0"
 MacroTools = "0.5"

diff --git a/src/autodiff.jl b/src/autodiff.jl
@@ -32,7 +32,8 @@ end
 function ChainRulesCore.rrule(::typeof(einsum), code::EinCode, @nospecialize(xs), size_dict)
     y = einsum(code, xs, size_dict)
     function einsum_pullback(dy)
-        dxs = ChainRulesCore.@thunk ntuple(i -> einsum_grad(getixs(code), xs, getiy(code), size_dict, map(conj, dy), i), length(xs))
+        dy_conv = convert(typeof(y), dy)  # for filled array/cuarray et al.; bound to a fresh name because reassigning the argument `dy` while the thunk's closure captures it would force it into a `Core.Box`
+        dxs = ChainRulesCore.@thunk ntuple(i -> einsum_grad(getixs(code), xs, getiy(code), size_dict, conj(dy_conv), i), length(xs))
         return (NoTangent(), NoTangent(), dxs, NoTangent())
     end
     einsum_pullback(::NoTangent) = (NoTangent(), NoTangent(), NoTangent(), NoTangent())

diff --git a/src/cueinsum.jl b/src/cueinsum.jl
@@ -1,5 +1,10 @@
 using .CUDA
 
+const CUDAArrayTypes{T,N} = Union{DenseCuArray{T,N}, LinearAlgebra.Adjoint{T,<:CuArray{T,N}}, LinearAlgebra.Transpose{T,<:CuArray{T,N}}}  # GPU arrays accepted by the einsum methods in this file: plain, or wrapped in Adjoint/Transpose
+_unwrap(x::CuArray) = x  # already a CuArray: returned as-is
+_unwrap(x::LinearAlgebra.Transpose{T,<:CuArray{T}}) where T = CuArray(x)  # build a CuArray from the transpose wrapper
+_unwrap(x::LinearAlgebra.Adjoint{T,<:CuArray{T}}) where T = CuArray(x)  # build a CuArray from the adjoint wrapper
+
 asarray(x, arr::CuArray) where T = CuArray(fill(x, ()))
 asarray(x::AbstractArray, y::CuArray) = x
 asscalar(x::DenseCuArray) = Array(x)[]
@@ -9,6 +14,9 @@ Base.Array(x::Base.ReshapedArray{T,0,<:CuArray}) where T = Array(x.parent)
 function get_output_array(xs::NTuple{N, DenseCuArray{<:Any,M} where M}, size; has_repeated_indices=true) where N
     CUDA.zeros(promote_type(map(eltype,xs)...), size...)
 end
+function get_output_array(xs::NTuple{N, DenseCuArray{T,M} where M}, size; has_repeated_indices=true) where {T,N}
+    return CUDA.zeros(T, size...)  # every input shares eltype `T`, so it is used directly; the buffer is always zero-filled (`has_repeated_indices` is not consulted here)
+end
 
 CUDA.cudaconvert(A::EinArray{T}) where T = EinArray{T}(cudaconvert.(A.xs), A.x_indexers, A.y_indexer, A.size, A.ICIS, A.OCIS)
 CUDA.cu(A::EinArray{T}) where T = EinArray{T}(cu.(A.xs), A.x_indexers, A.y_indexer, A.size, A.ICIS, A.OCIS)
@@ -81,7 +89,7 @@ end
 
 Base.ndims(::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{0}}) = 0
 
-function einsum(neinsum::NestedEinsum, @nospecialize(xs::NTuple{N,DenseCuArray} where N), size_dict::Dict; active_free=false)
+function einsum(neinsum::NestedEinsum, @nospecialize(xs::NTuple{N,CUDAArrayTypes} where N), size_dict::Dict; active_free=false)
     # do not use map because the static overhead is too large
     # do not use `setindex!` because we need to make the AD work
     mxs = Vector{AbstractArray}(undef, length(neinsum.args))
@@ -95,4 +103,15 @@ function einsum(neinsum::NestedEinsum, @nospecialize(xs::NTuple{N,DenseCuArray}
     return res
 end
 
+# Accept Adjoint/Transpose-wrapped CuArrays: unwrap them to plain CuArrays before calling the method selected by `match_rule`.
+@generated function einsum(code::StaticEinCode{ixs, iy}, xs::NTuple{N,CUDAArrayTypes} where N, size_dict::Dict{LT}) where {LT, ixs, iy}
+    matched = match_rule(ixs, iy)  # `ixs`/`iy` are type parameters, so the rule is looked up while the method body is being generated
+    return :(einsum($matched, $ixs, $iy, map(_unwrap, xs), size_dict))  # generated body: unwrap each input, then call the five-argument `einsum` with the matched rule
+end
+
+function einsum(code::DynamicEinCode, @nospecialize(xs::NTuple{N,CUDAArrayTypes} where N), size_dict::Dict)
+    ixs, iy = getixs(code), getiy(code)  # input and output index labels read from the code object
+    return einsum(match_rule(ixs, iy), ixs, iy, map(_unwrap, xs), size_dict)  # unwrap each input, then call the five-argument `einsum` with the matched rule
+end
+
 @info("OMEinsum loaded the CUDA module successfully")
diff --git a/src/loop_einsum.jl b/src/loop_einsum.jl
@@ -42,7 +42,15 @@ function reduce_einarray!(A::EinArray{T}, y) where T
     y
 end
 
-@inline function get_output_array(xs::NTuple{N, AbstractArray{<:Any,M} where M}, size; has_repeated_indices=true) where N
+# Fast path for `get_output_array` when all inputs share one element type `T`: no `promote_type` call is needed.
+function get_output_array(xs::NTuple{N, AbstractArray{T,M} where M}, size; has_repeated_indices=true) where {T,N}
+    # All inputs share the element type `T`, so it is used for the output directly.
+    # The buffer is zero-filled when `has_repeated_indices` is true and is left
+    # uninitialized (`undef`) otherwise.
+    has_repeated_indices && return zeros(T, size...)
+    return Array{T}(undef, size...)
+end
+function get_output_array(xs::NTuple{N, AbstractArray{<:Any,M} where M}, size; has_repeated_indices=true) where N
     if has_repeated_indices
         zeros(promote_type(map(eltype,xs)...), size...)
     else

diff --git a/test/cueinsum.jl b/test/cueinsum.jl
@@ -122,4 +122,11 @@ end
     @test array_match(gradient(a->Array(einsum(EinCode(((1,2), (2,3)), ()), (a, b)))[] |> abs, a)[1], a)
     b = CUDA.randn(3,3)
     @test array_match(gradient(a->Array(einsum(EinCode(((1,2), (2,3)), ()), (a, b)))[] |> abs, a)[1], a)
+end
+
+@testset "adjoint dispatch" begin
+    u = CUDA.rand(2,2); A = CUDA.rand(2,2,2);  # `u'` in the tests below is the adjoint wrapper around `u`, not a plain CuArray
+    @test Array(ein"(ip,pql),qj -> ijl"(u', A, u)) ≈ ein"(ip,pql),qj -> ijl"(Array(CuArray(u')), Array(A), Array(u))  # parenthesized code: result with the wrapped GPU input vs. the same code on host arrays
+    @test Array(DynamicEinCode(ein"mk, ijk -> ijm")(u', A)) ≈ DynamicEinCode(ein"mk, ijk -> ijm")(Array(u'), Array(A))  # DynamicEinCode with the wrapped GPU input vs. the same code on host arrays
+    @test Array(ein"mk, ijk -> ijm"(u', A)) ≈ DynamicEinCode(ein"mk, ijk -> ijm")(Array(u'), Array(A))  # unconverted `ein"..."` code with the wrapped GPU input vs. DynamicEinCode on host arrays
 end