Bugfix _mapreducedim! and mul! (#22)
* Bugfix _mapreducedim! and mul!

- Fixes an error when one of the reduction dimensions is 0 while `initop` is `nothing`, by adding a special case.
- Fixes an error when multiplying over an empty dimension (see the sketch below).

* Formatter

* fuse conditions

* Revert to not using blasstrides

Fixed via the StridedViews `_normalizestrides` convention.

* Add manual CI trigger
lkdvos authored May 25, 2023
1 parent f7e2ec9 commit 1aabb3f
Showing 8 changed files with 75 additions and 50 deletions.
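Before the per-file diffs, a minimal sketch of the two failure modes this commit addresses (illustrative only, not part of the commit; it assumes that full reductions and five-argument `mul!` on `StridedView`s reach the patched `_mapreducedim!` and `__mul!` code paths):

using Strided, LinearAlgebra

# (1) a reduction dimension of size 0: this previously could reach
#     map!(initop, ...) with initop === nothing and error out
A = StridedView(randn(4, 0))
sum(A)                     # expected: 0.0, with no initialization needed

# (2) multiplication over an empty contraction dimension: A * B is
#     exactly zero, so mul! must reduce to scaling C by β
B = StridedView(randn(0, 4))
C = StridedView(randn(4, 4))
mul!(C, A, B, 1.0, 0.5)    # expected: 0.5 .* (the old C)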
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -3,6 +3,7 @@ on:
   push:
     branches:
       - main
+      - workflow_dispatch
     tags: ['*']
   pull_request:
 concurrency:
80 changes: 45 additions & 35 deletions benchmarks/benchtests.jl
@@ -6,7 +6,7 @@ using TensorOperations
 using Tullio
 using LoopVectorization
 
-sizes = ceil.(Int, 2 .^(2:1.5:20))
+sizes = ceil.(Int, 2 .^ (2:1.5:20))
 
 function benchmark_sum(sizes)
     times = zeros(length(sizes), 3)
@@ -22,8 +22,7 @@ function benchmark_sum(sizes)
     return times
 end
 
-
-function benchmark_permute(sizes, p = (4,3,2,1))
+function benchmark_permute(sizes, p=(4, 3, 2, 1))
     times = zeros(length(sizes), 4)
     for (i, s) in enumerate(sizes)
         A = randn(Float64, s .* one.(p))
@@ -38,84 +38,95 @@ function benchmark_permute(sizes, p = (4,3,2,1))
     end
     return times
 end
-permute_times1 = benchmark_permute(sizes, (4,3,2,1))
-permute_times2 = benchmark_permute(sizes, (2,3,4,1))
-permute_times3 = benchmark_permute(sizes, (3,4,1,2))
+permute_times1 = benchmark_permute(sizes, (4, 3, 2, 1))
+permute_times2 = benchmark_permute(sizes, (2, 3, 4, 1))
+permute_times3 = benchmark_permute(sizes, (3, 4, 1, 2))
 
-function benchmark_mul(sizesm, sizesk = sizesm, sizesn = sizesm)
+function benchmark_mul(sizesm, sizesk=sizesm, sizesn=sizesm)
     N = Threads.nthreads()
     @assert length(sizesm) == length(sizesk) == length(sizesn)
     times = zeros(length(sizesm), 4)
-    @inbounds for i = 1:length(sizesm)
+    @inbounds for i in 1:length(sizesm)
         m = sizesm[i]
         k = sizesk[i]
         n = sizesn[i]
-        A = randn(Float64, (m,k))
-        B = randn(Float64, (k,n))
-        C = randn(Float64, (m,n))
+        A = randn(Float64, (m, k))
+        B = randn(Float64, (k, n))
+        C = randn(Float64, (m, n))
 
         BLAS.set_num_threads(1) # base case: single-threaded blas
-        times[i, 1] = @belapsed mul!($C,$A,$B)
+        times[i, 1] = @belapsed mul!($C, $A, $B)
         BLAS.set_num_threads(N) # multithreaded blas
-        times[i, 2] = @belapsed mul!($C,$A,$B)
+        times[i, 2] = @belapsed mul!($C, $A, $B)
         Strided.disable_threaded_mul() # same, except for small overhead from strided
-        times[i, 3] = @belapsed @strided mul!($C,$A,$B)
+        times[i, 3] = @belapsed @strided mul!($C, $A, $B)
         BLAS.set_num_threads(1) # single-threaded blas with strided multithreading
         Strided.enable_threaded_mul()
-        times[i, 4] = @belapsed @strided mul!($C,$A,$B)
+        times[i, 4] = @belapsed @strided mul!($C, $A, $B)
         println("step $i: sizes $((m,k,n)) => times = $(times[i, :])")
     end
     return times
 end
 
-function tensorcontraction!(wEnv, hamAB,hamBA,rhoBA,rhoAB,w,v,u)
-    @tensor wEnv[-1,-2,-3] =
-        hamAB[7,8,-1,9]*rhoBA[4,3,-3,2]*conj(w[7,5,4])*u[9,10,-2,11]*conj(u[8,10,5,6])*v[1,11,2]*conj(v[1,6,3]) +
-        hamBA[1,2,3,4]*rhoBA[10,7,-3,6]*conj(w[-1,11,10])*u[3,4,-2,8]*conj(u[1,2,11,9])*v[5,8,6]*conj(v[5,9,7]) +
-        hamAB[5,7,3,1]*rhoBA[10,9,-3,8]*conj(w[-1,11,10])*u[4,3,-2,2]*conj(u[4,5,11,6])*v[1,2,8]*conj(v[7,6,9]) +
-        hamBA[3,7,2,-1]*rhoAB[5,6,4,-3]*v[2,1,4]*conj(v[3,1,5])*conj(w[7,-2,6])
+function tensorcontraction!(wEnv, hamAB, hamBA, rhoBA, rhoAB, w, v, u)
+    @tensor wEnv[-1, -2, -3] = hamAB[7, 8, -1, 9] * rhoBA[4, 3, -3, 2] * conj(w[7, 5, 4]) *
+                               u[9, 10, -2, 11] * conj(u[8, 10, 5, 6]) * v[1, 11, 2] *
+                               conj(v[1, 6, 3]) +
+                               hamBA[1, 2, 3, 4] * rhoBA[10, 7, -3, 6] *
+                               conj(w[-1, 11, 10]) * u[3, 4, -2, 8] * conj(u[1, 2, 11, 9]) *
+                               v[5, 8, 6] * conj(v[5, 9, 7]) +
+                               hamAB[5, 7, 3, 1] * rhoBA[10, 9, -3, 8] *
+                               conj(w[-1, 11, 10]) * u[4, 3, -2, 2] * conj(u[4, 5, 11, 6]) *
+                               v[1, 2, 8] * conj(v[7, 6, 9]) +
+                               hamBA[3, 7, 2, -1] * rhoAB[5, 6, 4, -3] * v[2, 1, 4] *
+                               conj(v[3, 1, 5]) * conj(w[7, -2, 6])
     return wEnv
 end
 
 function benchmark_tensorcontraction(sizes)
     N = Threads.nthreads()
     times = zeros(length(sizes), 5)
     @inbounds for (i, s) in enumerate(sizes)
-        hamAB = randn(Float64, (s,s,s,s))
-        hamBA = randn(Float64, (s,s,s,s))
-        rhoAB = randn(Float64, (s,s,s,s))
-        rhoBA = randn(Float64, (s,s,s,s))
-        v = randn(Float64, (s,s,s))
-        w = randn(Float64, (s,s,s))
-        u = randn(Float64, (s,s,s,s))
-        wEnv = randn(Float64, (s,s,s))
+        hamAB = randn(Float64, (s, s, s, s))
+        hamBA = randn(Float64, (s, s, s, s))
+        rhoAB = randn(Float64, (s, s, s, s))
+        rhoBA = randn(Float64, (s, s, s, s))
+        v = randn(Float64, (s, s, s))
+        w = randn(Float64, (s, s, s))
+        u = randn(Float64, (s, s, s, s))
+        wEnv = randn(Float64, (s, s, s))
 
-        tensorcontraction!(wEnv, hamAB,hamBA,rhoBA,rhoAB,w,v,u)
+        tensorcontraction!(wEnv, hamAB, hamBA, rhoBA, rhoAB, w, v, u)
 
         BLAS.set_num_threads(1)
         Strided.disable_threads()
        Strided.disable_threaded_mul()
-        times[i,1] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 1] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(1)
         Strided.enable_threads()
         Strided.disable_threaded_mul()
-        times[i,2] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 2] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(N)
         Strided.disable_threads()
         Strided.disable_threaded_mul()
-        times[i,3] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 3] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(N)
         Strided.enable_threads()
         Strided.disable_threaded_mul()
-        times[i,4] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 4] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(1)
         Strided.enable_threads()
         Strided.enable_threaded_mul()
-        times[i,5] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 5] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         println("step $i: size $s => times = $(times[i, :])")
     end
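A hypothetical driver for these benchmarks (not part of the diff; `benchmark_sum`'s body and the real invocation lines fall outside the shown hunks, and the size ranges here are deliberately small so the matrix and tensor dimensions stay tractable):

sum_times = benchmark_sum(sizes)
mul_times = benchmark_mul(sizes[1:6])            # square matmul, sizes up to ~725
tc_times = benchmark_tensorcontraction(2:2:10)   # rank-4 tensors, sizes up to 10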
9 changes: 0 additions & 9 deletions src/linalg.jl
@@ -160,12 +160,3 @@ function __mul!(C::StridedView{<:Any,2}, A::StridedView{<:Any,2}, B::StridedView
     end
     return C
 end
-
-# function blasstrides(a::StridedView{T,2,A,F}) where {T,A<:DenseArray,F}
-#     # canonicalize strides to make compatible with gemm
-#     if size(a, 2) == 1 && stride(a, 1) == 1
-#         return StridedView{T,2,A,F}(a.parent, a.size, (1, size(a, 1)), a.offset, a.op)
-#     else
-#         return a
-#     end
-# end
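Context for this deletion (a sketch under assumptions, not repository code): BLAS gemm requires stride(a, 1) == 1 and stride(a, 2) >= size(a, 1), which an n×1 view of a vector can violate when it carries strides (1, 1). The commented-out `blasstrides` rewrote such strides to (1, n); per the commit message, StridedViews' `_normalizestrides` convention now yields gemm-compatible strides directly, making the helper dead code. Assuming `sreshape` from StridedViews:

using StridedViews

a = sreshape(StridedView(randn(4)), (4, 1))   # an n×1 StridedView of a vector
strides(a)   # expected (1, 4) under the _normalizestrides convention,
             # i.e. already acceptable to BLAS gemm without a blasstrides pass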
3 changes: 2 additions & 1 deletion src/macros.jl
@@ -34,7 +34,8 @@ maybestrided(A::Tuple) = maybestrided.(A)
 maybestrided(A) = A
 function maybeunstrided(A::StridedView)
     Ap = A.parent
-    if size(A) == size(Ap) && strides(A) == strides(Ap) && offset(A) == 0 && A.op == identity
+    if size(A) == size(Ap) && strides(A) == strides(Ap) && offset(A) == 0 &&
+       A.op == identity
         return Ap
     else
         return reshape(copy(A).parent, size(A))
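In words: when the view is trivial (same size and strides as the parent, zero offset, identity op), the parent array is handed back untouched; otherwise the view is materialized into a fresh Array. A hypothetical use of this internal, unexported helper:

using Strided

A = randn(3, 4)
Strided.maybeunstrided(StridedView(A)) === A   # trivial view: parent returned
B = permutedims(StridedView(A), (2, 1))        # lazy permute: nontrivial strides
Strided.maybeunstrided(B) isa Array            # true: materialized as a copy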
2 changes: 1 addition & 1 deletion src/mapreduce.jl
@@ -86,7 +86,7 @@ end
 function _mapreducedim!(@nospecialize(f), @nospecialize(op), @nospecialize(initop),
                         dims::Dims, arrays::Tuple{Vararg{StridedView}})
     if any(isequal(0), dims)
-        if !any(isequal(0), size(arrays[1]))
+        if length(arrays[1]) != 0 && !isnothing(initop)
             map!(initop, arrays[1], arrays[1])
         end
     else
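The repaired guard in words: when some reduction dimension is zero there is nothing to fold, and the destination arrays[1] should only be (re)initialized when it is nonempty and an initop was actually supplied; previously initop === nothing could slip through to map!(nothing, dest, dest) and error. A standalone restatement of just that branch (a sketch, with names as in the diff):

# returns true when the empty-reduction case was handled completely
function handle_empty_reduction!(initop, dest, dims)
    if any(isequal(0), dims)                       # empty iteration space
        if length(dest) != 0 && !isnothing(initop)
            map!(initop, dest, dest)               # e.g. initop = zero clears dest
        end
        return true
    end
    return false                                   # real reduction still needed
end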
12 changes: 12 additions & 0 deletions test/blasmultests.jl
@@ -84,3 +84,15 @@ for T1 in (Float32, Float64, Complex{Float32}, Complex{Float64})
         end
     end
 end
+
+@testset "Matrix multiplication with length 0" begin
+    A = rand(2, 0)
+    B = rand(0, 2)
+    C = rand(2, 2)
+    α = rand()
+    β = rand()
+    A1 = StridedView(copy(A))
+    B1 = StridedView(copy(B))
+    C1 = StridedView(copy(C))
+    @test mul!(C, A, B, α, β) ≈ mul!(C1, A1, B1, α, β)
+end
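What the new test pins down, spelled out with plain arrays (illustrative, not part of the test file): with a contraction dimension of length 0, A * B is exactly the 2×2 zero matrix, so the five-argument mul! must degenerate to scaling C by β:

using LinearAlgebra

A, B, C = rand(2, 0), rand(0, 2), rand(2, 2)
α, β = rand(), rand()
C0 = copy(C)
mul!(C, A, B, α, β)
C ≈ β .* C0   # true: the α * A * B term contributes nothing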
14 changes: 12 additions & 2 deletions test/othertests.jl
@@ -179,7 +179,7 @@ end
           (@strided(B2' .* B3 .- max.(abs.(B1), real.(B3b)))) ≈
           view(A2, :, 3)' .* reshape(view(A3, 1:5, :, :), 5, 10, 5, 2) .-
           max.(abs.(view(A1, 1:5)), real.(view(A3, 4:4, 4:4, 2:2:10)))
-
+
     x = @strided begin
         p = :A => A1
         f = pair -> (pair.first, pair.second)
@@ -280,10 +280,20 @@
                 @test B3 ≈ op3(α * op1(A1) * op2(A2)) # op3 is its own inverse
                 copyto!(B3, B4)
                 mul!(op3(B3), op1(B1), op2(B2))
-                @test B3 ≈ op3( op1(A1) * op2(A2)) # op3 is its own inverse
+                @test B3 ≈ op3(op1(A1) * op2(A2)) # op3 is its own inverse
             end
         end
     end
+
+    A = map(complex, rand(-100:100, (2, 0)), rand(-100:100, (2, 0)))
+    B = map(complex, rand(-100:100, (0, 2)), rand(-100:100, (0, 2)))
+    C = map(complex, rand(-100:100, (2, 2)), rand(-100:100, (2, 2)))
+    α = complex(rand(-100:100), rand(-100:100))
+    β = one(eltype(C))
+    A1 = StridedView(copy(A))
+    B1 = StridedView(copy(B))
+    C1 = StridedView(copy(C))
+    @test mul!(C, A, B, α, β) ≈ mul!(C1, A1, B1, α, β)
 end
 
 @testset "multiplication with StridedView: Rational{Int}" begin
4 changes: 2 additions & 2 deletions test/runtests.jl
@@ -15,7 +15,7 @@ include("blasmultests.jl")
 
 println("Running tests multi-threaded:")
 Strided.enable_threads()
-Strided.set_num_threads(Base.Threads.nthreads()+1)
+Strided.set_num_threads(Base.Threads.nthreads() + 1)
 include("othertests.jl")
 include("blasmultests.jl")

@@ -24,4 +24,4 @@ include("blasmultests.jl")
 Strided.disable_threaded_mul()
 
 using Aqua
-Aqua.test_all(Strided)
\ No newline at end of file
+Aqua.test_all(Strided)
