Bugfix _mapreducedim! and mul! (#22)
* Bugfix _mapreducedim! and mul!

- Fixes an error when one of the reduction dimensions is 0 while `initop` is `nothing`, by adding a special case.
- Fixes an error when multiplying over an empty dimension (see the sketch below).

* Formatter

* fuse conditions

* Revert to not using blasstrides

Fixed via the StridedViews `_normalizestrides` convention.

* Add manual CI trigger
lkdvos authored May 25, 2023
1 parent f7e2ec9 commit 1aabb3f
Showing 8 changed files with 75 additions and 50 deletions.
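Before the per-file diffs, a minimal sketch of the two failure modes this commit addresses (illustrative only, not part of the commit; it assumes that full reductions and five-argument `mul!` on `StridedView`s reach the patched `_mapreducedim!` and `__mul!` code paths):

using Strided, LinearAlgebra

# (1) a reduction dimension of size 0: this previously could reach
#     map!(initop, ...) with initop === nothing and error out
A = StridedView(randn(4, 0))
sum(A)                     # expected: 0.0, with no initialization needed

# (2) multiplication over an empty contraction dimension: A * B is
#     exactly zero, so mul! must reduce to scaling C by β
B = StridedView(randn(0, 4))
C = StridedView(randn(4, 4))
mul!(C, A, B, 1.0, 0.5)    # expected: 0.5 .* (the old C)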
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -3,6 +3,7 @@ on:
   push:
     branches:
       - main
+      - workflow_dispatch
     tags: ['*']
   pull_request:
 concurrency:
80 changes: 45 additions & 35 deletions benchmarks/benchtests.jl
@@ -6,7 +6,7 @@ using TensorOperations
 using Tullio
 using LoopVectorization
 
-sizes = ceil.(Int, 2 .^(2:1.5:20))
+sizes = ceil.(Int, 2 .^ (2:1.5:20))
 
 function benchmark_sum(sizes)
     times = zeros(length(sizes), 3)
@@ -22,8 +22,7 @@ function benchmark_sum(sizes)
     return times
 end
 
-
-function benchmark_permute(sizes, p = (4,3,2,1))
+function benchmark_permute(sizes, p=(4, 3, 2, 1))
     times = zeros(length(sizes), 4)
     for (i, s) in enumerate(sizes)
         A = randn(Float64, s .* one.(p))
@@ -38,84 +38,95 @@ function benchmark_permute(sizes, p = (4,3,2,1))
     end
     return times
 end
-permute_times1 = benchmark_permute(sizes, (4,3,2,1))
-permute_times2 = benchmark_permute(sizes, (2,3,4,1))
-permute_times3 = benchmark_permute(sizes, (3,4,1,2))
+permute_times1 = benchmark_permute(sizes, (4, 3, 2, 1))
+permute_times2 = benchmark_permute(sizes, (2, 3, 4, 1))
+permute_times3 = benchmark_permute(sizes, (3, 4, 1, 2))
 
-function benchmark_mul(sizesm, sizesk = sizesm, sizesn = sizesm)
+function benchmark_mul(sizesm, sizesk=sizesm, sizesn=sizesm)
     N = Threads.nthreads()
     @assert length(sizesm) == length(sizesk) == length(sizesn)
     times = zeros(length(sizesm), 4)
-    @inbounds for i = 1:length(sizesm)
+    @inbounds for i in 1:length(sizesm)
         m = sizesm[i]
         k = sizesk[i]
         n = sizesn[i]
-        A = randn(Float64, (m,k))
-        B = randn(Float64, (k,n))
-        C = randn(Float64, (m,n))
+        A = randn(Float64, (m, k))
+        B = randn(Float64, (k, n))
+        C = randn(Float64, (m, n))
 
         BLAS.set_num_threads(1) # base case: single-threaded blas
-        times[i, 1] = @belapsed mul!($C,$A,$B)
+        times[i, 1] = @belapsed mul!($C, $A, $B)
         BLAS.set_num_threads(N) # multithreaded blas
-        times[i, 2] = @belapsed mul!($C,$A,$B)
+        times[i, 2] = @belapsed mul!($C, $A, $B)
         Strided.disable_threaded_mul() # same, except for small overhead from strided
-        times[i, 3] = @belapsed @strided mul!($C,$A,$B)
+        times[i, 3] = @belapsed @strided mul!($C, $A, $B)
         BLAS.set_num_threads(1) # single-threaded blas with strided multithreading
         Strided.enable_threaded_mul()
-        times[i, 4] = @belapsed @strided mul!($C,$A,$B)
+        times[i, 4] = @belapsed @strided mul!($C, $A, $B)
         println("step $i: sizes $((m,k,n)) => times = $(times[i, :])")
     end
     return times
 end
 
-function tensorcontraction!(wEnv, hamAB,hamBA,rhoBA,rhoAB,w,v,u)
-    @tensor wEnv[-1,-2,-3] =
-        hamAB[7,8,-1,9]*rhoBA[4,3,-3,2]*conj(w[7,5,4])*u[9,10,-2,11]*conj(u[8,10,5,6])*v[1,11,2]*conj(v[1,6,3]) +
-        hamBA[1,2,3,4]*rhoBA[10,7,-3,6]*conj(w[-1,11,10])*u[3,4,-2,8]*conj(u[1,2,11,9])*v[5,8,6]*conj(v[5,9,7]) +
-        hamAB[5,7,3,1]*rhoBA[10,9,-3,8]*conj(w[-1,11,10])*u[4,3,-2,2]*conj(u[4,5,11,6])*v[1,2,8]*conj(v[7,6,9]) +
-        hamBA[3,7,2,-1]*rhoAB[5,6,4,-3]*v[2,1,4]*conj(v[3,1,5])*conj(w[7,-2,6])
+function tensorcontraction!(wEnv, hamAB, hamBA, rhoBA, rhoAB, w, v, u)
+    @tensor wEnv[-1, -2, -3] = hamAB[7, 8, -1, 9] * rhoBA[4, 3, -3, 2] * conj(w[7, 5, 4]) *
+                               u[9, 10, -2, 11] * conj(u[8, 10, 5, 6]) * v[1, 11, 2] *
+                               conj(v[1, 6, 3]) +
+                               hamBA[1, 2, 3, 4] * rhoBA[10, 7, -3, 6] *
+                               conj(w[-1, 11, 10]) * u[3, 4, -2, 8] * conj(u[1, 2, 11, 9]) *
+                               v[5, 8, 6] * conj(v[5, 9, 7]) +
+                               hamAB[5, 7, 3, 1] * rhoBA[10, 9, -3, 8] *
+                               conj(w[-1, 11, 10]) * u[4, 3, -2, 2] * conj(u[4, 5, 11, 6]) *
+                               v[1, 2, 8] * conj(v[7, 6, 9]) +
+                               hamBA[3, 7, 2, -1] * rhoAB[5, 6, 4, -3] * v[2, 1, 4] *
+                               conj(v[3, 1, 5]) * conj(w[7, -2, 6])
     return wEnv
 end
 
 function benchmark_tensorcontraction(sizes)
     N = Threads.nthreads()
     times = zeros(length(sizes), 5)
     @inbounds for (i, s) in enumerate(sizes)
-        hamAB = randn(Float64, (s,s,s,s))
-        hamBA = randn(Float64, (s,s,s,s))
-        rhoAB = randn(Float64, (s,s,s,s))
-        rhoBA = randn(Float64, (s,s,s,s))
-        v = randn(Float64, (s,s,s))
-        w = randn(Float64, (s,s,s))
-        u = randn(Float64, (s,s,s,s))
-        wEnv = randn(Float64, (s,s,s))
+        hamAB = randn(Float64, (s, s, s, s))
+        hamBA = randn(Float64, (s, s, s, s))
+        rhoAB = randn(Float64, (s, s, s, s))
+        rhoBA = randn(Float64, (s, s, s, s))
+        v = randn(Float64, (s, s, s))
+        w = randn(Float64, (s, s, s))
+        u = randn(Float64, (s, s, s, s))
+        wEnv = randn(Float64, (s, s, s))
 
-        tensorcontraction!(wEnv, hamAB,hamBA,rhoBA,rhoAB,w,v,u)
+        tensorcontraction!(wEnv, hamAB, hamBA, rhoBA, rhoAB, w, v, u)
 
         BLAS.set_num_threads(1)
         Strided.disable_threads()
        Strided.disable_threaded_mul()
-        times[i,1] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 1] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(1)
         Strided.enable_threads()
         Strided.disable_threaded_mul()
-        times[i,2] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 2] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(N)
         Strided.disable_threads()
         Strided.disable_threaded_mul()
-        times[i,3] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 3] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(N)
         Strided.enable_threads()
         Strided.disable_threaded_mul()
-        times[i,4] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 4] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         BLAS.set_num_threads(1)
         Strided.enable_threads()
         Strided.enable_threaded_mul()
-        times[i,5] = @belapsed tensorcontraction!($wEnv,$hamAB,$hamBA,$rhoBA,$rhoAB,$w,$v,$u)
+        times[i, 5] = @belapsed tensorcontraction!($wEnv, $hamAB, $hamBA, $rhoBA, $rhoAB,
+                                                   $w, $v, $u)
 
         println("step $i: size $s => times = $(times[i, :])")
     end
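A hypothetical driver for these benchmarks (not part of the diff; `benchmark_sum`'s body and the real invocation lines fall outside the shown hunks, and the size ranges here are deliberately small so the matrix and tensor dimensions stay tractable):

sum_times = benchmark_sum(sizes)
mul_times = benchmark_mul(sizes[1:6])            # square matmul, sizes up to ~725
tc_times = benchmark_tensorcontraction(2:2:10)   # rank-4 tensors, sizes up to 10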
9 changes: 0 additions & 9 deletions src/linalg.jl
@@ -160,12 +160,3 @@ function __mul!(C::StridedView{<:Any,2}, A::StridedView{<:Any,2}, B::StridedView
     end
     return C
 end
-
-# function blasstrides(a::StridedView{T,2,A,F}) where {T,A<:DenseArray,F}
-#     # canonicalize strides to make compatible with gemm
-#     if size(a, 2) == 1 && stride(a, 1) == 1
-#         return StridedView{T,2,A,F}(a.parent, a.size, (1, size(a, 1)), a.offset, a.op)
-#     else
-#         return a
-#     end
-# end
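Context for this deletion (a sketch under assumptions, not repository code): BLAS gemm requires stride(a, 1) == 1 and stride(a, 2) >= size(a, 1), which an n×1 view of a vector can violate when it carries strides (1, 1). The commented-out `blasstrides` rewrote such strides to (1, n); per the commit message, StridedViews' `_normalizestrides` convention now yields gemm-compatible strides directly, making the helper dead code. Assuming `sreshape` from StridedViews:

using StridedViews

a = sreshape(StridedView(randn(4)), (4, 1))   # an n×1 StridedView of a vector
strides(a)   # expected (1, 4) under the _normalizestrides convention,
             # i.e. already acceptable to BLAS gemm without a blasstrides pass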
3 changes: 2 additions & 1 deletion src/macros.jl
@@ -34,7 +34,8 @@ maybestrided(A::Tuple) = maybestrided.(A)
 maybestrided(A) = A
 function maybeunstrided(A::StridedView)
     Ap = A.parent
-    if size(A) == size(Ap) && strides(A) == strides(Ap) && offset(A) == 0 && A.op == identity
+    if size(A) == size(Ap) && strides(A) == strides(Ap) && offset(A) == 0 &&
+       A.op == identity
         return Ap
     else
         return reshape(copy(A).parent, size(A))
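In words: when the view is trivial (same size and strides as the parent, zero offset, identity op), the parent array is handed back untouched; otherwise the view is materialized into a fresh Array. A hypothetical use of this internal, unexported helper:

using Strided

A = randn(3, 4)
Strided.maybeunstrided(StridedView(A)) === A   # trivial view: parent returned
B = permutedims(StridedView(A), (2, 1))        # lazy permute: nontrivial strides
Strided.maybeunstrided(B) isa Array            # true: materialized as a copy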
2 changes: 1 addition & 1 deletion src/mapreduce.jl
@@ -86,7 +86,7 @@ end
 function _mapreducedim!(@nospecialize(f), @nospecialize(op), @nospecialize(initop),
                         dims::Dims, arrays::Tuple{Vararg{StridedView}})
     if any(isequal(0), dims)
-        if !any(isequal(0), size(arrays[1]))
+        if length(arrays[1]) != 0 && !isnothing(initop)
             map!(initop, arrays[1], arrays[1])
         end
     else
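The repaired guard in words: when some reduction dimension is zero there is nothing to fold, and the destination arrays[1] should only be (re)initialized when it is nonempty and an initop was actually supplied; previously initop === nothing could slip through to map!(nothing, dest, dest) and error. A standalone restatement of just that branch (a sketch, with names as in the diff):

# returns true when the empty-reduction case was handled completely
function handle_empty_reduction!(initop, dest, dims)
    if any(isequal(0), dims)                       # empty iteration space
        if length(dest) != 0 && !isnothing(initop)
            map!(initop, dest, dest)               # e.g. initop = zero clears dest
        end
        return true
    end
    return false                                   # real reduction still needed
end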
12 changes: 12 additions & 0 deletions test/blasmultests.jl
@@ -84,3 +84,15 @@ for T1 in (Float32, Float64, Complex{Float32}, Complex{Float64})
         end
     end
 end
+
+@testset "Matrix multiplication with length 0" begin
+    A = rand(2, 0)
+    B = rand(0, 2)
+    C = rand(2, 2)
+    α = rand()
+    β = rand()
+    A1 = StridedView(copy(A))
+    B1 = StridedView(copy(B))
+    C1 = StridedView(copy(C))
+    @test mul!(C, A, B, α, β) ≈ mul!(C1, A1, B1, α, β)
+end
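What the new test pins down, spelled out with plain arrays (illustrative, not part of the test file): with a contraction dimension of length 0, A * B is exactly the 2×2 zero matrix, so the five-argument mul! must degenerate to scaling C by β:

using LinearAlgebra

A, B, C = rand(2, 0), rand(0, 2), rand(2, 2)
α, β = rand(), rand()
C0 = copy(C)
mul!(C, A, B, α, β)
C ≈ β .* C0   # true: the α * A * B term contributes nothing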
14 changes: 12 additions & 2 deletions test/othertests.jl
@@ -179,7 +179,7 @@ end
           (@strided(B2' .* B3 .- max.(abs.(B1), real.(B3b)))) ≈
           view(A2, :, 3)' .* reshape(view(A3, 1:5, :, :), 5, 10, 5, 2) .-
           max.(abs.(view(A1, 1:5)), real.(view(A3, 4:4, 4:4, 2:2:10)))
-
+
     x = @strided begin
         p = :A => A1
         f = pair -> (pair.first, pair.second)
@@ -280,10 +280,20 @@
                 @test B3 ≈ op3(α * op1(A1) * op2(A2)) # op3 is its own inverse
                 copyto!(B3, B4)
                 mul!(op3(B3), op1(B1), op2(B2))
-                @test B3 ≈ op3( op1(A1) * op2(A2)) # op3 is its own inverse
+                @test B3 ≈ op3(op1(A1) * op2(A2)) # op3 is its own inverse
             end
         end
     end
+
+    A = map(complex, rand(-100:100, (2, 0)), rand(-100:100, (2, 0)))
+    B = map(complex, rand(-100:100, (0, 2)), rand(-100:100, (0, 2)))
+    C = map(complex, rand(-100:100, (2, 2)), rand(-100:100, (2, 2)))
+    α = complex(rand(-100:100), rand(-100:100))
+    β = one(eltype(C))
+    A1 = StridedView(copy(A))
+    B1 = StridedView(copy(B))
+    C1 = StridedView(copy(C))
+    @test mul!(C, A, B, α, β) ≈ mul!(C1, A1, B1, α, β)
 end
 
 @testset "multiplication with StridedView: Rational{Int}" begin
4 changes: 2 additions & 2 deletions test/runtests.jl
@@ -15,7 +15,7 @@ include("blasmultests.jl")
 
 println("Running tests multi-threaded:")
 Strided.enable_threads()
-Strided.set_num_threads(Base.Threads.nthreads()+1)
+Strided.set_num_threads(Base.Threads.nthreads() + 1)
 include("othertests.jl")
 include("blasmultests.jl")

@@ -24,4 +24,4 @@ include("blasmultests.jl")
 Strided.disable_threaded_mul()
 
 using Aqua
-Aqua.test_all(Strided)
\ No newline at end of file
+Aqua.test_all(Strided)
