Skip to content

Commit

Permalink
math/big: rewrite addVW to use fast path on s390x
Browse files Browse the repository at this point in the history
Rewrite addVW to use a fast path and remove the original
vector and non vector implementation of addVW in assembly. This CL uses
a similar idea as CL 164968, where we copy the rest of words when we
know carry bit is zero.

In addition, since we are copying vector of words, a faster
implementation of copy is written in this CL to copy a word or multiple
words at a time.

Benchmarks:
name             old time/op    new time/op     delta
AddVW/1-18         4.56ns ± 0%     4.01ns ± 6%   -12.14%  (p=0.000 n=18+20)
AddVW/2-18         5.54ns ± 0%     4.42ns ± 5%   -20.20%  (p=0.000 n=18+20)
AddVW/3-18         6.55ns ± 0%     4.61ns ± 0%   -29.62%  (p=0.000 n=16+18)
AddVW/4-18         6.11ns ± 2%     5.12ns ± 6%   -16.19%  (p=0.000 n=20+20)
AddVW/5-18         7.32ns ± 4%     5.14ns ± 0%   -29.77%  (p=0.000 n=20+19)
AddVW/10-18        10.6ns ± 2%      7.2ns ± 1%   -31.47%  (p=0.000 n=20+20)
AddVW/100-18       49.6ns ± 2%     18.0ns ± 0%   -63.63%  (p=0.000 n=20+20)
AddVW/1000-18       465ns ± 3%      244ns ± 0%   -47.54%  (p=0.000 n=20+20)
AddVW/10000-18     4.99µs ± 4%     2.97µs ± 0%   -40.54%  (p=0.000 n=20+20)
AddVW/100000-18    48.3µs ± 3%     30.8µs ± 1%   -36.29%  (p=0.000 n=20+20)
[Geo mean]         58.1ns          38.0ns        -34.57%

name             old speed      new speed       delta
AddVW/1-18       1.76GB/s ± 0%   2.00GB/s ± 6%   +14.04%  (p=0.000 n=20+20)
AddVW/2-18       2.89GB/s ± 0%   3.63GB/s ± 5%   +25.55%  (p=0.000 n=18+20)
AddVW/3-18       3.66GB/s ± 0%   5.21GB/s ± 0%   +42.25%  (p=0.000 n=18+19)
AddVW/4-18       5.24GB/s ± 2%   6.27GB/s ± 6%   +19.61%  (p=0.000 n=20+20)
AddVW/5-18       5.47GB/s ± 4%   7.78GB/s ± 0%   +42.28%  (p=0.000 n=20+18)
AddVW/10-18      7.55GB/s ± 2%  11.04GB/s ± 1%   +46.09%  (p=0.000 n=20+20)
AddVW/100-18     16.1GB/s ± 2%   44.3GB/s ± 0%  +174.77%  (p=0.000 n=20+20)
AddVW/1000-18    17.2GB/s ± 3%   32.8GB/s ± 1%   +90.58%  (p=0.000 n=20+20)
AddVW/10000-18   16.0GB/s ± 4%   26.9GB/s ± 0%   +68.11%  (p=0.000 n=20+20)
AddVW/100000-18  16.6GB/s ± 3%   26.0GB/s ± 1%   +56.94%  (p=0.000 n=20+20)
[Geo mean]       7.03GB/s       10.75GB/s        +52.93%

Change-Id: Idbb73f3178311bd2b18a93bdc1e48f26869d2f6a
Reviewed-on: https://go-review.googlesource.com/c/go/+/209679
Reviewed-by: Michael Munday <[email protected]>
Run-TryBot: Michael Munday <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
  • Loading branch information
Ruixin(Peter) Bao authored and mundaym committed Apr 24, 2020
1 parent 82f2989 commit ee8972c
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 213 deletions.
3 changes: 0 additions & 3 deletions src/math/big/arith_decl_s390x.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
func subVV_check(z, x, y []Word) (c Word)
func subVV_vec(z, x, y []Word) (c Word)
func subVV_novec(z, x, y []Word) (c Word)
func addVW_check(z, x []Word, y Word) (c Word)
func addVW_vec(z, x []Word, y Word) (c Word)
func addVW_novec(z, x []Word, y Word) (c Word)
func subVW_check(z, x []Word, y Word) (c Word)
func subVW_vec(z, x []Word, y Word) (c Word)
func subVW_novec(z, x []Word, y Word) (c Word)
Expand Down
289 changes: 83 additions & 206 deletions src/math/big/arith_s390x.s
Original file line number Diff line number Diff line change
Expand Up @@ -541,216 +541,93 @@ E1:
RET

TEXT ·addVW(SB), NOSPLIT, $0
MOVD addwvectorfacility+0x00(SB), R1
BR (R1)

TEXT ·addVW_check(SB), NOSPLIT, $0
MOVB ·hasVX(SB), R1
CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported
MOVD $addwvectorfacility+0x00(SB), R1
MOVD $·addVW_novec(SB), R2
MOVD R2, 0(R1)

// MOVD $·addVW_novec(SB), 0(R1)
BR ·addVW_novec(SB)

vectorimpl:
MOVD $addwvectorfacility+0x00(SB), R1
MOVD $·addVW_vec(SB), R2
MOVD R2, 0(R1)

// MOVD $·addVW_vec(SB), 0(R1)
BR ·addVW_vec(SB)

GLOBL addwvectorfacility+0x00(SB), NOPTR, $8
DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)

// func addVW_vec(z, x []Word, y Word) (c Word)
TEXT ·addVW_vec(SB), NOSPLIT, $0
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2

MOVD $0, R0 // make sure it's zero
MOVD $0, R10 // i = 0
MOVD R8, R5
MOVD R2, R7

// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3 // n -= 4
BLT v10 // if n < 0 goto v10
SUB $12, R3
BLT A10

// n >= 0
// regular loop body unrolled 16x

VZERO V0 // prepare V0 to be final carry register
VZERO V9 // to ensure upper half is zero
VLVGG $1, R4, V9

UU1:
VLM 0(R5), V1, V4 // 64-bytes into V1..V4
ADD $64, R5
VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order

VACCCQ V1, V9, V0, V25
VACQ V1, V9, V0, V17
VZERO V9
VACCCQ V2, V9, V25, V26
VACQ V2, V9, V25, V18

VLM 0(R5), V5, V6 // 32-bytes into V5..V6
ADD $32, R5

VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order

VACCCQ V3, V9, V26, V27
VACQ V3, V9, V26, V19
VACCCQ V4, V9, V27, V28
VACQ V4, V9, V27, V20

VLM 0(R5), V7, V8 // 32-bytes into V7..V8
ADD $32, R5

VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order

VACCCQ V5, V9, V28, V29
VACQ V5, V9, V28, V21
VACCCQ V6, V9, V29, V30
VACQ V6, V9, V29, V22

VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order

VACCCQ V7, V9, V30, V31
VACQ V7, V9, V30, V23
VACCCQ V8, V9, V31, V0 // V0 has carry-over
VACQ V8, V9, V31, V24

VPDI $0x4, V17, V17, V17 // flip the doublewords to big-endian order
VPDI $0x4, V18, V18, V18 // flip the doublewords to big-endian order
VPDI $0x4, V19, V19, V19 // flip the doublewords to big-endian order
VPDI $0x4, V20, V20, V20 // flip the doublewords to big-endian order
VPDI $0x4, V21, V21, V21 // flip the doublewords to big-endian order
VPDI $0x4, V22, V22, V22 // flip the doublewords to big-endian order
VPDI $0x4, V23, V23, V23 // flip the doublewords to big-endian order
VPDI $0x4, V24, V24, V24 // flip the doublewords to big-endian order
VSTM V17, V24, 0(R7) // 128-bytes into z
ADD $128, R7
ADD $128, R10 // i += 16
SUB $16, R3 // n -= 16
BGE UU1 // if n >= 0 goto U1
VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10

A10:
ADD $12, R3 // n += 16

// s/JL/JMP/ below to disable the unrolled loop

BLT v10 // if n < 0 goto v10

U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R5
ADDE R0, R6
ADDE R0, R7
ADDE R0, R1
ADDE R0, R0
MOVD R0, R4 // save CF
SUB R0, R0
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)

ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4

v10:
ADD $4, R3 // n += 4
BLE E10 // if n <= 0 goto E4

L4: // n > 0
MOVD 0(R8)(R10*1), R5
ADDC R4, R5
MOVD z_len+8(FP), R5 // length of z
MOVD x+24(FP), R6
MOVD y+48(FP), R7 // c = y
MOVD z+0(FP), R8

CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return

// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
ADDC 0(R6), R7
MOVD R7, 0(R8)
CMPBEQ R5, $1, returnResult // len(z) == 1
MOVD $0, R9
ADDE 8(R6), R9
MOVD R9, 8(R8)
CMPBEQ R5, $2, returnResult // len(z) == 2

// Update the counters
MOVD $16, R12 // i = 2
MOVD $-2(R5), R5 // n = n - 2

loopOverEachWord:
BRC $12, copySetup // carry = 0, copy the rest
MOVD $1, R9

// Originally we used the carry flag generated in the previous iteration
// (i.e: ADDE could be used here to do the addition). However, since we
// already know carry is 1 (otherwise we will go to copy section), we can use
// ADDC here so the current iteration does not depend on the carry flag
// generated in the previous iteration. This could be useful when branch prediction happens.
ADDC 0(R6)(R12*1), R9
MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c

MOVD $8(R12), R12 // i++
BRCTG R5, loopOverEachWord // n--

// Return the current carry value
returnResult:
MOVD $0, R0
ADDE R0, R0
MOVD R0, R4 // save CF
SUB R0, R0
MOVD R5, 0(R2)(R10*1)

ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4

E10:
MOVD R4, c+56(FP) // return c

MOVD R0, c+56(FP)
RET

TEXT ·addVW_novec(SB), NOSPLIT, $0
// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
MOVD z_len+8(FP), R3
MOVD x+24(FP), R8
MOVD y+48(FP), R4 // c = y
MOVD z+0(FP), R2
MOVD $0, R0 // make sure it's 0
MOVD $0, R10 // i = 0

// s/JL/JMP/ below to disable the unrolled loop
SUB $4, R3 // n -= 4
BLT v4 // if n < 4 goto v4

U4: // n >= 0
// regular loop body unrolled 4x
MOVD 0(R8)(R10*1), R5
MOVD 8(R8)(R10*1), R6
MOVD 16(R8)(R10*1), R7
MOVD 24(R8)(R10*1), R1
ADDC R4, R5
ADDE R0, R6
ADDE R0, R7
ADDE R0, R1
ADDE R0, R0
MOVD R0, R4 // save CF
SUB R0, R0
MOVD R5, 0(R2)(R10*1)
MOVD R6, 8(R2)(R10*1)
MOVD R7, 16(R2)(R10*1)
MOVD R1, 24(R2)(R10*1)

ADD $32, R10 // i += 4 -> i +=32
SUB $4, R3 // n -= 4
BGE U4 // if n >= 0 goto U4

v4:
ADD $4, R3 // n += 4
BLE E4 // if n <= 0 goto E4

L4: // n > 0
MOVD 0(R8)(R10*1), R5
ADDC R4, R5
ADDE R0, R0
MOVD R0, R4 // save CF
SUB R0, R0
MOVD R5, 0(R2)(R10*1)

ADD $8, R10 // i++
SUB $1, R3 // n--
BGT L4 // if n > 0 goto L4

E4:
MOVD R4, c+56(FP) // return c
// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
// With the assumption that x and z will not overlap with each other or x and z will
// point to same memory region, we can use a faster version of copy using only MVC here.
// In the following implementation, we have three copy loops, each copying a word, 4 words, and
// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
copySetup:
ADD R12, R6
ADD R12, R8

CMPBGE R5, $4, mediumLoop

smallLoop: // does a loop unrolling to copy word when n < 4
CMPBEQ R5, $0, returnZero
MVC $8, 0(R6), 0(R8)
CMPBEQ R5, $1, returnZero
MVC $8, 8(R6), 8(R8)
CMPBEQ R5, $2, returnZero
MVC $8, 16(R6), 16(R8)

returnZero:
MOVD $0, c+56(FP) // return 0 as carry
RET

mediumLoop:
CMPBLT R5, $4, smallLoop
CMPBLT R5, $32, mediumLoopBody

largeLoop: // Copying 256 bytes at a time.
MVC $256, 0(R6), 0(R8)
MOVD $256(R6), R6
MOVD $256(R8), R8
MOVD $-32(R5), R5
CMPBGE R5, $32, largeLoop
BR mediumLoop

mediumLoopBody: // Copying 32 bytes at a time
MVC $32, 0(R6), 0(R8)
MOVD $32(R6), R6
MOVD $32(R8), R8
MOVD $-4(R5), R5
CMPBGE R5, $4, mediumLoopBody
BR smallLoop

returnC:
MOVD R7, c+56(FP)
RET

TEXT ·subVW(SB), NOSPLIT, $0
Expand Down
5 changes: 1 addition & 4 deletions src/math/big/arith_s390x_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,7 @@ func TestFunVVnovec(t *testing.T) {
func TestFunVWnovec(t *testing.T) {
if hasVX == true {
for _, a := range sumVW {
arg := a
testFunVW(t, "addVW_novec", addVW_novec, arg)

arg = argVW{a.x, a.z, a.y, a.c}
arg := argVW{a.x, a.z, a.y, a.c}
testFunVW(t, "subVW_novec", subVW_novec, arg)
}
}
Expand Down

0 comments on commit ee8972c

Please sign in to comment.