Skip to content

Commit

Permalink
runtime: improve memmove performance ppc64,ppc64le
Browse files Browse the repository at this point in the history
This change improves the performance of memmove
on ppc64 & ppc64le mainly for moves >=32 bytes.
In addition, the test to detect backward moves
 was enhanced to avoid backward moves if source
and dest were in different types of storage, since
backward moves might not always be efficient.

Fixes golang#14507

The following shows some of the improvements from the test
in the runtime package:

BenchmarkMemmove32                   4229.56      4717.13      1.12x
BenchmarkMemmove64                   6156.03      7810.42      1.27x
BenchmarkMemmove128                  7521.69      12468.54     1.66x
BenchmarkMemmove256                  6729.90      18260.33     2.71x
BenchmarkMemmove512                  8521.59      18033.81     2.12x
BenchmarkMemmove1024                 9760.92      25762.61     2.64x
BenchmarkMemmove2048                 10241.00     29584.94     2.89x
BenchmarkMemmove4096                 10399.37     31882.31     3.07x

BenchmarkMemmoveUnalignedDst16       1943.69      2258.33      1.16x
BenchmarkMemmoveUnalignedDst32       3885.08      3965.81      1.02x
BenchmarkMemmoveUnalignedDst64       5121.63      6965.54      1.36x
BenchmarkMemmoveUnalignedDst128      7212.34      11372.68     1.58x
BenchmarkMemmoveUnalignedDst256      6564.52      16913.59     2.58x
BenchmarkMemmoveUnalignedDst512      8364.35      17782.57     2.13x
BenchmarkMemmoveUnalignedDst1024     9539.87      24914.72     2.61x
BenchmarkMemmoveUnalignedDst2048     9199.23      21235.11     2.31x
BenchmarkMemmoveUnalignedDst4096     10077.39     25231.99     2.50x

BenchmarkMemmoveUnalignedSrc32       3249.83      3742.52      1.15x
BenchmarkMemmoveUnalignedSrc64       5562.35      6627.96      1.19x
BenchmarkMemmoveUnalignedSrc128      6023.98      10200.84     1.69x
BenchmarkMemmoveUnalignedSrc256      6921.83      15258.43     2.20x
BenchmarkMemmoveUnalignedSrc512      8593.13      16541.97     1.93x
BenchmarkMemmoveUnalignedSrc1024     9730.95      22927.84     2.36x
BenchmarkMemmoveUnalignedSrc2048     9793.28      21537.73     2.20x
BenchmarkMemmoveUnalignedSrc4096     10132.96     26295.06     2.60x

Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2
Reviewed-on: https://go-review.googlesource.com/21990
Reviewed-by: Bill O'Farrell <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
  • Loading branch information
laboger authored and ceseo committed May 3, 2016
1 parent 1e080b5 commit 8231284
Showing 1 changed file with 74 additions and 43 deletions.
117 changes: 74 additions & 43 deletions src/runtime/memmove_ppc64x.s
Original file line number Diff line number Diff line change
Expand Up @@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
MOVD to+0(FP), R3
MOVD from+8(FP), R4
MOVD n+16(FP), R5
CMP R5, $0
BNE check
RET

// Determine if there are doublewords to
// copy so a more efficient move can be done
check:
ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
SRAD $3, R5, R6 // R6 is the number of words to copy
CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy.

CMP R3, R4, CR2
BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward"
ANDCC $7, R5, R7 // R7: bytes to copy
SRAD $3, R5, R6 // R6: double words to copy
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy

// Copying forward proceeds by copying R6 words then copying R7 bytes.
// R3 and R4 are advanced as we copy. Becuase PPC64 lacks post-increment
// load/store, R3 and R4 point before the bytes that are to be copied.
// Determine overlap by subtracting dest - src and comparing against the
// length. The catches the cases where src and dest are in different types
// of storage such as stack and static to avoid doing backward move when not
// necessary.

BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"

MOVD R6, CTR
SUB R4, R3, R8 // dest - src
CMPU R8, R5, CR2 // < len?
BC 12, 8, backward // BLT CR2 backward

SUB $8, R3
SUB $8, R4
// Copying forward if no overlap.

forwardlargeloop:
MOVDU 8(R4), R8
MOVDU R8, 8(R3)
BC 16, 0, forwardlargeloop // "BDNZ"

ADD $8, R3
ADD $8, R4
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
MOVD R6,CTR // R6 = number of double words
SRADCC $2,R6,R8 // 32 byte chunks?
BNE forward32setup //

// Move double words

forward8:
MOVD 0(R4), R8 // double word
ADD $8,R4
MOVD R8, 0(R3) //
ADD $8,R3
BC 16, 0, forward8
BR noforwardlarge // handle remainder

// Prepare for moves of 32 bytes at a time.

forward32setup:
DCBTST (R3) // prepare data cache
DCBT (R4)
MOVD R8, CTR // double work count

forward32:
MOVD 0(R4), R8 // load 4 double words
MOVD 8(R4), R9
MOVD 16(R4), R14
MOVD 24(R4), R15
ADD $32,R4
MOVD R8, 0(R3) // store those 4
MOVD R9, 8(R3)
MOVD R14,16(R3)
MOVD R15,24(R3)
ADD $32,R3 // bump up for next set
BC 16, 0, forward32 // continue
RLDCLCC $61,R5,$3,R6 // remaining doublewords
BEQ noforwardlarge
MOVD R6,CTR // set up the CTR
BR forward8

noforwardlarge:
BNE forwardtail // Tests the bit set by ANDCC above
RET
CMP R7,$0 // any remaining bytes
BC 4, 1, LR

forwardtail:
SUB $1, R3
SUB $1, R4
MOVD R7, CTR
MOVD R7, CTR // move tail bytes

forwardtailloop:
MOVBZU 1(R4), R8
MOVBZU R8, 1(R3)
MOVBZ 0(R4), R8 // move single bytes
ADD $1,R4
MOVBZ R8, 0(R3)
ADD $1,R3
BC 16, 0, forwardtailloop
RET

backward:
// Copying backwards proceeds by copying R7 bytes then copying R6 words.
// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
// R3 and R4 are advanced to the end of the destination/source buffers
// respectively and moved back as we copy.

ADD R5, R4, R4
ADD R3, R5, R3
ADD R5, R4, R4 // end of source
ADD R3, R5, R3 // end of dest

BEQ nobackwardtail
BEQ nobackwardtail // earlier condition

MOVD R7, CTR
MOVD R7, CTR // bytes to move

backwardtailloop:
MOVBZU -1(R4), R8
MOVBZU R8, -1(R3)
MOVBZ -1(R4), R8 // point to last byte
SUB $1,R4
MOVBZ R8, -1(R3)
SUB $1,R3
BC 16, 0, backwardtailloop

nobackwardtail:
BC 4, 6, backwardlarge // "BNE CR1"
RET
CMP R6,$0
BC 4, 5, LR

backwardlarge:
MOVD R6, CTR

backwardlargeloop:
MOVDU -8(R4), R8
MOVDU R8, -8(R3)
BC 16, 0, backwardlargeloop // "BDNZ"
MOVD -8(R4), R8
SUB $8,R4
MOVD R8, -8(R3)
SUB $8,R3
BC 16, 0, backwardlargeloop //
RET

0 comments on commit 8231284

Please sign in to comment.