runtime: improve memmove performance ppc64,ppc64le

This change improves the performance of memmove on ppc64 & ppc64le mainly for moves >=32 bytes. In addition, the test to detect backward moves was enhanced to avoid backward moves if source and dest were in different types of storage, since backward moves might not always be efficient. Fixes golang#14507 The following shows some of the improvements from the test in the runtime package: BenchmarkMemmove32 4229.56 4717.13 1.12x BenchmarkMemmove64 6156.03 7810.42 1.27x BenchmarkMemmove128 7521.69 12468.54 1.66x BenchmarkMemmove256 6729.90 18260.33 2.71x BenchmarkMemmove512 8521.59 18033.81 2.12x BenchmarkMemmove1024 9760.92 25762.61 2.64x BenchmarkMemmove2048 10241.00 29584.94 2.89x BenchmarkMemmove4096 10399.37 31882.31 3.07x BenchmarkMemmoveUnalignedDst16 1943.69 2258.33 1.16x BenchmarkMemmoveUnalignedDst32 3885.08 3965.81 1.02x BenchmarkMemmoveUnalignedDst64 5121.63 6965.54 1.36x BenchmarkMemmoveUnalignedDst128 7212.34 11372.68 1.58x BenchmarkMemmoveUnalignedDst256 6564.52 16913.59 2.58x BenchmarkMemmoveUnalignedDst512 8364.35 17782.57 2.13x BenchmarkMemmoveUnalignedDst1024 9539.87 24914.72 2.61x BenchmarkMemmoveUnalignedDst2048 9199.23 21235.11 2.31x BenchmarkMemmoveUnalignedDst4096 10077.39 25231.99 2.50x BenchmarkMemmoveUnalignedSrc32 3249.83 3742.52 1.15x BenchmarkMemmoveUnalignedSrc64 5562.35 6627.96 1.19x BenchmarkMemmoveUnalignedSrc128 6023.98 10200.84 1.69x BenchmarkMemmoveUnalignedSrc256 6921.83 15258.43 2.20x BenchmarkMemmoveUnalignedSrc512 8593.13 16541.97 1.93x BenchmarkMemmoveUnalignedSrc1024 9730.95 22927.84 2.36x BenchmarkMemmoveUnalignedSrc2048 9793.28 21537.73 2.20x BenchmarkMemmoveUnalignedSrc4096 10132.96 26295.06 2.60x Change-Id: I73af59970d4c97c728deabb9708b31ec7e01bdf2 Reviewed-on: https://go-review.googlesource.com/21990 Reviewed-by: Bill O'Farrell <[email protected]> Reviewed-by: Brad Fitzpatrick <[email protected]>
powertechpreview · May 3, 2016 · 8231284 · 8231284
1 parent 1e080b5
commit 8231284
Showing 1 changed file with 74 additions and 43 deletions.
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
@@ -11,78 +11,109 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD	to+0(FP), R3
 	MOVD	from+8(FP), R4
 	MOVD	n+16(FP), R5
-	CMP	R5, $0
-	BNE	check
-	RET
 
+	// Determine if there are doublewords to
+	// copy so a more efficient move can be done
 check:
-	ANDCC	$7, R5, R7	// R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
-	SRAD	$3, R5, R6	// R6 is the number of words to copy
-	CMP	R6, $0, CR1	// CR1[EQ] is set if there are no words to copy.
-
-	CMP	R3, R4, CR2
-	BC	12, 9, backward	// I think you should be able to write this as "BGT CR2, backward"
+	ANDCC	$7, R5, R7	// R7: bytes to copy
+	SRAD	$3, R5, R6	// R6: double words to copy
+	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
 
-	// Copying forward proceeds by copying R6 words then copying R7 bytes.
-	// R3 and R4 are advanced as we copy. Becuase PPC64 lacks post-increment
-	// load/store, R3 and R4 point before the bytes that are to be copied.
+	// Determine overlap by subtracting dest - src and comparing against the
+	// length.  The catches the cases where src and dest are in different types
+	// of storage such as stack and static to avoid doing backward move when not
+	// necessary.
 
-	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
-
-	MOVD	R6, CTR
+	SUB	R4, R3, R8	// dest - src
+	CMPU	R8, R5, CR2	// < len?
+	BC	12, 8, backward // BLT CR2 backward
 
-	SUB	$8, R3
-	SUB	$8, R4
+	// Copying forward if no overlap.
 
-forwardlargeloop:
-	MOVDU	8(R4), R8
-	MOVDU	R8, 8(R3)
-	BC	16, 0, forwardlargeloop // "BDNZ"
-
-	ADD	$8, R3
-	ADD	$8, R4
+	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
+	MOVD	R6,CTR			// R6 = number of double words
+	SRADCC	$2,R6,R8		// 32 byte chunks?
+	BNE	forward32setup		//
+
+	// Move double words
+
+forward8:
+	MOVD    0(R4), R8		// double word
+	ADD     $8,R4
+	MOVD    R8, 0(R3)		//
+	ADD     $8,R3
+	BC      16, 0, forward8
+	BR	noforwardlarge		// handle remainder
+
+	// Prepare for moves of 32 bytes at a time.
+
+forward32setup:
+	DCBTST	(R3)			// prepare data cache
+	DCBT	(R4)
+	MOVD	R8, CTR			// double work count
+
+forward32:
+	MOVD	0(R4), R8		// load 4 double words
+	MOVD	8(R4), R9
+	MOVD	16(R4), R14
+	MOVD	24(R4), R15
+	ADD	$32,R4
+	MOVD	R8, 0(R3)		// store those 4
+	MOVD	R9, 8(R3)
+	MOVD	R14,16(R3)
+	MOVD	R15,24(R3)
+	ADD	$32,R3			// bump up for next set
+	BC	16, 0, forward32	// continue
+	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
+	BEQ	noforwardlarge
+	MOVD	R6,CTR			// set up the CTR
+	BR	forward8
 
 noforwardlarge:
-	BNE	forwardtail	// Tests the bit set by ANDCC above
-	RET
+	CMP	R7,$0			// any remaining bytes
+	BC	4, 1, LR
 
 forwardtail:
-	SUB	$1, R3
-	SUB	$1, R4
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// move tail bytes
 
 forwardtailloop:
-	MOVBZU	1(R4), R8
-	MOVBZU	R8, 1(R3)
+	MOVBZ	0(R4), R8		// move single bytes
+	ADD	$1,R4
+	MOVBZ	R8, 0(R3)
+	ADD	$1,R3
 	BC	16, 0, forwardtailloop
 	RET
 
 backward:
-	// Copying backwards proceeds by copying R7 bytes then copying R6 words.
+	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
-	ADD	R5, R4, R4
-	ADD	R3, R5, R3
+	ADD	R5, R4, R4		// end of source
+	ADD	R3, R5, R3		// end of dest
 
-	BEQ	nobackwardtail
+	BEQ	nobackwardtail		// earlier condition
 
-	MOVD	R7, CTR
+	MOVD	R7, CTR			// bytes to move
 
 backwardtailloop:
-	MOVBZU	-1(R4), R8
-	MOVBZU	R8, -1(R3)
+	MOVBZ 	-1(R4), R8		// point to last byte
+	SUB	$1,R4
+	MOVBZ 	R8, -1(R3)
+	SUB	$1,R3
 	BC	16, 0, backwardtailloop
 
 nobackwardtail:
-	BC	4, 6, backwardlarge		// "BNE CR1"
-	RET
+	CMP	R6,$0
+	BC	4, 5, LR
 
 backwardlarge:
 	MOVD	R6, CTR
 
 backwardlargeloop:
-	MOVDU	-8(R4), R8
-	MOVDU	R8, -8(R3)
-	BC	16, 0, backwardlargeloop	// "BDNZ"
+	MOVD 	-8(R4), R8
+	SUB	$8,R4
+	MOVD 	R8, -8(R3)
+	SUB	$8,R3
+	BC	16, 0, backwardlargeloop	//
 	RET