Merge pull request #56 from AWSjswinney/arm64-port-pr
bug fix to encode_arm64.s: some registers overwritten in memmove call

ARM64 memmove clobbers R16 and R17 as of
https://go-review.googlesource.com/c/go/+/243357
nigeltao authored Nov 3, 2020
2 parents 196ae77 + f81760e commit 674baa8
Showing 2 changed files with 55 additions and 71 deletions.
decode_arm64.s (45 changes: 18 additions & 27 deletions)
@@ -70,7 +70,7 @@ loop:
// x := uint32(src[s] >> 2)
// switch
MOVW $60, R1
ADD R4>>2, ZR, R4
LSRW $2, R4, R4
CMPW R4, R1
BLS tagLit60Plus

@@ -111,13 +111,12 @@ doLit:
// is contiguous in memory and so it needs to leave enough source bytes to
// read the next tag without refilling buffers, but Go's Decode assumes
// contiguousness (the src argument is a []byte).
MOVD $16, R1
CMP R1, R4
BGT callMemmove
CMP R1, R2
BLT callMemmove
CMP R1, R3
BLT callMemmove
CMP $16, R4
BGT callMemmove
CMP $16, R2
BLT callMemmove
CMP $16, R3
BLT callMemmove

// !!! Implement the copy from src to dst as a 16-byte load and store.
// (Decode's documentation says that dst and src must not overlap.)
@@ -130,9 +129,8 @@ doLit:
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.

VLD1 0(R6), [V0.B16]
VST1 [V0.B16], 0(R7)
LDP 0(R6), (R14, R15)
STP (R14, R15), 0(R7)

// d += length
// s += length
@@ -210,8 +208,7 @@ tagLit61:
B doLit

tagLit62Plus:
MOVW $62, R1
CMPW R1, R4
CMPW $62, R4
BHI tagLit63

// case x == 62:
@@ -273,10 +270,9 @@ tagCopy:
// We have a copy tag. We assume that:
// - R3 == src[s] & 0x03
// - R4 == src[s]
MOVD $2, R1
CMP R1, R3
BEQ tagCopy2
BGT tagCopy4
CMP $2, R3
BEQ tagCopy2
BGT tagCopy4

// case tagCopy1:
// s += 2
@@ -346,13 +342,11 @@ doCopy:
// }
// copy 16 bytes
// d += length
MOVD $16, R1
MOVD $8, R0
CMP R1, R4
CMP $16, R4
BGT slowForwardCopy
CMP R0, R5
CMP $8, R5
BLT slowForwardCopy
CMP R1, R14
CMP $16, R14
BLT slowForwardCopy
MOVD 0(R15), R2
MOVD R2, 0(R7)
@@ -426,8 +420,7 @@ makeOffsetAtLeast8:
// // The two previous lines together mean that d-offset, and therefore
// // R15, is unchanged.
// }
MOVD $8, R1
CMP R1, R5
CMP $8, R5
BGE fixUpSlowForwardCopy
MOVD (R15), R3
MOVD R3, (R7)
@@ -477,9 +470,7 @@ verySlowForwardCopy:
ADD $1, R15, R15
ADD $1, R7, R7
SUB $1, R4, R4
MOVD $0, R1
CMP R1, R4
BNE verySlowForwardCopy
CBNZ R4, verySlowForwardCopy
B loop

// The code above handles copy tags.
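The doLit fast path above relies on arm64 allowing cheap unaligned 16-byte loads and stores; the commit only swaps the vector VLD1/VST1 pair for an integer LDP/STP pair, keeping the same trick. A minimal standalone sketch of that trick, with a hypothetical function name and register choices that simply mirror the diff; the caller is assumed to guarantee at least 16 readable bytes at src and 16 writable bytes at dst:

// func copy16(dst, src *byte)
TEXT ·copy16(SB), NOSPLIT, $0-16
	MOVD dst+0(FP), R7
	MOVD src+8(FP), R6
	// One load-pair/store-pair moves 16 bytes; neither pointer needs to be
	// 16-byte aligned on arm64.
	LDP  0(R6), (R14, R15)
	STP  (R14, R15), 0(R7)
	RET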
encode_arm64.s (81 changes: 37 additions & 44 deletions)
@@ -35,11 +35,9 @@ TEXT ·emitLiteral(SB), NOSPLIT, $32-56
MOVW R3, R4
SUBW $1, R4, R4

MOVW $60, R2
CMPW R2, R4
CMPW $60, R4
BLT oneByte
MOVW $256, R2
CMPW R2, R4
CMPW $256, R4
BLT twoBytes

threeBytes:
@@ -98,8 +96,7 @@ TEXT ·emitCopy(SB), NOSPLIT, $0-48

loop0:
// for length >= 68 { etc }
MOVW $68, R2
CMPW R2, R3
CMPW $68, R3
BLT step1

// Emit a length 64 copy, encoded as 3 bytes.
@@ -112,9 +109,8 @@

step1:
// if length > 64 { etc }
MOVD $64, R2
CMP R2, R3
BLE step2
CMP $64, R3
BLE step2

// Emit a length 60 copy, encoded as 3 bytes.
MOVD $0xee, R2
@@ -125,11 +121,9 @@ step1:

step2:
// if length >= 12 || offset >= 2048 { goto step3 }
MOVD $12, R2
CMP R2, R3
CMP $12, R3
BGE step3
MOVW $2048, R2
CMPW R2, R11
CMPW $2048, R11
BGE step3

// Emit the remaining copy, encoded as 2 bytes.
@@ -295,27 +289,24 @@ varTable:
// var table [maxTableSize]uint16
//
// In the asm code, unlike the Go code, we can zero-initialize only the
// first tableSize elements. Each uint16 element is 2 bytes and each VST1
// writes 64 bytes, so we can do only tableSize/32 writes instead of the
// 2048 writes that would zero-initialize all of table's 32768 bytes.
// This clear could overrun the first tableSize elements, but it won't
// overrun the allocated stack size.
// first tableSize elements. Each uint16 element is 2 bytes and each
// iteration writes 64 bytes, so we can do only tableSize/32 writes
// instead of the 2048 writes that would zero-initialize all of table's
// 32768 bytes. This clear could overrun the first tableSize elements, but
// it won't overrun the allocated stack size.
ADD $128, RSP, R17
MOVD R17, R4

// !!! R6 = &src[tableSize]
ADD R6<<1, R17, R6

// zero the SIMD registers
VEOR V0.B16, V0.B16, V0.B16
VEOR V1.B16, V1.B16, V1.B16
VEOR V2.B16, V2.B16, V2.B16
VEOR V3.B16, V3.B16, V3.B16

memclr:
VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R4)
CMP R4, R6
BHI memclr
STP.P (ZR, ZR), 64(R4)
STP (ZR, ZR), -48(R4)
STP (ZR, ZR), -32(R4)
STP (ZR, ZR), -16(R4)
CMP R4, R6
BHI memclr

// !!! R6 = &src[0]
MOVD R7, R6
@@ -404,8 +395,7 @@ fourByteMatch:
// on inputMargin in encode.go.
MOVD R7, R3
SUB R10, R3, R3
MOVD $16, R2
CMP R2, R3
CMP $16, R3
BLE emitLiteralFastPath

// ----------------------------------------
@@ -454,18 +444,21 @@ inlineEmitLiteralMemmove:
MOVD R3, 24(RSP)

// Finish the "d +=" part of "d += emitLiteral(etc)".
ADD R3, R8, R8
MOVD R7, 80(RSP)
MOVD R8, 88(RSP)
MOVD R15, 120(RSP)
CALL runtime·memmove(SB)
MOVD 64(RSP), R5
MOVD 72(RSP), R6
MOVD 80(RSP), R7
MOVD 88(RSP), R8
MOVD 96(RSP), R9
MOVD 120(RSP), R15
B inner1
ADD R3, R8, R8
MOVD R7, 80(RSP)
MOVD R8, 88(RSP)
MOVD R15, 120(RSP)
CALL runtime·memmove(SB)
MOVD 64(RSP), R5
MOVD 72(RSP), R6
MOVD 80(RSP), R7
MOVD 88(RSP), R8
MOVD 96(RSP), R9
MOVD 120(RSP), R15
ADD $128, RSP, R17
MOVW $0xa7bd, R16
MOVKW $(0x1e35<<16), R16
B inner1

inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
@@ -489,9 +482,9 @@ emitLiteralFastPath:
// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
VLD1 0(R10), [V0.B16]
VST1 [V0.B16], 0(R8)
ADD R3, R8, R8
LDP 0(R10), (R0, R1)
STP (R0, R1), 0(R8)
ADD R3, R8, R8

inner1:
// for { etc }
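The inlineEmitLiteralMemmove hunk above is the actual bug fix named in the commit message: the surrounding encode routine keeps the hash multiplier 0x1e35a7bd in R16 and the table pointer in R17, and as of CL 243357 the arm64 runtime·memmove may clobber both (they are the IP0/IP1 scratch registers), so the commit re-materializes them after the CALL instead of assuming they survive. A minimal sketch of that pattern under the same stack-based calling convention follows; the function name is hypothetical, and the real code additionally rebuilds R17 from its own frame, as shown above:

TEXT ·memmoveThenRehash(SB), NOSPLIT, $32-24
	MOVD dst+0(FP), R0
	MOVD src+8(FP), R1
	MOVD n+16(FP), R2

	// Stack-based call to runtime·memmove(dst, src, n); outgoing arguments
	// start at 8(RSP) on arm64.
	MOVD R0, 8(RSP)
	MOVD R1, 16(RSP)
	MOVD R2, 24(RSP)
	CALL runtime·memmove(SB)

	// R16 and R17 may have been clobbered by the call, so rebuild anything
	// kept in them rather than assuming the old values survived.
	MOVW  $0xa7bd, R16
	MOVKW $(0x1e35<<16), R16 // R16 = 0x1e35a7bd, the hash multiplier
	RET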
