Assembler implementation for arm64

Benchmark results on Raspberry Pi 4B, Linux, Go 1.17.1: name old speed new speed delta Sum64/4B-4 180MB/s ± 0% 251MB/s ± 0% +39.13% (p=0.000 n=10+10) Sum64/100B-4 994MB/s ± 0% 1135MB/s ± 0% +14.25% (p=0.000 n=10+9) Sum64/4KB-4 1.92GB/s ± 0% 1.93GB/s ± 0% +0.43% (p=0.000 n=10+10) Sum64/10MB-4 1.88GB/s ± 0% 1.88GB/s ± 0% ~ (p=0.754 n=10+10) Sum64String/4B-4 133MB/s ± 4% 228MB/s ± 0% +71.37% (p=0.000 n=10+9) Sum64String/100B-4 949MB/s ± 0% 1103MB/s ± 0% +16.17% (p=0.000 n=10+10) Sum64String/4KB-4 1.92GB/s ± 0% 1.93GB/s ± 0% +0.40% (p=0.000 n=9+8) Sum64String/10MB-4 1.88GB/s ± 0% 1.88GB/s ± 0% ~ (p=0.146 n=10+8) DigestBytes/4B-4 61.9MB/s ± 0% 61.9MB/s ± 0% ~ (p=0.158 n=10+9) DigestBytes/100B-4 695MB/s ± 0% 719MB/s ± 0% +3.37% (p=0.000 n=10+10) DigestBytes/4KB-4 1.89GB/s ± 0% 1.90GB/s ± 0% +0.43% (p=0.000 n=9+10) DigestBytes/10MB-4 1.88GB/s ± 0% 1.89GB/s ± 0% +0.92% (p=0.000 n=10+9) DigestString/4B-4 58.9MB/s ± 0% 58.5MB/s ± 1% -0.60% (p=0.032 n=8+10) DigestString/100B-4 669MB/s ± 0% 696MB/s ± 1% +4.05% (p=0.000 n=10+10) DigestString/4KB-4 1.89GB/s ± 0% 1.89GB/s ± 0% +0.34% (p=0.000 n=10+10) DigestString/10MB-4 1.88GB/s ± 0% 1.89GB/s ± 0% +0.90% (p=0.000 n=10+10)
cespare · Oct 30, 2021 · e3a7fe6 · e3a7fe6
1 parent 532df6a
commit e3a7fe6
Show file tree

Hide file tree

Showing 3 changed files with 196 additions and 4 deletions.
diff --git a/xxhash_arm64.s b/xxhash_arm64.s
@@ -0,0 +1,189 @@
+// +build gc,!purego
+
+#include "textflag.h"
+
+// Register allocation.
+#define digest	R1
+#define h	R2	// Return value.
+#define p	R3	// Input pointer.
+#define len	R4
+#define nblocks	R5	// len / 32.
+#define prime1	R7
+#define prime2	R8
+#define prime3	R9
+#define prime4	R10
+#define prime5	R11
+#define v1	R12
+#define v2	R13
+#define v3	R14
+#define v4	R15
+#define x1	R20
+#define x2	R21
+#define x3	R22
+#define x4	R23
+
+#define round(acc, x) 			\
+	MADD prime2, acc, x, acc	\
+	ROR  $64-31, acc		\
+	MUL  prime1, acc		\
+
+// x = round(0, x).
+#define round0(x)	\
+	MUL prime2, x	\
+	ROR $64-31, x	\
+	MUL prime1, x	\
+
+#define mergeRound(x)			\
+	round0(x)			\
+	EOR  x, h			\
+	MADD h, prime4, prime1, h	\
+
+// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
+#define blocksLoop()		\
+	LSR  $5, len, nblocks	\
+	PCALIGN $16		\
+loop:				\
+	LDP.P 32(p), (x1, x2)	\
+	round(v1, x1)		\
+	LDP  -16(p), (x3, x4)	\
+	round(v2, x2)		\
+	SUB  $1, nblocks	\
+	round(v3, x3)		\
+	round(v4, x4)		\
+	CBNZ nblocks, loop	\
+
+
+// The primes are repeated here to ensure that they're stored
+// in a contiguous array, so we can load them with LDP.
+DATA  primes<> +0(SB)/8, $11400714785074694791
+DATA  primes<> +8(SB)/8, $14029467366897019727
+DATA  primes<>+16(SB)/8, $1609587929392839161
+DATA  primes<>+24(SB)/8, $9650029242287828579
+DATA  primes<>+32(SB)/8, $2870177450012600261
+GLOBL primes<>(SB), NOPTR+RODATA, $40
+
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
+	LDP  b_base+0(FP), (p, len)
+
+	LDP  primes<> +0(SB), (prime1, prime2)
+	LDP  primes<>+16(SB), (prime3, prime4)
+	MOVD primes<>+32(SB), prime5
+
+	CMP  $32, len
+	CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
+	BLO  afterLoop
+
+	ADD  prime1, prime2, v1
+	MOVD prime2, v2
+	MOVD $0, v3
+	NEG  prime1, v4
+
+	blocksLoop()
+
+	ROR $64-1, v1, x1
+	ROR $64-7, v2, x2
+	ADD x1, x2
+	ROR $64-12, v3, x3
+	ROR $64-18, v4, x4
+	ADD x3, x4
+	ADD x2, x4, h
+
+	mergeRound(v1)
+	mergeRound(v2)
+	mergeRound(v3)
+	mergeRound(v4)
+
+afterLoop:
+	ADD len, h
+
+	TBZ   $4, len, try8
+	LDP.P 16(p), (x1, x2)
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h, h
+	MADD h, prime4, prime1, h
+
+	round0(x2)
+	ROR  $64-27, h
+	EOR  x2 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try8:
+	TBZ    $3, len, try4
+	MOVD.P 8(p), x1
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try4:
+	TBZ     $2, len, try2
+	MOVWU.P 4(p), x2
+
+	MUL  prime1, x2
+	ROR  $64-23, h
+	EOR  x2 @> 64-23, h
+	MADD h, prime3, prime2, h
+
+try2:
+	TBZ     $1, len, try1
+	MOVHU.P 2(p), x3
+	AND     $255, x3, x1
+	LSR     $8, x3, x2
+
+	MUL prime5, x1
+	ROR $64-11, h
+	EOR x1 @> 64-11, h
+	MUL prime1, h
+
+	MUL prime5, x2
+	ROR $64-11, h
+	EOR x2 @> 64-11, h
+	MUL prime1, h
+
+try1:
+	TBZ   $0, len, end
+	MOVBU (p), x4
+
+	MUL prime5, x4
+	ROR $64-11, h
+	EOR x4 @> 64-11, h
+	MUL prime1, h
+
+end:
+	EOR h >> 33, h
+	MUL prime2,  h
+	EOR h >> 29, h
+	MUL prime3,  h
+	EOR h >> 32, h
+
+	MOVD h, ret+24(FP)
+	RET
+
+
+// func writeBlocks(d *Digest, b []byte) int
+//
+// Assumes len(b) >= 32.
+TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
+	LDP  primes<>(SB), (prime1, prime2)
+
+	// Load state. Assume v[1-4] are stored contiguously.
+	MOVD d+0(FP), digest
+	LDP   0(digest), (v1, v2)
+	LDP  16(digest), (v3, v4)
+
+	LDP b_base+8(FP), (p, len)
+
+	blocksLoop()
+
+	// Store updated state.
+	STP (v1, v2),  0(digest)
+	STP (v3, v4), 16(digest)
+
+	BIC  $31, len
+	MOVD len, ret+32(FP)
+	RET
diff --git a/xxhash_amd64.go → xxhash_asm.go b/xxhash_amd64.go → xxhash_asm.go
@@ -1,5 +1,8 @@
-//go:build !appengine && gc && !purego
-// +build !appengine,gc,!purego
+//go:build (amd64 || arm64) && !appengine && gc && !purego
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !purego
 
 package xxhash
 

diff --git a/xxhash_other.go b/xxhash_other.go
@@ -1,5 +1,5 @@
-//go:build !amd64 || appengine || !gc || purego
-// +build !amd64 appengine !gc purego
+//go:build (!amd64 && !arm64) || appengine || !gc || purego
+// +build !amd64,!arm64 appengine !gc purego
 
 package xxhash