From 777d1787b879d869a3334d4f948c9f1fba17637a Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Sun, 13 Nov 2022 13:58:39 +0100
Subject: [PATCH] Assembler implementation for arm64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benchmark results on Raspberry Pi 4B, Linux, Go 1.17.1:

name                 old speed      new speed      delta
Sum64/4B-4            180MB/s ± 0%   251MB/s ± 0%  +39.13%  (p=0.000 n=10+10)
Sum64/100B-4          994MB/s ± 0%  1135MB/s ± 0%  +14.25%  (p=0.000 n=10+9)
Sum64/4KB-4          1.92GB/s ± 0%  1.93GB/s ± 0%   +0.43%  (p=0.000 n=10+10)
Sum64/10MB-4         1.88GB/s ± 0%  1.88GB/s ± 0%     ~     (p=0.754 n=10+10)
Sum64String/4B-4      133MB/s ± 4%   228MB/s ± 0%  +71.37%  (p=0.000 n=10+9)
Sum64String/100B-4    949MB/s ± 0%  1103MB/s ± 0%  +16.17%  (p=0.000 n=10+10)
Sum64String/4KB-4    1.92GB/s ± 0%  1.93GB/s ± 0%   +0.40%  (p=0.000 n=9+8)
Sum64String/10MB-4   1.88GB/s ± 0%  1.88GB/s ± 0%     ~     (p=0.146 n=10+8)
DigestBytes/4B-4     61.9MB/s ± 0%  61.9MB/s ± 0%     ~     (p=0.158 n=10+9)
DigestBytes/100B-4    695MB/s ± 0%   719MB/s ± 0%   +3.37%  (p=0.000 n=10+10)
DigestBytes/4KB-4    1.89GB/s ± 0%  1.90GB/s ± 0%   +0.43%  (p=0.000 n=9+10)
DigestBytes/10MB-4   1.88GB/s ± 0%  1.89GB/s ± 0%   +0.92%  (p=0.000 n=10+9)
DigestString/4B-4    58.9MB/s ± 0%  58.5MB/s ± 1%   -0.60%  (p=0.032 n=8+10)
DigestString/100B-4   669MB/s ± 0%   696MB/s ± 1%   +4.05%  (p=0.000 n=10+10)
DigestString/4KB-4   1.89GB/s ± 0%  1.89GB/s ± 0%   +0.34%  (p=0.000 n=10+10)
DigestString/10MB-4  1.88GB/s ± 0%  1.89GB/s ± 0%   +0.90%  (p=0.000 n=10+10)
---
 xxhash_arm64.s                   | 189 +++++++++++++++++++++++++++++++
 xxhash_amd64.go => xxhash_asm.go |   7 +-
 xxhash_other.go                  |   4 +-
 3 files changed, 196 insertions(+), 4 deletions(-)
 create mode 100644 xxhash_arm64.s
 rename xxhash_amd64.go => xxhash_asm.go (55%)

diff --git a/xxhash_arm64.s b/xxhash_arm64.s
new file mode 100644
index 0000000..7ea336b
--- /dev/null
+++ b/xxhash_arm64.s
@@ -0,0 +1,189 @@
+// +build !appengine,gc,!purego
+
+#include "textflag.h"
+
+// Register allocation. The aliases below are shared by Sum64 and writeBlocks.
+#define digest R1 // Pointer to the Digest; writeBlocks loads it from d+0(FP).
+#define h R2 // Return value.
+#define p R3 // Input pointer.
+#define len R4 // Number of input bytes.
+#define nblocks R5 // len / 32.
+#define prime1 R7 // prime1-prime5 hold the constants loaded from primes<>.
+#define prime2 R8
+#define prime3 R9
+#define prime4 R10
+#define prime5 R11
+#define v1 R12 // v1-v4 are the four accumulators updated by blocksLoop.
+#define v2 R13
+#define v3 R14
+#define v4 R15
+#define x1 R20 // x1-x4 hold words loaded from the input and serve as scratch.
+#define x2 R21
+#define x3 R22
+#define x4 R23
+// round(acc, x): acc = rotl(acc + x*prime2, 31) * prime1. The rotate-left is written as ROR by 64-31.
+#define round(acc, x) \
+	MADD prime2, acc, x, acc \
+	ROR $64-31, acc \
+	MUL prime1, acc \
+
+// x = round(0, x). With a zero accumulator the MADD in round reduces to a plain MUL. mergeRound(x) below folds round0(x) into h: h = (h ^ x)*prime1 + prime4.
+#define round0(x) \
+	MUL prime2, x \
+	ROR $64-31, x \
+	MUL prime1, x \
+
+#define mergeRound(x) \
+	round0(x) \
+	EOR x, h \
+	MADD h, prime4, prime1, h \
+
+// Update v[1-4] with len/32 blocks of 32 bytes, leaving p just past them (the second LDP reads at -16 because LDP.P has already added 32 to p). Assumes len >= 32.
+#define blocksLoop() \
+	LSR $5, len, nblocks \
+	PCALIGN $16 \
+loop: \
+	LDP.P 32(p), (x1, x2) \
+	round(v1, x1) \
+	LDP -16(p), (x3, x4) \
+	round(v2, x2) \
+	round(v3, x3) \
+	round(v4, x4) \
+	SUB $1, nblocks \
+	CBNZ nblocks, loop \
+
+
+// The primes are repeated here to ensure that they're stored
+// in a contiguous array, so we can load them in pairs with LDP.
+DATA primes<> +0(SB)/8, $11400714785074694791 // prime1
+DATA primes<> +8(SB)/8, $14029467366897019727 // prime2
+DATA primes<>+16(SB)/8, $1609587929392839161 // prime3
+DATA primes<>+24(SB)/8, $9650029242287828579 // prime4
+DATA primes<>+32(SB)/8, $2870177450012600261 // prime5
+GLOBL primes<>(SB), NOPTR+RODATA, $40
+
+// Sum64 computes the 64-bit hash of b: whole 32-byte blocks first (if any), then the 16-, 8-, 4-, 2- and 1-byte tails selected by the low bits of len.
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
+	LDP b_base+0(FP), (p, len) // p = base pointer of b, len = len(b)
+
+	LDP primes<> +0(SB), (prime1, prime2)
+	LDP primes<>+16(SB), (prime3, prime4)
+	MOVD primes<>+32(SB), prime5
+
+	CMP $32, len
+	CSEL LT, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
+	BLT afterLoop // Inputs shorter than 32 bytes skip the block loop.
+
+	ADD prime1, prime2, v1 // v1 = prime1 + prime2
+	MOVD prime2, v2 // v2 = prime2
+	MOVD $0, v3 // v3 = 0
+	NEG prime1, v4 // v4 = -prime1
+
+	blocksLoop()
+
+	ROR $64-1, v1, x1 // This group: h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18)
+	ROR $64-7, v2, x2
+	ADD x1, x2
+	ROR $64-12, v3, x3
+	ROR $64-18, v4, x4
+	ADD x3, x4
+	ADD x2, x4, h
+
+	mergeRound(v1) // Fold each accumulator into h; round0 inside mergeRound overwrites v1-v4.
+	mergeRound(v2)
+	mergeRound(v3)
+	mergeRound(v4)
+
+afterLoop:
+	ADD len, h // h += len
+
+	TBZ $4, len, try8 // Bit 4 of len clear: no 16-byte chunk to consume.
+	LDP.P 16(p), (x1, x2)
+
+	round0(x1)
+	ROR $64-27, h // ROR plus EOR with an equally rotated operand: h = rotl(h ^ x1, 27)
+	EOR x1 @> 64-27, h, h
+	MADD h, prime4, prime1, h // h = h*prime1 + prime4
+
+	round0(x2)
+	ROR $64-27, h // Same step for the second word: h = rotl(h ^ x2, 27)*prime1 + prime4
+	EOR x2 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try8:
+	TBZ $3, len, try4 // Bit 3 of len clear: no 8-byte chunk to consume.
+	MOVD.P 8(p), x1
+
+	round0(x1)
+	ROR $64-27, h // h = rotl(h ^ x1, 27)*prime1 + prime4
+	EOR x1 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try4:
+	TBZ $2, len, try2 // Bit 2 of len clear: no 4-byte chunk to consume.
+	MOVWU.P 4(p), x2 // Zero-extending 32-bit load.
+
+	MUL prime1, x2
+	ROR $64-23, h // h = rotl(h ^ x2, 23)*prime2 + prime3
+	EOR x2 @> 64-23, h
+	MADD h, prime3, prime2, h
+
+try2:
+	TBZ $1, len, try1 // Bit 1 of len clear: no 2-byte chunk to consume.
+	MOVHU.P 2(p), x3
+	AND $255, x3, x1 // x1 = low byte of the halfword
+	LSR $8, x3, x2 // x2 = high byte of the halfword
+
+	MUL prime5, x1
+	ROR $64-11, h // h = rotl(h ^ x1, 11) * prime1
+	EOR x1 @> 64-11, h
+	MUL prime1, h
+
+	MUL prime5, x2
+	ROR $64-11, h // h = rotl(h ^ x2, 11) * prime1
+	EOR x2 @> 64-11, h
+	MUL prime1, h
+
+try1:
+	TBZ $0, len, end // Bit 0 of len clear: no final byte to consume.
+	MOVBU (p), x4
+
+	MUL prime5, x4
+	ROR $64-11, h // h = rotl(h ^ x4, 11) * prime1
+	EOR x4 @> 64-11, h
+	MUL prime1, h
+
+end:
+	EOR h >> 33, h // Final mixing: h ^= h >> 33
+	MUL prime2, h // h *= prime2
+	EOR h >> 29, h // h ^= h >> 29
+	MUL prime3, h // h *= prime3
+	EOR h >> 32, h // h ^= h >> 32
+
+	MOVD h, ret+24(FP)
+	RET
+
+
+// func writeBlocks(d *Digest, b []byte) int
+//
+// Runs blocksLoop over b using the state stored in d and returns len(b) rounded down to a multiple of 32. Assumes len(b) >= 32.
+TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
+	LDP primes<>(SB), (prime1, prime2) // The round() steps in blocksLoop only use prime1 and prime2.
+
+	// Load state. Assume v[1-4] are stored contiguously at offsets 0, 8, 16 and 24 of the Digest.
+	MOVD d+0(FP), digest
+	LDP 0(digest), (v1, v2)
+	LDP 16(digest), (v3, v4)
+
+	LDP b_base+8(FP), (p, len) // p = base pointer of b, len = len(b)
+
+	blocksLoop()
+
+	// Store updated state back to the same four words.
+	STP (v1, v2), 0(digest)
+	STP (v3, v4), 16(digest)
+
+	BIC $31, len // len &^= 31: the byte count covered by the whole 32-byte blocks just processed.
+	MOVD len, ret+32(FP)
+	RET
diff --git a/xxhash_amd64.go b/xxhash_asm.go
similarity index 55%
rename from xxhash_amd64.go
rename to xxhash_asm.go
index 0ae847f..9216e0a 100644
--- a/xxhash_amd64.go
+++ b/xxhash_asm.go
@@ -1,5 +1,8 @@
-//go:build !appengine && gc && !purego
-// +build !appengine,gc,!purego
+//go:build (amd64 || arm64) && !appengine && gc && !purego
+// +build amd64 arm64
+// +build !appengine
+// +build gc
+// +build !purego
 
 package xxhash
 
diff --git a/xxhash_other.go b/xxhash_other.go
index 1f52f29..2deb1ca 100644
--- a/xxhash_other.go
+++ b/xxhash_other.go
@@ -1,5 +1,5 @@
-//go:build !amd64 || appengine || !gc || purego
-// +build !amd64 appengine !gc purego
+//go:build (!amd64 && !arm64) || appengine || !gc || purego
+// +build !amd64,!arm64 appengine !gc purego
 
 package xxhash
 