-
Notifications
You must be signed in to change notification settings - Fork 125
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Benchmark results on Raspberry Pi 4B, Linux, Go 1.17.1:

name                  old speed       new speed       delta
Sum64/4B-4            180MB/s ± 0%    251MB/s ± 0%    +39.13%  (p=0.000 n=10+10)
Sum64/100B-4          994MB/s ± 0%    1135MB/s ± 0%   +14.25%  (p=0.000 n=10+9)
Sum64/4KB-4           1.92GB/s ± 0%   1.93GB/s ± 0%    +0.43%  (p=0.000 n=10+10)
Sum64/10MB-4          1.88GB/s ± 0%   1.88GB/s ± 0%      ~     (p=0.754 n=10+10)
Sum64String/4B-4      133MB/s ± 4%    228MB/s ± 0%    +71.37%  (p=0.000 n=10+9)
Sum64String/100B-4    949MB/s ± 0%    1103MB/s ± 0%   +16.17%  (p=0.000 n=10+10)
Sum64String/4KB-4     1.92GB/s ± 0%   1.93GB/s ± 0%    +0.40%  (p=0.000 n=9+8)
Sum64String/10MB-4    1.88GB/s ± 0%   1.88GB/s ± 0%      ~     (p=0.146 n=10+8)
DigestBytes/4B-4      61.9MB/s ± 0%   61.9MB/s ± 0%      ~     (p=0.158 n=10+9)
DigestBytes/100B-4    695MB/s ± 0%    719MB/s ± 0%     +3.37%  (p=0.000 n=10+10)
DigestBytes/4KB-4     1.89GB/s ± 0%   1.90GB/s ± 0%    +0.43%  (p=0.000 n=9+10)
DigestBytes/10MB-4    1.88GB/s ± 0%   1.89GB/s ± 0%    +0.92%  (p=0.000 n=10+9)
DigestString/4B-4     58.9MB/s ± 0%   58.5MB/s ± 1%    -0.60%  (p=0.032 n=8+10)
DigestString/100B-4   669MB/s ± 0%    696MB/s ± 1%     +4.05%  (p=0.000 n=10+10)
DigestString/4KB-4    1.89GB/s ± 0%   1.89GB/s ± 0%    +0.34%  (p=0.000 n=10+10)
DigestString/10MB-4   1.88GB/s ± 0%   1.89GB/s ± 0%    +0.90%  (p=0.000 n=10+10)
- Loading branch information
greatroar
committed
Oct 30, 2021
1 parent
532df6a
commit e3a7fe6
Showing
3 changed files
with
196 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
// +build gc,!purego | ||
|
||
#include "textflag.h" | ||
|
||
// Register allocation. | |
// | |
// Symbolic names for the general-purpose registers used by Sum64 and | |
// writeBlocks: | |
//   digest         the d argument of writeBlocks (pointer to the state) | |
//   h              running hash value; Sum64 returns it | |
//   p, len         current input pointer and input length in bytes | |
//   nblocks        loop counter of blocksLoop, initialised to len / 32 | |
//   prime1-prime5  constants loaded from the primes<> table | |
//   v1-v4          the four accumulators updated by blocksLoop | |
//   x1-x4          scratch registers holding input words being mixed in | |
#define digest R1 | |
#define h R2 // Return value. | |
#define p R3 // Input pointer. | |
#define len R4 | |
#define nblocks R5 // len / 32. | |
#define prime1 R7 | |
#define prime2 R8 | |
#define prime3 R9 | |
#define prime4 R10 | |
#define prime5 R11 | |
#define v1 R12 | |
#define v2 R13 | |
#define v3 R14 | |
#define v4 R15 | |
#define x1 R20 | |
#define x2 R21 | |
#define x3 R22 | |
#define x4 R23 | |
|
||
// round(acc, x) mixes the input word x into the accumulator acc: | |
//   acc = rotl(acc + x*prime2, 31) * prime1 | |
// MADD computes acc + x*prime2 in one instruction; ROR by 64-31 is a left | |
// rotation by 31. Only acc is written, x is left unchanged. | |
#define round(acc, x) \ | |
MADD prime2, acc, x, acc \ | |
ROR $64-31, acc \ | |
MUL prime1, acc \ | |
|
||
// x = round(0, x). | |
// | |
// Same computation as round() with a zero accumulator, done in place: | |
//   x = rotl(x*prime2, 31) * prime1 | |
// The argument register is overwritten with the result. | |
#define round0(x) \ | |
MUL prime2, x \ | |
ROR $64-31, x \ | |
MUL prime1, x \ | |
|
||
// mergeRound(x) folds one accumulator into the hash h: | |
//   x = round(0, x) | |
//   h = (h ^ x) * prime1 + prime4 | |
// Note that x is clobbered: after the macro it holds round(0, x), not its | |
// original value. | |
#define mergeRound(x) \ | |
round0(x) \ | |
EOR x, h \ | |
MADD h, prime4, prime1, h \ | |
|
||
// Update v[1-4] with 32-byte blocks. Assumes len >= 32. | |
// | |
// nblocks is set to len / 32 (LSR $5) and counted down to zero. Each | |
// iteration loads four 8-byte words and applies round() to v1-v4 with | |
// them. The first LDP is post-indexed (.P) and advances p by 32, which is | |
// why the second LDP addresses the upper half of the same block as | |
// -16(p). On exit p points just past the last full block; len is not | |
// modified. | |
// | |
// The counter is decremented before it is tested, so entering with | |
// nblocks == 0 (len < 32) would wrap around instead of skipping the loop; | |
// callers must guarantee len >= 32. | |
// | |
// PCALIGN $16 aligns the loop entry to a 16-byte boundary. The macro | |
// defines the label "loop"; in this file it is expanded once in each | |
// function that uses it. | |
#define blocksLoop() \ | |
LSR $5, len, nblocks \ | |
PCALIGN $16 \ | |
loop: \ | |
LDP.P 32(p), (x1, x2) \ | |
round(v1, x1) \ | |
LDP -16(p), (x3, x4) \ | |
round(v2, x2) \ | |
SUB $1, nblocks \ | |
round(v3, x3) \ | |
round(v4, x4) \ | |
CBNZ nblocks, loop \ | |
|
||
|
||
// The primes are repeated here to ensure that they're stored | |
// in a contiguous array, so we can load them with LDP. | |
// | |
// Layout: five 8-byte entries at offsets 0, 8, 16, 24 and 32 (40 bytes in | |
// total), loaded in that order into prime1 .. prime5. The symbol is | |
// file-local (the <> suffix), read-only and contains no pointers. | |
DATA primes<> +0(SB)/8, $11400714785074694791 | |
DATA primes<> +8(SB)/8, $14029467366897019727 | |
DATA primes<>+16(SB)/8, $1609587929392839161 | |
DATA primes<>+24(SB)/8, $9650029242287828579 | |
DATA primes<>+32(SB)/8, $2870177450012600261 | |
GLOBL primes<>(SB), NOPTR+RODATA, $40 | |
|
||
|
||
// func Sum64(b []byte) uint64 | |
// | |
// Computes a 64-bit hash of b in one pass and returns it. | |
// NOTE(review): the constants and mixing steps look like XXH64 with a | |
// zero seed; confirm against the pure-Go implementation. | |
// | |
// Frame: no local stack ($0), 32 bytes of arguments and results: the slice | |
// header at 0-23(FP) (only pointer and length are read) and the result at | |
// 24(FP). | |
// | |
// Structure: | |
//   1. len >= 32: run blocksLoop over all full 32-byte blocks and merge | |
//      the four accumulators into h. Otherwise h starts as prime5. | |
//   2. Add len to h. | |
//   3. Consume the remaining len % 32 bytes. Bits 4..0 of len select | |
//      16-, 8-, 4-, 2- and 1-byte steps, so no tail loop is needed. | |
//   4. Final mixing (xor-shift / multiply) and return. | |
TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32 | |
// p = data pointer of b, len = len(b) (adjacent words of the slice header). | |
LDP b_base+0(FP), (p, len) | |
|
||
LDP primes<> +0(SB), (prime1, prime2) | |
LDP primes<>+16(SB), (prime3, prime4) | |
MOVD primes<>+32(SB), prime5 | |
|
||
// Short inputs skip the block loop entirely. BLO is an unsigned compare. | |
CMP $32, len | |
CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 } | |
BLO afterLoop | |
|
||
// Initial accumulators: | |
//   v1 = prime1 + prime2, v2 = prime2, v3 = 0, v4 = -prime1. | |
ADD prime1, prime2, v1 | |
MOVD prime2, v2 | |
MOVD $0, v3 | |
NEG prime1, v4 | |
|
||
blocksLoop() | |
|
||
// h = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18). | |
// The rotated copies go to x1-x4 so that v1-v4 stay intact for the | |
// mergeRound calls below. | |
ROR $64-1, v1, x1 | |
ROR $64-7, v2, x2 | |
ADD x1, x2 | |
ROR $64-12, v3, x3 | |
ROR $64-18, v4, x4 | |
ADD x3, x4 | |
ADD x2, x4, h | |
|
||
// Fold each accumulator into h (this overwrites v1-v4). | |
mergeRound(v1) | |
mergeRound(v2) | |
mergeRound(v3) | |
mergeRound(v4) | |
|
||
afterLoop: | |
// h += len. From here on p points at the first unprocessed byte. | |
ADD len, h | |
|
||
// Bit 4 of len set: 16 tail bytes are processed as two 8-byte words. | |
TBZ $4, len, try8 | |
LDP.P 16(p), (x1, x2) | |
|
||
// Per 8-byte word k: h = rotl(h ^ round(0, k), 27) * prime1 + prime4. | |
// The rotation is applied to h and to k separately and the results are | |
// xor-ed, using rotl(a ^ b, n) == rotl(a, n) ^ rotl(b, n); "@>" is a | |
// rotate-right of the source operand. Presumably this lets the rotation | |
// of h proceed without waiting for round0 to finish. | |
round0(x1) | |
ROR $64-27, h | |
EOR x1 @> 64-27, h, h | |
MADD h, prime4, prime1, h | |
|
||
round0(x2) | |
ROR $64-27, h | |
EOR x2 @> 64-27, h | |
MADD h, prime4, prime1, h | |
|
||
try8: | |
// Bit 3 of len set: one more 8-byte word, same step as above. | |
TBZ $3, len, try4 | |
MOVD.P 8(p), x1 | |
|
||
round0(x1) | |
ROR $64-27, h | |
EOR x1 @> 64-27, h | |
MADD h, prime4, prime1, h | |
|
||
try4: | |
// Bit 2 of len set: one zero-extended 32-bit word k: | |
//   h = rotl(h ^ k*prime1, 23) * prime2 + prime3. | |
TBZ $2, len, try2 | |
MOVWU.P 4(p), x2 | |
|
||
MUL prime1, x2 | |
ROR $64-23, h | |
EOR x2 @> 64-23, h | |
MADD h, prime3, prime2, h | |
|
||
try2: | |
// Bit 1 of len set: two single bytes, fetched with one 16-bit load and | |
// split into the low byte (x1, first in memory on little-endian arm64) | |
// and the high byte (x2). | |
TBZ $1, len, try1 | |
MOVHU.P 2(p), x3 | |
AND $255, x3, x1 | |
LSR $8, x3, x2 | |
|
||
// Per byte k: h = rotl(h ^ k*prime5, 11) * prime1. | |
MUL prime5, x1 | |
ROR $64-11, h | |
EOR x1 @> 64-11, h | |
MUL prime1, h | |
|
||
MUL prime5, x2 | |
ROR $64-11, h | |
EOR x2 @> 64-11, h | |
MUL prime1, h | |
|
||
try1: | |
// Bit 0 of len set: one final byte, same per-byte step. | |
TBZ $0, len, end | |
MOVBU (p), x4 | |
|
||
MUL prime5, x4 | |
ROR $64-11, h | |
EOR x4 @> 64-11, h | |
MUL prime1, h | |
|
||
end: | |
// Final mixing: | |
//   h ^= h >> 33; h *= prime2; h ^= h >> 29; h *= prime3; h ^= h >> 32. | |
EOR h >> 33, h | |
MUL prime2, h | |
EOR h >> 29, h | |
MUL prime3, h | |
EOR h >> 32, h | |
|
||
MOVD h, ret+24(FP) | |
RET | |
|
||
|
||
// func writeBlocks(d *Digest, b []byte) int | |
// | |
// Runs blocksLoop over every full 32-byte block of b, starting from the | |
// four accumulators read from offsets 0-31 of *d and writing the updated | |
// values back to the same location. Returns len(b) rounded down to a | |
// multiple of 32, i.e. the number of bytes consumed. Any remaining bytes | |
// of b are not read. | |
// | |
// Frame: no local stack ($0), 40 bytes of arguments and results: d at | |
// 0(FP), the slice header of b at 8-31(FP), the result at 32(FP). | |
// | |
// Assumes len(b) >= 32. | |
TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40 | |
// round() only needs prime1 and prime2, so the other primes are not loaded. | |
LDP primes<>(SB), (prime1, prime2) | |
|
||
// Load state. Assume v[1-4] are stored contiguously. | |
// NOTE(review): this relies on the four accumulators being the first | |
// 32 bytes of the Digest struct; verify against its Go definition. | |
MOVD d+0(FP), digest | |
LDP 0(digest), (v1, v2) | |
LDP 16(digest), (v3, v4) | |
|
||
// p = data pointer of b, len = len(b). | |
LDP b_base+8(FP), (p, len) | |
|
||
blocksLoop() | |
|
||
// Store updated state. | |
STP (v1, v2), 0(digest) | |
STP (v3, v4), 16(digest) | |
|
||
// Clear the low five bits of len: the byte count actually processed. | |
BIC $31, len | |
MOVD len, ret+32(FP) | |
RET |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters