Skip to content

Commit

Permalink
s2: Do 2 overlapping match checks (#839)
Browse files Browse the repository at this point in the history
Mainly faster on long matches, but has little to no regression on short matches.
  • Loading branch information
klauspost authored Jul 27, 2023
1 parent 895291c commit c1dcc38
Show file tree
Hide file tree
Showing 2 changed files with 1,230 additions and 435 deletions.
49 changes: 37 additions & 12 deletions s2/_generate/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -2743,16 +2743,48 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re
}
Label("avx2_continue_" + name)

Label("matchlen_loopback_16_" + name)
tmp2 := GP64()
CMPL(len.As32(), U8(16))
JB(LabelRef("matchlen_match8_" + name))
MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
MOVQ(Mem{Base: a, Index: matched, Scale: 1, Disp: 8}, tmp2)
XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
JNZ(LabelRef("matchlen_bsf_8_" + name))
XORQ(Mem{Base: b, Index: matched, Scale: 1, Disp: 8}, tmp2)
JNZ(LabelRef("matchlen_bsf_16" + name))
// All 16 bytes matched, update and loop.
LEAL(Mem{Base: len, Disp: -16}, len.As32())
LEAL(Mem{Base: matched, Disp: 16}, matched)
JMP(LabelRef("matchlen_loopback_16_" + name))

Label("matchlen_bsf_16" + name)
// Not all match.
Comment("#ifdef GOAMD64_v3")
// 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c
// 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c
TZCNTQ(tmp2, tmp2)
Comment("#else")
BSFQ(tmp2, tmp2)
Comment("#endif")

SARQ(U8(3), tmp2)
LEAL(Mem{Base: matched, Index: tmp2, Scale: 1, Disp: 8}, matched)
JMP(end)

Label("matchlen_match8_" + name)
CMPL(len.As32(), U8(8))
JB(LabelRef("matchlen_match4_" + name))

Label("matchlen_loopback_" + name)
MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp)
XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp)
TESTQ(tmp, tmp)
JZ(LabelRef("matchlen_loop_" + name))
// Not all match.
JNZ(LabelRef("matchlen_bsf_8_" + name))
// All 8 bytes matched, update and continue with the sub-8-byte tail checks.
LEAL(Mem{Base: len, Disp: -8}, len.As32())
LEAL(Mem{Base: matched, Disp: 8}, matched)
JMP(LabelRef("matchlen_match4_" + name))
Label("matchlen_bsf_8_" + name)

// Not all match.
Comment("#ifdef GOAMD64_v3")
// 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c
// 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c
Expand All @@ -2765,13 +2797,6 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re
LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched)
JMP(end)

// All 8 byte matched, update and loop.
Label("matchlen_loop_" + name)
LEAL(Mem{Base: len, Disp: -8}, len.As32())
LEAL(Mem{Base: matched, Disp: 8}, matched)
CMPL(len.As32(), U8(8))
JAE(LabelRef("matchlen_loopback_" + name))

// Less than 8 bytes left.
// Test 4 bytes...
Label("matchlen_match4_" + name)
Expand Down
Loading

0 comments on commit c1dcc38

Please sign in to comment.