From 60e376bdee2f739e576b62c89e99d13b035d30cf Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Mon, 9 Jan 2023 14:33:43 +0100 Subject: [PATCH] huff0: Assembler improvements (#736) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Main changes: * Compute out[id * dstEvery + i] statically. This shaves four instructions off the main loops. (It also frees up a register.) * Track "exhausted" by addition instead of OR. This gets rid of an additional instruction. The variable is now also zeroed inside the loop as a dependency hint. Benchmark results show small speedups on some datasets: ``` name old speed new speed delta Decompress1XTable/digits-8 350MB/s ± 0% 350MB/s ± 1% ~ (p=0.764 n=10+9) Decompress1XTable/gettysburg-8 270MB/s ± 1% 268MB/s ± 1% -0.72% (p=0.001 n=10+10) Decompress1XTable/twain-8 329MB/s ± 1% 328MB/s ± 0% ~ (p=0.035 n=10+9) Decompress1XTable/low-ent.10k-8 387MB/s ± 1% 386MB/s ± 0% ~ (p=0.027 n=10+8) Decompress1XTable/superlow-ent-10k-8 377MB/s ± 0% 375MB/s ± 0% -0.48% (p=0.000 n=10+10) Decompress1XTable/crash2-8 17.0MB/s ± 0% 16.9MB/s ± 0% -0.36% (p=0.004 n=9+10) Decompress1XTable/endzerobits-8 53.3MB/s ± 0% 53.0MB/s ± 0% -0.55% (p=0.000 n=10+9) Decompress1XTable/endnonzero-8 11.3MB/s ± 0% 11.3MB/s ± 1% ~ (p=0.060 n=10+10) Decompress1XTable/case1-8 22.0MB/s ± 0% 21.9MB/s ± 1% ~ (p=0.015 n=9+9) Decompress1XTable/case2-8 18.1MB/s ± 1% 18.1MB/s ± 1% ~ (p=0.202 n=10+9) Decompress1XTable/case3-8 19.1MB/s ± 1% 19.2MB/s ± 1% ~ (p=0.056 n=9+10) Decompress1XTable/pngdata.001-8 374MB/s ± 0% 374MB/s ± 0% ~ (p=0.148 n=10+10) Decompress1XTable/normcount2-8 54.4MB/s ± 1% 54.4MB/s ± 1% ~ (p=0.617 n=10+10) Decompress1XNoTable/digits/100-8 280MB/s ± 0% 280MB/s ± 1% ~ (p=0.951 n=9+10) Decompress1XNoTable/digits/10000-8 366MB/s ± 1% 367MB/s ± 0% ~ (p=0.090 n=10+9) Decompress1XNoTable/digits/262143-8 348MB/s ± 1% 349MB/s ± 0% ~ (p=0.043 n=10+10) Decompress1XNoTable/gettysburg/100-8 276MB/s ± 
0% 277MB/s ± 1% +0.44% (p=0.009 n=10+10) Decompress1XNoTable/gettysburg/10000-8 363MB/s ± 1% 363MB/s ± 0% ~ (p=0.041 n=10+7) Decompress1XNoTable/gettysburg/262143-8 349MB/s ± 1% 350MB/s ± 0% ~ (p=0.123 n=10+10) Decompress1XNoTable/twain/100-8 267MB/s ± 0% 268MB/s ± 0% ~ (p=0.052 n=10+10) Decompress1XNoTable/twain/10000-8 357MB/s ± 3% 363MB/s ± 0% +1.74% (p=0.000 n=10+10) Decompress1XNoTable/twain/262143-8 320MB/s ± 2% 329MB/s ± 0% +3.09% (p=0.000 n=10+10) Decompress1XNoTable/low-ent.10k/100-8 183MB/s ± 1% 184MB/s ± 0% ~ (p=0.211 n=9+10) Decompress1XNoTable/low-ent.10k/10000-8 377MB/s ± 3% 385MB/s ± 1% +2.14% (p=0.000 n=10+10) Decompress1XNoTable/low-ent.10k/262143-8 386MB/s ± 1% 389MB/s ± 1% +0.84% (p=0.005 n=10+10) Decompress1XNoTable/superlow-ent-10k/262143-8 382MB/s ± 2% 389MB/s ± 1% +1.89% (p=0.001 n=10+10) Decompress1XNoTable/crash2/100-8 276MB/s ± 2% 278MB/s ± 0% ~ (p=0.180 n=10+8) Decompress1XNoTable/crash2/10000-8 373MB/s ± 1% 374MB/s ± 1% ~ (p=0.315 n=10+10) Decompress1XNoTable/crash2/262143-8 373MB/s ± 1% 375MB/s ± 0% ~ (p=0.165 n=10+8) Decompress1XNoTable/endzerobits/100-8 184MB/s ± 0% 184MB/s ± 1% ~ (p=0.845 n=9+9) Decompress1XNoTable/endzerobits/10000-8 384MB/s ± 1% 386MB/s ± 0% +0.61% (p=0.007 n=10+10) Decompress1XNoTable/endzerobits/262143-8 387MB/s ± 2% 389MB/s ± 0% ~ (p=0.963 n=9+8) Decompress1XNoTable/endnonzero/100-8 181MB/s ± 2% 183MB/s ± 0% ~ (p=0.017 n=9+10) Decompress1XNoTable/endnonzero/10000-8 385MB/s ± 0% 382MB/s ± 1% -0.88% (p=0.001 n=8+10) Decompress1XNoTable/endnonzero/262143-8 387MB/s ± 1% 385MB/s ± 2% ~ (p=0.143 n=10+10) Decompress1XNoTable/case1/100-8 278MB/s ± 2% 282MB/s ± 1% ~ (p=0.013 n=10+9) Decompress1XNoTable/case1/10000-8 373MB/s ± 1% 373MB/s ± 0% ~ (p=0.274 n=10+8) Decompress1XNoTable/case1/262143-8 374MB/s ± 1% 374MB/s ± 0% ~ (p=0.589 n=10+9) Decompress1XNoTable/case2/100-8 274MB/s ± 0% 274MB/s ± 0% -0.26% (p=0.002 n=10+9) Decompress1XNoTable/case2/10000-8 378MB/s ± 0% 377MB/s ± 0% ~ (p=0.093 n=10+10) 
Decompress1XNoTable/case2/262143-8 377MB/s ± 1% 376MB/s ± 1% ~ (p=0.225 n=10+10) Decompress1XNoTable/case3/100-8 266MB/s ± 0% 265MB/s ± 0% -0.20% (p=0.007 n=10+9) Decompress1XNoTable/case3/10000-8 371MB/s ± 0% 372MB/s ± 0% ~ (p=0.211 n=10+9) Decompress1XNoTable/case3/262143-8 373MB/s ± 0% 374MB/s ± 0% ~ (p=0.073 n=10+10) Decompress1XNoTable/pngdata.001/100-8 239MB/s ± 0% 239MB/s ± 0% ~ (p=0.889 n=9+10) Decompress1XNoTable/pngdata.001/10000-8 384MB/s ± 0% 384MB/s ± 0% ~ (p=0.228 n=10+8) Decompress1XNoTable/pngdata.001/262143-8 377MB/s ± 0% 379MB/s ± 0% +0.56% (p=0.000 n=10+10) Decompress1XNoTable/normcount2/100-8 281MB/s ± 1% 282MB/s ± 1% ~ (p=0.015 n=10+10) Decompress1XNoTable/normcount2/10000-8 368MB/s ± 0% 370MB/s ± 0% +0.37% (p=0.004 n=10+10) Decompress1XNoTable/normcount2/262143-8 371MB/s ± 0% 371MB/s ± 0% ~ (p=0.034 n=8+10) Decompress4XNoTable/digits/100-8 200MB/s ± 1% 201MB/s ± 0% ~ (p=0.274 n=8+10) Decompress4XNoTable/digits/10000-8 603MB/s ± 0% 622MB/s ± 1% +3.20% (p=0.000 n=8+10) Decompress4XNoTable/digits/262143-8 578MB/s ± 0% 595MB/s ± 1% +2.87% (p=0.000 n=8+10) Decompress4XNoTable/gettysburg/100-8 260MB/s ± 0% 260MB/s ± 1% ~ (p=0.011 n=8+10) Decompress4XNoTable/gettysburg/10000-8 643MB/s ± 0% 657MB/s ± 1% +2.19% (p=0.000 n=10+9) Decompress4XNoTable/gettysburg/262143-8 572MB/s ± 0% 589MB/s ± 0% +2.93% (p=0.000 n=8+10) Decompress4XNoTable/twain/100-8 206MB/s ± 1% 206MB/s ± 1% ~ (p=0.436 n=10+10) Decompress4XNoTable/twain/10000-8 639MB/s ± 1% 653MB/s ± 1% +2.25% (p=0.000 n=10+10) Decompress4XNoTable/twain/262143-8 516MB/s ± 0% 522MB/s ± 1% +1.09% (p=0.004 n=10+10) Decompress4XNoTable/low-ent.10k/100-8 207MB/s ± 1% 207MB/s ± 0% ~ (p=1.000 n=10+9) Decompress4XNoTable/low-ent.10k/10000-8 631MB/s ± 0% 653MB/s ± 0% +3.42% (p=0.000 n=10+9) Decompress4XNoTable/low-ent.10k/262143-8 685MB/s ± 1% 696MB/s ± 0% +1.61% (p=0.000 n=10+10) Decompress4XNoTable/superlow-ent-10k/262143-8 684MB/s ± 1% 695MB/s ± 1% +1.51% (p=0.000 n=9+10) Decompress4XNoTable/case1/100-8 
208MB/s ± 1% 207MB/s ± 0% ~ (p=0.353 n=10+10) Decompress4XNoTable/case1/10000-8 601MB/s ± 0% 621MB/s ± 1% +3.22% (p=0.000 n=10+10) Decompress4XNoTable/case1/262143-8 613MB/s ± 1% 632MB/s ± 0% +3.14% (p=0.000 n=10+10) Decompress4XNoTable/case2/100-8 210MB/s ± 2% 208MB/s ± 2% ~ (p=0.315 n=10+9) Decompress4XNoTable/case2/10000-8 618MB/s ± 0% 636MB/s ± 0% +2.95% (p=0.000 n=10+10) Decompress4XNoTable/case2/262143-8 635MB/s ± 0% 651MB/s ± 0% +2.56% (p=0.000 n=7+10) Decompress4XNoTable/case3/100-8 199MB/s ± 1% 200MB/s ± 1% ~ (p=0.055 n=10+10) Decompress4XNoTable/case3/10000-8 615MB/s ± 0% 633MB/s ± 1% +2.94% (p=0.000 n=10+10) Decompress4XNoTable/case3/262143-8 620MB/s ± 0% 639MB/s ± 1% +3.00% (p=0.000 n=10+10) Decompress4XNoTable/pngdata.001/100-8 212MB/s ± 0% 211MB/s ± 1% ~ (p=0.211 n=10+9) Decompress4XNoTable/pngdata.001/10000-8 649MB/s ± 0% 667MB/s ± 1% +2.76% (p=0.000 n=10+10) Decompress4XNoTable/pngdata.001/262143-8 646MB/s ± 0% 660MB/s ± 0% +2.28% (p=0.000 n=9+10) Decompress4XNoTable/normcount2/100-8 261MB/s ± 1% 262MB/s ± 1% ~ (p=0.031 n=9+9) Decompress4XNoTable/normcount2/10000-8 589MB/s ± 1% 613MB/s ± 0% +3.99% (p=0.000 n=10+9) Decompress4XNoTable/normcount2/262143-8 585MB/s ± 3% 617MB/s ± 1% +5.57% (p=0.000 n=10+10) Decompress4XNoTableTableLog8/digits-8 579MB/s ± 2% 610MB/s ± 0% +5.33% (p=0.000 n=10+10) Decompress4XTable/digits-8 584MB/s ± 1% 607MB/s ± 1% +3.89% (p=0.000 n=10+10) Decompress4XTable/gettysburg-8 370MB/s ± 0% 373MB/s ± 1% +0.67% (p=0.009 n=10+10) Decompress4XTable/twain-8 512MB/s ± 2% 523MB/s ± 1% +2.08% (p=0.000 n=9+10) Decompress4XTable/low-ent.10k-8 656MB/s ± 1% 677MB/s ± 1% +3.21% (p=0.000 n=10+10) Decompress4XTable/superlow-ent-10k-8 603MB/s ± 4% 626MB/s ± 1% +3.91% (p=0.000 n=9+10) Decompress4XTable/case1-8 21.1MB/s ± 0% 21.0MB/s ± 0% -0.55% (p=0.000 n=9+9) Decompress4XTable/case2-8 17.6MB/s ± 0% 17.6MB/s ± 1% ~ (p=0.736 n=9+10) Decompress4XTable/case3-8 18.7MB/s ± 1% 18.7MB/s ± 1% ~ (p=0.642 n=10+10) Decompress4XTable/pngdata.001-8 648MB/s ± 
0% 657MB/s ± 0% +1.50% (p=0.000 n=10+8) Decompress4XTable/normcount2-8 49.7MB/s ± 1% 49.7MB/s ± 1% ~ (p=0.839 n=10+10) [Geo mean] 271MB/s 274MB/s +0.96% ``` --- huff0/_generate/gen.go | 106 +++---- huff0/decompress_amd64.s | 584 +++++++++++++++++++-------------------- 2 files changed, 340 insertions(+), 350 deletions(-) diff --git a/huff0/_generate/gen.go b/huff0/_generate/gen.go index dd111bd8c3..900cba8168 100644 --- a/huff0/_generate/gen.go +++ b/huff0/_generate/gen.go @@ -47,14 +47,11 @@ func (d decompress4x) generateProcedure(name string) { Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") Pragma("noescape") - exhausted := GP64() - XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - + exhausted := GP8() + buffer := GP64() limit := GP64() - bufferOrigin := GP64() peekBits := GP64() - buffer := GP64() dstEvery := GP64() table := GP64() @@ -64,7 +61,7 @@ func (d decompress4x) generateProcedure(name string) { { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("out"), buffer) Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) @@ -74,27 +71,26 @@ func (d decompress4x) generateProcedure(name string) { Comment("Main loop") Label("main_loop") - MOVQ(bufferOrigin, buffer) - // Check if we have space + // Check if we have space. We could zero exhausted outside the loop, + // but doing it here is a hint to the CPU that there's no dependency + // on the previous iteration's value. 
+ XORL(exhausted.As32(), exhausted.As32()) CMPQ(buffer, limit) SETGE(exhausted.As8()) - d.decodeTwoValues(0, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(1, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(2, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeTwoValues(3, br, peekBits, table, buffer, exhausted) + d.decodeTwoValues(0, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(1, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(2, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeTwoValues(3, br, peekBits, table, buffer, dstEvery, exhausted) - ADDQ(U8(2), bufferOrigin) // off += 2 + ADDQ(U8(2), buffer) // off += 2 - TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + TESTB(exhausted, exhausted) // any br[i].ofs < 4? JZ(LabelRef("main_loop")) { ctx := Dereference(Param("ctx")) ctxout, _ := ctx.Field("out").Resolve() - decoded := bufferOrigin + decoded := buffer SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 @@ -105,15 +101,14 @@ func (d decompress4x) generateProcedure(name string) { } // TODO [wmu]: I believe it's doable in avo, but can't figure out how to deal -// -// with arbitrary pointers to a given type +// with arbitrary pointers to a given type const bitReader_in = 0 const bitReader_off = bitReader_in + 3*8 // {ptr, len, cap} const bitReader_value = bitReader_off + 8 const bitReader_bitsRead = bitReader_value + 8 const bitReader__size = bitReader_bitsRead + 8 -func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { +func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, dstEvery, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) val := GP64() @@ -149,7 +144,7 @@ func (d decompress4x) decodeTwoValues(id int, br, peekBits, table, buffer, exhau Comment("these two writes get 
coalesced") Comment("out[id * dstEvery + 0] = uint8(v0.entry >> 8)") Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") - MOVW(out.As16(), Mem{Base: buffer}) + MOVW(out.As16(), bufferIndex(id, buffer, dstEvery)) Comment("update the bitreader structure") offset := id * bitReader__size @@ -163,14 +158,11 @@ func (d decompress4x) generateProcedure4x8bit(name string) { Doc(name+" is an x86 assembler implementation of Decompress4X when tablelog > 8.decodes a sequence", "") Pragma("noescape") - exhausted := GP64() // Fixed since we need 8H - XORQ(exhausted.As64(), exhausted.As64()) // exhausted = false - - bufferOrigin := GP64() + exhausted := GP8() + buffer := GP64() limit := GP64() peekBits := GP64() - buffer := GP64() dstEvery := GP64() table := GP64() @@ -180,7 +172,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { { ctx := Dereference(Param("ctx")) Load(ctx.Field("peekBits"), peekBits) - Load(ctx.Field("out"), bufferOrigin) + Load(ctx.Field("out"), buffer) Load(ctx.Field("limit"), limit) Load(ctx.Field("dstEvery"), dstEvery) Load(ctx.Field("tbl"), table) @@ -190,27 +182,26 @@ func (d decompress4x) generateProcedure4x8bit(name string) { Comment("Main loop") Label("main_loop") - MOVQ(bufferOrigin, buffer) - // Check if we have space + // Check if we have space. We could zero exhausted outside the loop, + // but doing it here is a hint to the CPU that there's no dependency + // on the previous iteration's value. 
+ XORL(exhausted.As32(), exhausted.As32()) CMPQ(buffer, limit) - SETGE(exhausted.As8()) - d.decodeFourValues(0, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(1, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(2, br, peekBits, table, buffer, exhausted) - ADDQ(dstEvery, buffer) - d.decodeFourValues(3, br, peekBits, table, buffer, exhausted) + SETGE(exhausted) + d.decodeFourValues(0, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(1, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(2, br, peekBits, table, buffer, dstEvery, exhausted) + d.decodeFourValues(3, br, peekBits, table, buffer, dstEvery, exhausted) - ADDQ(U8(4), bufferOrigin) // off += 4 + ADDQ(U8(4), buffer) // off += 4 - TESTB(exhausted.As8(), exhausted.As8()) // any br[i].ofs < 4? + TESTB(exhausted, exhausted) // any br[i].ofs < 4? JZ(LabelRef("main_loop")) { ctx := Dereference(Param("ctx")) ctxout, _ := ctx.Field("out").Resolve() - decoded := bufferOrigin + decoded := buffer SUBQ(ctxout.Addr, decoded) SHLQ(U8(2), decoded) // decoded *= 4 @@ -219,7 +210,7 @@ func (d decompress4x) generateProcedure4x8bit(name string) { RET() } -func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exhausted reg.GPVirtual) { +func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, dstEvery, exhausted reg.GPVirtual) { brValue, brBitsRead := d.fillFast32(id, 32, br, exhausted) decompress := func(valID int, outByte reg.Register) { @@ -253,7 +244,7 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha Comment("out[id * dstEvery + 1] = uint8(v1.entry >> 8)") Comment("out[id * dstEvery + 3] = uint8(v2.entry >> 8)") Comment("out[id * dstEvery + 4] = uint8(v3.entry >> 8)") - MOVL(out.As32(), Mem{Base: buffer}) + MOVL(out.As32(), bufferIndex(id, buffer, dstEvery)) Comment("update the bitreader structure") offset := id * bitReader__size @@ 
-261,6 +252,21 @@ func (d decompress4x) decodeFourValues(id int, br, peekBits, table, buffer, exha MOVB(brBitsRead.As8(), Mem{Base: br, Disp: offset + bitReader_bitsRead}) } +func bufferIndex(id int, buffer, dstEvery reg.GPVirtual) Mem { + switch id { + case 0: + return Mem{Base: buffer} + case 1, 2: + return Mem{Base: buffer, Index: dstEvery, Scale: byte(id)} + case 3: + stride3 := GP64() // stride3 := 3*dstEvery + LEAQ(Mem{Base: dstEvery, Index: dstEvery, Scale: 2}, stride3) + return Mem{Base: buffer, Index: stride3, Scale: 1} + default: + panic("id must be >=0, <4") + } +} + func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) (brValue, brBitsRead reg.GPVirtual) { if atLeast > 32 { panic(fmt.Sprintf("at least (%d) cannot be >32", atLeast)) @@ -297,11 +303,11 @@ func (d decompress4x) fillFast32(id, atLeast int, br, exhausted reg.GPVirtual) ( MOVQ(brOffset, Mem{Base: br, Disp: offset + bitReader_off}) ORQ(tmp.As64(), brValue) { - Commentf("exhausted = exhausted || (br%d.off < 4)", id) + Commentf("exhausted += (br%d.off < 4)", id) CMPQ(brOffset, U8(4)) - tmp = GP64() - SETLT(tmp.As8()) - ORB(tmp.As8(), exhausted.As8()) + // Add carry from brOffset-4. We do this at most four times per iteration, + // and every iteration resets exhausted's lower byte, so it doesn't overflow. 
+ ADCB(I8(0), exhausted) } Label("skip_fill" + strconv.Itoa(id)) @@ -409,7 +415,7 @@ func (d decompress1x) generateProcedure(name string) { outCap := GP64() Load(ctx.Field("outCap"), outCap) CMPQ(outCap, U8(4)) - JB(LabelRef("error_max_decoded_size_exeeded")) + JB(LabelRef("error_max_decoded_size_exceeded")) LEAQ(Mem{Base: buffer, Index: outCap, Scale: 1}, bufferEnd) @@ -432,7 +438,7 @@ func (d decompress1x) generateProcedure(name string) { tmp := GP64() LEAQ(Mem{Base: buffer, Disp: 4}, tmp) CMPQ(tmp, bufferEnd) - JGE(LabelRef("error_max_decoded_size_exeeded")) + JGE(LabelRef("error_max_decoded_size_exceeded")) } decompress := func(id int, out reg.Register) { @@ -474,7 +480,7 @@ func (d decompress1x) generateProcedure(name string) { RET() Comment("Report error") - Label("error_max_decoded_size_exeeded") + Label("error_max_decoded_size_exceeded") { ctx := Dereference(Param("ctx")) tmp := GP64() diff --git a/huff0/decompress_amd64.s b/huff0/decompress_amd64.s index 8d2187a2ce..c4c7ab2d1f 100644 --- a/huff0/decompress_amd64.s +++ b/huff0/decompress_amd64.s @@ -4,360 +4,349 @@ // func decompress4x_main_loop_amd64(ctx *decompress4xContext) TEXT ·decompress4x_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), AX MOVBQZX 8(AX), DI - MOVQ 16(AX), SI - MOVQ 48(AX), BX - MOVQ 24(AX), R9 - MOVQ 32(AX), R10 - MOVQ (AX), R11 + MOVQ 16(AX), BX + MOVQ 48(AX), SI + MOVQ 24(AX), R8 + MOVQ 32(AX), R9 + MOVQ (AX), R10 // Main loop main_loop: - MOVQ SI, R8 - CMPQ R8, BX + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), AX - SUBQ $0x20, R13 + MOVQ 24(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ (R11), R14 + MOVQ (R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 24(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ 
CL, R13 + MOVQ AX, 24(R10) + ORQ R13, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), AX - SUBQ $0x20, R13 + MOVQ 72(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 48(R11), R14 + MOVQ 48(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 72(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 72(R10) + ORQ R13, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + 
SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + MOVW AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), AX - SUBQ $0x20, R13 + MOVQ 120(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 96(R11), R14 + MOVQ 96(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 120(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 120(R10) + ORQ R13, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) 
- MOVW AX, (R8) + MOVW AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), AX - SUBQ $0x20, R13 + MOVQ 168(R10), AX + SUBQ $0x20, R12 SUBQ $0x04, AX - MOVQ 144(R11), R14 + MOVQ 144(R10), R13 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (AX)(R14*1), R14 - MOVQ R13, CX - SHLQ CL, R14 - MOVQ AX, 168(R11) - ORQ R14, R12 + MOVL (AX)(R13*1), R13 + MOVQ R12, CX + SHLQ CL, R13 + MOVQ AX, 168(R10) + ORQ R13, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ AX, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ AX, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) MOVQ DI, CX - MOVQ R12, R14 - SHRQ CL, R14 + MOVQ R11, R13 + SHRQ CL, R13 // v1 := table[val1&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry)) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // these two writes get coalesced // out[id * dstEvery + 0] = uint8(v0.entry >> 8) // out[id * dstEvery + 1] = uint8(v1.entry >> 8) - MOVW AX, (R8) + LEAQ (R8)(R8*2), CX + MOVW AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) - ADDQ $0x02, SI + MOVQ R11, 176(R10) + MOVB R12, 184(R10) + ADDQ $0x02, BX TESTB DL, DL JZ main_loop MOVQ ctx+0(FP), AX - SUBQ 16(AX), SI - SHLQ $0x02, SI - MOVQ SI, 40(AX) + SUBQ 16(AX), BX + SHLQ $0x02, BX + MOVQ BX, 40(AX) RET // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext) TEXT 
·decompress4x_8b_main_loop_amd64(SB), $0-8 - XORQ DX, DX - // Preload values MOVQ ctx+0(FP), CX MOVBQZX 8(CX), DI MOVQ 16(CX), BX MOVQ 48(CX), SI - MOVQ 24(CX), R9 - MOVQ 32(CX), R10 - MOVQ (CX), R11 + MOVQ 24(CX), R8 + MOVQ 32(CX), R9 + MOVQ (CX), R10 // Main loop main_loop: - MOVQ BX, R8 - CMPQ R8, SI + XORL DX, DX + CMPQ BX, SI SETGE DL // br0.fillFast32() - MOVQ 32(R11), R12 - MOVBQZX 40(R11), R13 - CMPQ R13, $0x20 + MOVQ 32(R10), R11 + MOVBQZX 40(R10), R12 + CMPQ R12, $0x20 JBE skip_fill0 - MOVQ 24(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ (R11), R15 + MOVQ 24(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ (R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 24(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 24(R10) + ORQ R14, R11 - // exhausted = exhausted || (br0.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br0.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill0: // val0 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br0.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, 
R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br0.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -365,88 +354,86 @@ skip_fill0: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX) // update the bitreader structure - MOVQ R12, 32(R11) - MOVB R13, 40(R11) - ADDQ R9, R8 + MOVQ R11, 32(R10) + MOVB R12, 40(R10) // br1.fillFast32() - MOVQ 80(R11), R12 - MOVBQZX 88(R11), R13 - CMPQ R13, $0x20 + MOVQ 80(R10), R11 + MOVBQZX 88(R10), R12 + CMPQ R12, $0x20 JBE skip_fill1 - MOVQ 72(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 48(R11), R15 + MOVQ 72(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 48(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 72(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 72(R10) + ORQ R14, R11 - // exhausted = exhausted || (br1.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br1.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill1: // val0 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := 
table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br1.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br1.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -454,88 +441,86 @@ skip_fill1: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*1) // update the bitreader structure - MOVQ R12, 80(R11) - MOVB R13, 88(R11) - ADDQ R9, R8 + MOVQ R11, 80(R10) + MOVB R12, 88(R10) // br2.fillFast32() - MOVQ 128(R11), R12 - MOVBQZX 136(R11), R13 - CMPQ R13, $0x20 + MOVQ 128(R10), R11 + MOVBQZX 136(R10), R12 + CMPQ R12, $0x20 JBE skip_fill2 - MOVQ 120(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 96(R11), R15 + MOVQ 120(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 96(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 120(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 120(R10) + ORQ R14, R11 - // exhausted = exhausted || (br2.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br2.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill2: // val0 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), 
CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br2.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br2.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -543,88 +528,86 @@ skip_fill2: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + MOVL AX, (BX)(R8*2) // update the bitreader structure - MOVQ R12, 128(R11) - MOVB R13, 136(R11) - ADDQ R9, R8 + MOVQ R11, 128(R10) + MOVB R12, 136(R10) // br3.fillFast32() - MOVQ 176(R11), R12 - MOVBQZX 184(R11), R13 - CMPQ R13, $0x20 + MOVQ 176(R10), R11 + MOVBQZX 184(R10), R12 + CMPQ R12, $0x20 JBE skip_fill3 - MOVQ 168(R11), R14 - SUBQ $0x20, R13 - SUBQ $0x04, R14 - MOVQ 144(R11), R15 + MOVQ 168(R10), R13 + SUBQ $0x20, R12 + SUBQ $0x04, R13 + MOVQ 144(R10), R14 // b.value |= uint64(low) << (b.bitsRead & 63) - MOVL (R14)(R15*1), R15 - MOVQ R13, CX - SHLQ CL, R15 - MOVQ R14, 168(R11) - ORQ R15, R12 + MOVL (R13)(R14*1), R14 + MOVQ R12, CX + SHLQ CL, R14 + MOVQ R13, 168(R10) + ORQ R14, R11 - // exhausted = exhausted || (br3.off < 4) - CMPQ R14, $0x04 - SETLT AL - ORB AL, DL + // exhausted += (br3.off < 4) + CMPQ R13, $0x04 + ADCB $+0, DL skip_fill3: // val0 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v0 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), 
CX // br3.advance(uint8(v0.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val1 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v1 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v1.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // val2 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v2 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v2.entry) MOVB CH, AH - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 // val3 := br3.peekTopBits(peekBits) - MOVQ R12, R14 + MOVQ R11, R13 MOVQ DI, CX - SHRQ CL, R14 + SHRQ CL, R13 // v3 := table[val0&mask] - MOVW (R10)(R14*2), CX + MOVW (R9)(R13*2), CX // br3.advance(uint8(v3.entry) MOVB CH, AL - SHLQ CL, R12 - ADDB CL, R13 + SHLQ CL, R11 + ADDB CL, R12 BSWAPL AX // these four writes get coalesced @@ -632,11 +615,12 @@ skip_fill3: // out[id * dstEvery + 1] = uint8(v1.entry >> 8) // out[id * dstEvery + 3] = uint8(v2.entry >> 8) // out[id * dstEvery + 4] = uint8(v3.entry >> 8) - MOVL AX, (R8) + LEAQ (R8)(R8*2), CX + MOVL AX, (BX)(CX*1) // update the bitreader structure - MOVQ R12, 176(R11) - MOVB R13, 184(R11) + MOVQ R11, 176(R10) + MOVB R12, 184(R10) ADDQ $0x04, BX TESTB DL, DL JZ main_loop @@ -652,7 +636,7 @@ TEXT ·decompress1x_main_loop_amd64(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -667,7 +651,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -744,7 +728,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: 
+error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX) @@ -757,7 +741,7 @@ TEXT ·decompress1x_main_loop_bmi2(SB), $0-8 MOVQ 16(CX), DX MOVQ 24(CX), BX CMPQ BX, $0x04 - JB error_max_decoded_size_exeeded + JB error_max_decoded_size_exceeded LEAQ (DX)(BX*1), BX MOVQ (CX), SI MOVQ (SI), R8 @@ -772,7 +756,7 @@ main_loop: // Check if we have room for 4 bytes in the output buffer LEAQ 4(DX), CX CMPQ CX, BX - JGE error_max_decoded_size_exeeded + JGE error_max_decoded_size_exceeded // Decode 4 values CMPQ R11, $0x20 @@ -839,7 +823,7 @@ loop_condition: RET // Report error -error_max_decoded_size_exeeded: +error_max_decoded_size_exceeded: MOVQ ctx+0(FP), AX MOVQ $-1, CX MOVQ CX, 40(AX)