From aac36dc8c83a49f7804ea0241bcf04e410e54409 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 5 Feb 2024 02:35:20 -0800 Subject: [PATCH] zstd: Fix incorrect repeat coding in best mode (#923) When extending back repeats we may encounter the [`literals_length = 0`](https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets]) exception, which changes the meaning of offsets. This means the repeat matches the wrong offset. There is little gain extending repeats back anyway, so just avoid it. Fixes #922 --- zstd/enc_best.go | 37 +++++++++++++++++++------------- zstd/testdata/comp-crashers.zip | Bin 1507901 -> 1509112 bytes 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/zstd/enc_best.go b/zstd/enc_best.go index c81a15357a..87f42879a8 100644 --- a/zstd/enc_best.go +++ b/zstd/enc_best.go @@ -201,14 +201,6 @@ encodeLoop: if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first { return } - if debugAsserts { - if offset >= s { - panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff)) - } - if !bytes.Equal(src[s:s+4], src[offset:offset+4]) { - panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first)) - } - } // Try to quick reject if we already have a long match. if m.length > 16 { left := len(src) - int(m.s+m.length) @@ -227,8 +219,10 @@ encodeLoop: } } l := 4 + e.matchlen(s+4, offset+4, src) - if true { + if m.rep <= 0 { // Extend candidate match backwards as far as possible. + // Do not extend repeats as we can assume they are optimal + // and offsets change if s == nextEmit. tMin := s - e.maxMatchOff if tMin < 0 { tMin = 0 @@ -239,7 +233,14 @@ encodeLoop: l++ } } - + if debugAsserts { + if offset >= s { + panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff)) + } + if !bytes.Equal(src[s:s+l], src[offset:offset+l]) { + panic(fmt.Sprintf("second match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first)) + } + } cand := match{offset: offset, s: s, length: l, rep: rep} cand.estBits(bitsPerByte) if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 { @@ -336,24 +337,31 @@ encodeLoop: } if debugAsserts { + if best.offset >= best.s { + panic(fmt.Sprintf("best.offset > s: %d >= %d", best.offset, best.s)) + } + if best.s < nextEmit { + panic(fmt.Sprintf("s %d < nextEmit %d", best.s, nextEmit)) + } + if best.offset < s-e.maxMatchOff { + panic(fmt.Sprintf("best.offset < s-e.maxMatchOff: %d < %d", best.offset, s-e.maxMatchOff)) + } if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) { panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length])) } } // We have a match, we can store the forward value + s = best.s if best.rep > 0 { var seq seq seq.matchLen = uint32(best.length - zstdMinMatch) - if debugAsserts && s < nextEmit { - panic("s < nextEmit") - } addLiterals(&seq, best.s) // Repeat. If bit 4 is set, this is a non-lit repeat. seq.offset = uint32(best.rep & 3) if debugSequences { - println("repeat sequence", seq, "next s:", s) + println("repeat sequence", seq, "next s:", best.s, "off:", best.s-best.offset) } blk.sequences = append(blk.sequences, seq) @@ -396,7 +404,6 @@ encodeLoop: // A 4-byte match has been found. Update recent offsets. // We'll later see if more than 4 bytes. - s = best.s t := best.offset offset1, offset2, offset3 = s-t, offset1, offset2 diff --git a/zstd/testdata/comp-crashers.zip b/zstd/testdata/comp-crashers.zip index 44c661cd50669beff0f82810ce2d2d956d8c4cb7..3f22c78bd421d322b23b90f4d7b4ff1f4a97a9d3 100644 GIT binary patch delta 1303 zcmV+y1?c*{gk1QIT!4fDgaU*Egam{Iga(8Mgb0KQgbIWUgbaiYgbsucgb;)ggc5`k zv=kQBe****00ICA06n%sSU%;0P*nr~0H_oI01W^D0Bvb*X>DnGWn?aNcyyguJB}4G z5Nx|7!aW1gChrbN9DoBLK}HBUw*XS2)k?(N1jH5I5fC8(Gh@4}s^=xk?C$%_+U|CD zb#-}wx&7xIi?h?R9(e?K#|9?u9#}o$cq5H&@WMwYON}DOb(~{fdr^rNlvY z1jcO)4-G*H+So#1z6Ff}Or$3LxQpdN8>12h(>cs_6C^XiX=$3zv;$ltU&R4HAa=aI ze?`Cm#w5_2KlEem>`me*qq_e+D8LH3GysXTCV?EU&!B;y|L73Z2-KF(IBzhmDG0ST zj@do^jriH%HMn-TefavYme!<5oL7cmFpFAEBU!dR4H6^|P-3N?A#j5jU`zx?&Fd<|(KQBu%oCKzE1PfJgpj49UdCX`8oR$uD2;wqRM(C@E4e0b=qt&!)bcgzyhNv_ZwH#xH zJga{orw6MmZn9suQirv*WYMr7Da6OmkLBn!w+-c@@CfGmcMO(5H*d7Xha|=2f9F*> zFe}Z8kUa{T=j1QTg>>3XLPKp3aI;oAkca_Nz`5c3*ojyYVx=Q3SvsB$*4RRQC^gLX zWY~s_4x`Di&x!{$#k&$r^ENJxK^mSh*!cHDtri_SsYhA}N;`0m-BR9c(xnp@tOk`= z6c`dg>m!ilaPr-J{~gj=Vd{e)e@gAzE;!wws72mwG^xe)W|(%t!QiNpg*iGz?Ivp@ z@YJg$p-+>awJkDeluE?qS>YND_c8*7rHIL0laM7}6Ii znd2ouQZ+hpWwW{{q_7dN0x(|XIN}af^@Owe;)J1o;;msW=R6D4)|U7-e*;boFdb}n zZvZ_TTyuGCbs#j-Yv*Z@eQ^g=nu3NjW=EcK0*Z2$F0#>t5Tkivi3; ztFlC%QfD&PQ!m+%25KR72^zcqE?72%9c!@w8Kd=$-WhOfwrN4a9^LBY2HYMmf zgBj|_@_$88@XTwhZNQnhZNQow-nYE6)aBy0u%rg00ICA06n%sSU%;0 zP*nr~0H_oI01W^D000000000W0000K))WA3X>Mt4X?kU3E_8TwP)h{{00000c?Nj~ NPAmcdl-d*k003`WRAm4F delta 84 zcmey-8?(11rlEzgg{g(Pg{6hHg{_6Xg`3B$!^9wRO$5jT0L7*mDgXcg