[lazy] Skip over incompressible data
For every 256 bytes the lazy match finders process without finding a match,
they increase their step size by 1. So for bytes [0, 256) they search
every position, for bytes [256, 512) they search every other position,
and so on. However, they currently still insert every position into
their hash tables. This is different from fast & dfast, which only
insert the positions they search.
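
In code, that step size is just the distance from the last match divided by
256, plus one. A minimal sketch (`kSearchStrength` is the constant the diff
below uses; its value of 8 follows from the 256-byte increment):

```c
#include <stddef.h>

#define kSearchStrength 8   /* 1 << 8 == 256 bytes per step increase */

/* Step used while no match has been found: bytes [0, 256) since the last
 * match give step 1, [256, 512) give step 2, and so on. */
static size_t lazyStep(const unsigned char* ip, const unsigned char* anchor)
{
    return ((size_t)(ip - anchor) >> kSearchStrength) + 1;
}
```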

This PR changes that: once we've searched 2KB without finding any
matches, at which point we're only searching one in 9 positions, we
stop inserting every position and only insert the positions we search.
The exact cutoff of 2KB isn't terribly important; I've just selected a
cutoff that is reasonably large, to minimize the impact on "normal"
data.

This PR only adds skipping to greedy, lazy, and lazy2, but does not
touch btlazy2.
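
Concretely, skipping mode just latches a flag off that step size. Here is a
minimal, self-contained sketch of the rule this PR adds (the `MatchState`
struct and `advanceOnNoMatch` helper are illustrative stand-ins, not zstd's
API):

```c
#include <stddef.h>

#define kSearchStrength   8   /* step grows by 1 every 256 bytes */
#define kLazySkippingStep 8   /* step > 8 means ~2KB searched without a match */

typedef struct {
    int lazySkipping;   /* stand-in for the new ZSTD_matchState_t.lazySkipping field */
} MatchState;

/* No match of length >= 4 was found at ip: advance by the usual step, and
 * enter skipping mode once the step exceeds kLazySkippingStep. While
 * lazySkipping != 0 the match finders insert only the positions they search;
 * the flag is cleared (and the row hash cache refilled) when a match is found. */
static const unsigned char* advanceOnNoMatch(MatchState* ms,
                                             const unsigned char* ip,
                                             const unsigned char* anchor)
{
    size_t const step = ((size_t)(ip - anchor) >> kSearchStrength) + 1;
    ms->lazySkipping = step > kLazySkippingStep;
    return ip + step;
}
```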

| Dataset | Level | Compiler     | CSize ∆ | Speed ∆ |
|---------|-------|--------------|---------|---------|
| Random  |     5 | clang-14.0.6 |    0.0% |   +704% |
| Random  |     5 | gcc-12.2.0   |    0.0% |   +670% |
| Random  |     7 | clang-14.0.6 |    0.0% |   +679% |
| Random  |     7 | gcc-12.2.0   |    0.0% |   +657% |
| Random  |    12 | clang-14.0.6 |    0.0% |  +1355% |
| Random  |    12 | gcc-12.2.0   |    0.0% |  +1331% |
| Silesia |     5 | clang-14.0.6 | +0.002% |  +0.35% |
| Silesia |     5 | gcc-12.2.0   | +0.002% |  +2.45% |
| Silesia |     7 | clang-14.0.6 | +0.001% |  -1.40% |
| Silesia |     7 | gcc-12.2.0   | +0.007% |  +0.13% |
| Silesia |    12 | clang-14.0.6 | +0.011% | +22.70% |
| Silesia |    12 | gcc-12.2.0   | +0.011% |  -6.68% |
| Enwik8  |     5 | clang-14.0.6 |    0.0% |  -1.02% |
| Enwik8  |     5 | gcc-12.2.0   |    0.0% |  +0.34% |
| Enwik8  |     7 | clang-14.0.6 |    0.0% |  -1.22% |
| Enwik8  |     7 | gcc-12.2.0   |    0.0% |  -0.72% |
| Enwik8  |    12 | clang-14.0.6 |    0.0% | +26.19% |
| Enwik8  |    12 | gcc-12.2.0   |    0.0% |  -5.70% |

The speed difference for clang at level 12 is real, but is probably
caused by some sort of alignment or codegen issue. clang is
significantly slower than gcc before this PR, and gets up to parity
with it after.

I also measured the ratio difference for the HC match finder, and it
looks basically the same as for the row-based match finder. The speedup
on random data looks similar, and performance elsewhere is about
neutral, without the big difference at level 12 for either clang or gcc.
terrelln committed Mar 14, 2023
1 parent 488e45f commit acca036
Showing 3 changed files with 71 additions and 16 deletions.
1 change: 1 addition & 0 deletions lib/compress/zstd_compress.c
@@ -1947,6 +1947,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
}

ms->hashLog3 = hashLog3;
ms->lazySkipping = 0;

ZSTD_invalidateMatchState(ms);

7 changes: 7 additions & 0 deletions lib/compress/zstd_compress_internal.h
@@ -249,6 +249,13 @@ struct ZSTD_matchState_t {
* This behavior is controlled from the cctx ms.
* This parameter has no effect in the cdict ms. */
int prefetchCDictTables;

/* When == 0, lazy match finders insert every position.
* When != 0, lazy match finders only insert positions they search.
* This allows them to skip much faster over incompressible data,
* at a small cost to compression ratio.
*/
int lazySkipping;
};

typedef struct {
79 changes: 63 additions & 16 deletions lib/compress/zstd_lazy.c
@@ -12,6 +12,8 @@
#include "zstd_lazy.h"
#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */

#define kLazySkippingStep 8


/*-*************************************
* Binary Tree search
@@ -618,7 +620,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
ZSTD_matchState_t* ms,
const ZSTD_compressionParameters* const cParams,
const BYTE* ip, U32 const mls)
const BYTE* ip, U32 const mls, U32 const lazySkipping)
{
U32* const hashTable = ms->hashTable;
const U32 hashLog = cParams->hashLog;
@@ -633,6 +635,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
hashTable[h] = idx;
idx++;
/* Stop inserting every position when in the lazy skipping mode. */
if (lazySkipping)
break;
}

ms->nextToUpdate = target;
@@ -641,7 +646,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(

U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
const ZSTD_compressionParameters* const cParams = &ms->cParams;
return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
}

/* inlining is important to hardwire a hot branch (template emulation) */
@@ -685,7 +690,7 @@ size_t ZSTD_HcFindBestMatch(
}

/* HC4 match finder */
matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);

for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
size_t currentMl=0;
@@ -866,7 +871,6 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
U32* const hashTable = ms->hashTable;
BYTE* const tagTable = ms->tagTable;
U32 const hashLog = ms->rowHashLog;
U32 hashSaltEntropyCollected = 0;
const BYTE* const base = ms->window.base;

DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
@@ -881,9 +885,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
row[pos] = updateStartIdx;
hashSaltEntropyCollected = hash;
}
ms->hashSaltEntropy += hashSaltEntropyCollected; /* collect salt entropy */
}

/* ZSTD_row_update_internal():
@@ -1144,6 +1146,7 @@ size_t ZSTD_RowFindBestMatch(
const U64 hashSalt = ms->hashSalt;
U32 nbAttempts = 1U << cappedSearchLog;
size_t ml=4-1;
U32 hash;

/* DMS/DDS variables that may be referenced laster */
const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1177,9 +1180,19 @@ size_t ZSTD_RowFindBestMatch(
}

/* Update the hashTable and tagTable up to (but not including) ip */
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
if (!ms->lazySkipping) {
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
} else {
/* Stop inserting every position when in the lazy skipping mode.
* The hash cache is also not kept up to date in this mode.
*/
hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
ms->nextToUpdate = curr;
}
ms->hashSaltEntropy += hash; /* collect salt entropy */

{ /* Get the hash for ip, compute the appropriate row */
U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
U32* const row = hashTable + relRow;
@@ -1527,10 +1540,11 @@ ZSTD_compressBlock_lazy_generic(
assert(offset_2 <= dictAndPrefixLength);
}

/* Reset the lazy skipping state */
ms->lazySkipping = 0;

if (searchMethod == search_rowHash) {
ZSTD_row_fillHashCache(ms, base, rowLog,
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
ms->nextToUpdate, ilimit);
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
}

/* Match Loop */
@@ -1574,7 +1588,16 @@ ZSTD_compressBlock_lazy_generic(
}

if (matchLength < 4) {
ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */;
ip += step;
/* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
* In this mode we stop inserting every position into our tables, and only insert
* positions that we search, which is one in step positions.
* The exact cutoff is flexible, I've just chosen a number that is reasonably high,
* so we minimize the compression ratio loss in "normal" scenarios. This mode gets
* triggered once we've gone 2KB without finding any matches.
*/
ms->lazySkipping = step > kLazySkippingStep;
continue;
}

@@ -1678,6 +1701,13 @@ ZSTD_compressBlock_lazy_generic(
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
anchor = ip = start + matchLength;
}
if (ms->lazySkipping) {
/* We've found a match, disable lazy skipping mode, and refill the hash cache. */
if (searchMethod == search_rowHash) {
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
}
ms->lazySkipping = 0;
}

/* check immediate repcode */
if (isDxS) {
@@ -1895,12 +1925,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(

DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

/* Reset the lazy skipping state */
ms->lazySkipping = 0;

/* init */
ip += (ip == prefixStart);
if (searchMethod == search_rowHash) {
ZSTD_row_fillHashCache(ms, base, rowLog,
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
ms->nextToUpdate, ilimit);
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
}

/* Match Loop */
@@ -1938,7 +1969,16 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
}

if (matchLength < 4) {
ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
ip += step + 1; /* jump faster over incompressible sections */
/* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
* In this mode we stop inserting every position into our tables, and only insert
* positions that we search, which is one in step positions.
* The exact cutoff is flexible, I've just chosen a number that is reasonably high,
* so we minimize the compression ratio loss in "normal" scenarios. This mode gets
* triggered once we've gone 2KB without finding any matches.
*/
ms->lazySkipping = step > kLazySkippingStep;
continue;
}

@@ -2024,6 +2064,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
anchor = ip = start + matchLength;
}
if (ms->lazySkipping) {
/* We've found a match, disable lazy skipping mode, and refill the hash cache. */
if (searchMethod == search_rowHash) {
ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
}
ms->lazySkipping = 0;
}

/* check immediate repcode */
while (ip <= ilimit) {
