Merge branch 'dev' into opt_investigation
Cyan4973 committed Sep 14, 2021
2 parents b6b2855 + 29f595e commit fd94b9d
Showing 3 changed files with 203 additions and 201 deletions.
19 changes: 8 additions & 11 deletions lib/compress/huf_compress.c
@@ -435,7 +435,7 @@ typedef struct {
 typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
 
 /* Number of buckets available for HUF_sort() */
-#define RANK_POSITION_TABLE_SIZE 128
+#define RANK_POSITION_TABLE_SIZE 192
 
 typedef struct {
   huffNodeTable huffNodeTbl;
@@ -444,18 +444,15 @@ typedef struct {
 
 /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
  * Strategy is to use as many buckets as possible for representing distinct
- * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
- * using log2 bucketing.
+ * counts while using the remainder to represent all "large" counts.
  *
- * To satisfy this requirement for 128 buckets, we can do the following:
- * Let buckets 0-114 represent distinct counts of [0, 114]
- * Let buckets 115 to 126 represent counts of [115, HUF_BLOCKSIZE_MAX]. (the final bucket 127 must remain empty)
- *
- * Note that we don't actually need 17 buckets (assuming 2^17 maxcount) for log2 bucketing since
- * the first few buckets in the log2 bucketing representation are already covered by the distinct count bucketing.
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
  */
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN)
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
 
 /* Return the appropriate bucket index for a given count. See definition of
  * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
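For context, here is a minimal standalone sketch of the bucketing strategy described in the comment above: counts below RANK_POSITION_DISTINCT_COUNT_CUTOFF each get their own bucket, while larger counts share log2 buckets starting at RANK_POSITION_LOG_BUCKETS_BEGIN. The helper names (highbit32_sketch, bucket_index_sketch) are illustrative assumptions, not the code in this commit.

    #include <stdint.h>

    /* Position of the highest set bit; stand-in for zstd's BIT_highbit32(). */
    static uint32_t highbit32_sketch(uint32_t v)
    {
        uint32_t r = 0;
        while (v >>= 1) r++;
        return r;
    }

    /* Map a symbol count to a HUF_sort() bucket: small counts get a distinct
     * bucket each, larger counts share one bucket per power of two. */
    static uint32_t bucket_index_sketch(uint32_t count)
    {
        uint32_t const logBucketsBegin = (192 - 1) - 32 - 1;   /* RANK_POSITION_LOG_BUCKETS_BEGIN */
        uint32_t const distinctCutoff  = logBucketsBegin + highbit32_sketch(logBucketsBegin);
        return (count < distinctCutoff) ? count
                                        : highbit32_sketch(count) + logBucketsBegin;
    }

Under this sketch, a count of 42 lands in bucket 42, while any count from 512 to 1023 lands in bucket 158 + 9 = 167, so every bucket index stays below RANK_POSITION_TABLE_SIZE (192).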
9 changes: 7 additions & 2 deletions tests/playTests.sh
@@ -936,8 +936,13 @@ cat tmp | zstd -14 -f --size-hint=5500 | zstd -t # considerably too low
 
 
 println "\n===> dictionary tests "
-
-println "- test with raw dict (content only) "
+println "- Test high/low compressibility corpus training"
+datagen -g12M -P90 > tmpCorpusHighCompress
+datagen -g12M -P5 > tmpCorpusLowCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress
+zstd --train -B2K tmpCorpusLowCompress -o tmpDictLowCompress
+rm -f tmpCorpusHighCompress tmpCorpusLowCompress tmpDictHighCompress tmpDictLowCompress
+println "- Test with raw dict (content only) "
 datagen > tmpDict
 datagen -g1M | $MD5SUM > tmp1
 datagen -g1M | zstd -D tmpDict | zstd -D tmpDict -dvq | $MD5SUM > tmp2
