Skip to content

Commit

Permalink
External matchfinder API (#3333)
Browse files Browse the repository at this point in the history
* First building commit with sample matchfinder

* Set up ZSTD_externalMatchCtx struct

* move seqBuffer to ZSTD_Sequence*

* support non-contiguous dictionary

* clean up parens

* add clearExternalMatchfinder, handle allocation errors

* Add useExternalMatchfinder cParam

* validate useExternalMatchfinder cParam

* Disable LDM + external matchfinder

* Check for static CCtx

* Validate mState and mStateDestructor

* Improve LDM check to cover both branches

* Error API with optional fallback

* handle RLE properly for external matchfinder

* nit

* Move to a CDict-like model for resource ownership

* Add hidden useExternalMatchfinder bool to CCtx_params_s

* Eliminate malloc, move to cwksp allocation

* Handle CCtx reset properly

* Ensure seqStore has enough space for external sequences

* fix capitalization

* Add DEBUGLOG statements

* Add compressionLevel param to matchfinder API

* fix c99 issues and add a param combination error code

* nits

* Test external matchfinder API

* C90 compat for simpleExternalMatchFinder

* Fix some @nocommits and an ASAN bug

* nit

* nit

* nits

* forward declare copySequencesToSeqStore functions in zstd_compress_internal.h

* nit

* nit

* nits

* Update copyright headers

* Fix CMake zstreamtest build

* Fix copyright headers (again)

* typo

* Add externalMatchfinder demo program to make contrib

* Reduce memory consumption for small blockSize

* ZSTD_postProcessExternalMatchFinderResult nits

* test sum(matchlen) + sum(litlen) == srcSize in debug builds

* refExternalMatchFinder -> registerExternalMatchFinder

* C90 nit

* zstreamtest nits

* contrib nits

* contrib nits

* allow block splitter + external matchfinder, refactor

* add windowSize param

* add contrib/externalMatchfinder/README.md

* docs

* go back to old RLE heuristic because of the first block issue

* fix initializer element is not a constant expression

* ref contrib from zstd.h

* extremely pedantic compiler warning fix, meson fix, typo fix

* Additional docs on API limitations

* minor nits

* Refactor maxNbSeq calculation into a helper function

* Fix copyright
  • Loading branch information
embg authored Dec 28, 2022
1 parent 90597d7 commit 2a40262
Show file tree
Hide file tree
Showing 18 changed files with 929 additions and 40 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ contrib: lib
$(MAKE) -C contrib/seekable_format/examples all
$(MAKE) -C contrib/seekable_format/tests test
$(MAKE) -C contrib/largeNbDicts all
$(MAKE) -C contrib/externalMatchfinder all
cd build/single_file_libs/ ; ./build_decoder_test.sh
cd build/single_file_libs/ ; ./build_library_test.sh

Expand All @@ -142,6 +143,7 @@ clean:
$(Q)$(MAKE) -C contrib/seekable_format/examples $@ > $(VOID)
$(Q)$(MAKE) -C contrib/seekable_format/tests $@ > $(VOID)
$(Q)$(MAKE) -C contrib/largeNbDicts $@ > $(VOID)
$(Q)$(MAKE) -C contrib/externalMatchfinder $@ > $(VOID)
$(Q)$(RM) zstd$(EXT) zstdmt$(EXT) tmp*
$(Q)$(RM) -r lz4
@echo Cleaning completed
Expand Down
2 changes: 1 addition & 1 deletion build/cmake/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ add_test(NAME fuzzer COMMAND fuzzer ${ZSTD_FUZZER_FLAGS})
#
# zstreamtest
#
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c)
add_executable(zstreamtest ${PROGRAMS_DIR}/datagen.c ${PROGRAMS_DIR}/util.c ${PROGRAMS_DIR}/timefn.c ${TESTS_DIR}/seqgen.c ${TESTS_DIR}/zstreamtest.c ${TESTS_DIR}/external_matchfinder.c)
if (NOT MSVC)
target_compile_options(zstreamtest PRIVATE "-Wno-deprecated-declarations")
endif()
Expand Down
6 changes: 4 additions & 2 deletions build/meson/tests/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ fuzzer = executable('fuzzer',
dependencies: [ testcommon_dep, thread_dep ],
install: false)

zstreamtest_sources = [join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c')]
zstreamtest_sources = [
join_paths(zstd_rootdir, 'tests/seqgen.c'),
join_paths(zstd_rootdir, 'tests/zstreamtest.c'),
join_paths(zstd_rootdir, 'tests/external_matchfinder.c')]
zstreamtest = executable('zstreamtest',
zstreamtest_sources,
include_directories: test_includes,
Expand Down
2 changes: 2 additions & 0 deletions contrib/externalMatchfinder/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# build artifacts
externalMatchfinder
40 changes: 40 additions & 0 deletions contrib/externalMatchfinder/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# ################################################################
# Copyright (c) Yann Collet, Meta Platforms, Inc.
# All rights reserved.
#
# This source code is licensed under both the BSD-style license (found in the
# LICENSE file in the root directory of this source tree) and the GPLv2 (found
# in the COPYING file in the root directory of this source tree).
# ################################################################

PROGDIR = ../../programs
LIBDIR = ../../lib

LIBZSTD = $(LIBDIR)/libzstd.a

CPPFLAGS+= -I$(LIBDIR) -I$(LIBDIR)/compress -I$(LIBDIR)/common

CFLAGS ?= -O3
CFLAGS += -std=gnu99
DEBUGFLAGS= -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow \
-Wstrict-aliasing=1 -Wswitch-enum \
-Wstrict-prototypes -Wundef -Wpointer-arith \
-Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings \
-Wredundant-decls
CFLAGS += $(DEBUGFLAGS) $(MOREFLAGS)

default: externalMatchfinder

all: externalMatchfinder

externalMatchfinder: matchfinder.c main.c $(LIBZSTD)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@

.PHONY: $(LIBZSTD)
$(LIBZSTD):
$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"

clean:
$(RM) *.o
$(MAKE) -C $(LIBDIR) clean > /dev/null
$(RM) externalMatchfinder
14 changes: 14 additions & 0 deletions contrib/externalMatchfinder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
externalMatchfinder
=====================

`externalMatchfinder` is a test tool for the external matchfinder API.
It demonstrates how to use the API to perform a simple round-trip test.

A sample matchfinder is provided in matchfinder.c, but the user can swap
this out with a different one if desired. The sample matchfinder implements
LZ compression with a 1KB hashtable. Dictionary compression is not currently supported.

Command line :
```
externalMatchfinder filename
```
107 changes: 107 additions & 0 deletions contrib/externalMatchfinder/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"
#include "zstd_errors.h"
#include "matchfinder.h" // simpleExternalMatchFinder

#define CHECK(res) \
do { \
if (ZSTD_isError(res)) { \
printf("ERROR: %s\n", ZSTD_getErrorName(res)); \
return 1; \
} \
} while (0) \

int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: exampleMatchfinder <file>\n");
return 1;
}

ZSTD_CCtx* const zc = ZSTD_createCCtx();

int simpleExternalMatchState = 0xdeadbeef;

// Here is the crucial bit of code!
ZSTD_registerExternalMatchFinder(
zc,
&simpleExternalMatchState,
simpleExternalMatchFinder
);

{
size_t const res = ZSTD_CCtx_setParameter(zc, ZSTD_c_enableMatchFinderFallback, 1);
CHECK(res);
}

FILE *f = fopen(argv[1], "rb");
assert(f);
{
int const ret = fseek(f, 0, SEEK_END);
assert(ret == 0);
}
size_t const srcSize = ftell(f);
{
int const ret = fseek(f, 0, SEEK_SET);
assert(ret == 0);
}

char* const src = malloc(srcSize + 1);
assert(src);
{
size_t const ret = fread(src, srcSize, 1, f);
assert(ret == 1);
int const ret2 = fclose(f);
assert(ret2 == 0);
}

size_t const dstSize = ZSTD_compressBound(srcSize);
char* const dst = malloc(dstSize);
assert(dst);

size_t const cSize = ZSTD_compress2(zc, dst, dstSize, src, srcSize);
CHECK(cSize);

char* const val = malloc(srcSize);
assert(val);

{
size_t const res = ZSTD_decompress(val, srcSize, dst, cSize);
CHECK(res);
}

if (memcmp(src, val, srcSize) == 0) {
printf("Compression and decompression were successful!\n");
printf("Original size: %lu\n", srcSize);
printf("Compressed size: %lu\n", cSize);
} else {
printf("ERROR: input and validation buffers don't match!\n");
for (size_t i = 0; i < srcSize; i++) {
if (src[i] != val[i]) {
printf("First bad index: %zu\n", i);
break;
}
}
return 1;
}

ZSTD_freeCCtx(zc);
free(src);
free(dst);
free(val);
return 0;
}
80 changes: 80 additions & 0 deletions contrib/externalMatchfinder/matchfinder.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#include "zstd_compress_internal.h"
#include "matchfinder.h"

#define HSIZE 1024
static U32 const HLOG = 10;
static U32 const MLS = 4;
static U32 const BADIDX = 0xffffffff;

size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
) {
const BYTE* const istart = (const BYTE*)src;
const BYTE* const iend = istart + srcSize;
const BYTE* ip = istart;
const BYTE* anchor = istart;
size_t seqCount = 0;
U32 hashTable[HSIZE];

(void)externalMatchState;
(void)dict;
(void)dictSize;
(void)outSeqsCapacity;
(void)compressionLevel;

{ int i;
for (i=0; i < HSIZE; i++) {
hashTable[i] = BADIDX;
} }

while (ip + MLS < iend) {
size_t const hash = ZSTD_hashPtr(ip, HLOG, MLS);
U32 const matchIndex = hashTable[hash];
hashTable[hash] = (U32)(ip - istart);

if (matchIndex != BADIDX) {
const BYTE* const match = istart + matchIndex;
U32 const matchLen = (U32)ZSTD_count(ip, match, iend);
if (matchLen >= ZSTD_MINMATCH_MIN) {
U32 const litLen = (U32)(ip - anchor);
U32 const offset = (U32)(ip - match);
ZSTD_Sequence const seq = {
offset, litLen, matchLen, 0
};

/* Note: it's crucial to stay within the window size! */
if (offset <= windowSize) {
outSeqs[seqCount++] = seq;
ip += matchLen;
anchor = ip;
continue;
}
}
}

ip++;
}

{ ZSTD_Sequence const finalSeq = {
0, (U32)(iend - anchor), 0, 0
};
outSeqs[seqCount++] = finalSeq;
}

return seqCount;
}
26 changes: 26 additions & 0 deletions contrib/externalMatchfinder/matchfinder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
* Copyright (c) Yann Collet, Meta Platforms, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
* in the COPYING file in the root directory of this source tree).
* You may select, at your option, one of the above-listed licenses.
*/

#ifndef MATCHFINDER_H
#define MATCHFINDER_H

#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"

size_t simpleExternalMatchFinder(
void* externalMatchState,
ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
const void* src, size_t srcSize,
const void* dict, size_t dictSize,
int compressionLevel,
size_t windowSize
);

#endif
2 changes: 2 additions & 0 deletions lib/common/error_private.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
case PREFIX(parameter_unsupported): return "Unsupported parameter";
case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
case PREFIX(init_missing): return "Context should be init first";
case PREFIX(memory_allocation): return "Allocation error : not enough memory";
Expand All @@ -51,6 +52,7 @@ const char* ERR_getErrorString(ERR_enum code)
case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
case PREFIX(externalMatchFinder_failed): return "External matchfinder returned an error code";
case PREFIX(maxCode):
default: return notErrorCode;
}
Expand Down
Loading

0 comments on commit 2a40262

Please sign in to comment.