Skip to content

Commit

Permalink
[lazy] Use switch instead of indirect function calls.
Browse files Browse the repository at this point in the history
Use a switch statement to select the search function instead of an
indirect function call. This results in a sizable performance win.

This PR is a modification of the approach taken in PR #2828.
When I measured performance for that commit, it was neutral.
However, I now see a performance regression on gcc, while clang is
still neutral. I'm measuring on the same platform, but with
newer compilers. The new approach beats both the current dev
branch and the baseline from before PR #2828 was merged.

This PR is necessary for Issue #3275, to update zstd in the kernel.
Without this PR there is a large regression in compression speed for
the greedy through btlazy2 strategies. With this PR it is about neutral.

gcc version: 12.2.0
clang version: 14.0.6
dataset: silesia.tar

| Compiler | Level | Dev Speed (MB/s) | PR Speed (MB/s) | Delta  |
|----------|-------|------------------|-----------------|--------|
| gcc      |     5 |            102.6 |           113.7 | +10.8% |
| gcc      |     7 |             66.6 |            74.8 | +12.3% |
| gcc      |     9 |             51.5 |            58.9 | +14.3% |
| gcc      |    13 |             14.3 |            14.3 |  +0.0% |
| clang    |     5 |            108.1 |           114.8 |  +6.2% |
| clang    |     7 |             68.5 |            72.3 |  +5.5% |
| clang    |     9 |             53.2 |            56.2 |  +5.6% |
| clang    |    13 |             14.3 |            14.7 |  +2.8% |

The binary size stays just about the same for clang and gcc, measured
using the `size` command:

| Compiler | Branch | Text    | Data | BSS | Total   |
|----------|--------|---------|------|-----|---------|
| gcc      | dev    | 1127950 | 3312 | 280 | 1131542 |
| gcc      | PR     | 1123422 | 2512 | 280 | 1126214 |
| clang    | dev    | 1046254 | 3256 | 216 | 1049726 |
| clang    | PR     | 1048198 | 2296 | 216 | 1050710 |
  • Loading branch information
terrelln committed Oct 21, 2022
1 parent 5c1cdba commit 3032050
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 111 deletions.
6 changes: 6 additions & 0 deletions lib/common/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,12 @@
#define UNLIKELY(x) (x)
#endif

/**
 * ZSTD_UNREACHABLE marks a point that control flow must never reach.
 * It trips assert(0) when assertions are enabled and, where the compiler
 * provides __builtin_unreachable(), additionally tells the optimizer that
 * the path is dead.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: `ZSTD_UNREACHABLE;` can then be used anywhere a statement
 * is allowed, including as the body of an unbraced if/else.
 */
#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0)
#else
# define ZSTD_UNREACHABLE do { assert(0); } while (0)
#endif

/* disable warnings */
#ifdef _MSC_VER /* Visual Studio */
# include <intrin.h> /* For Visual 2005 */
Expand Down
229 changes: 118 additions & 111 deletions lib/compress/zstd_lazy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1317,14 +1317,10 @@ size_t ZSTD_RowFindBestMatch(
}


typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);

/**
* This struct contains the functions necessary for lazy to search.
* Currently, that is only searchMax. However, it is still valuable to have the
* VTable because this makes it easier to add more functions to the VTable later.
* Generate search functions templated on (dictMode, mls, rowLog).
* These functions are outlined for code size & compilation time.
* ZSTD_searchMax() dispatches to the correct implementation function.
*
* TODO: The start of the search function involves loading and calculating a
* bunch of constants from the ZSTD_matchState_t. These computations could be
Expand All @@ -1342,38 +1338,35 @@ typedef size_t (*searchMax_f)(
* the single segment loop. It should go in searchMax instead of its own
* function to avoid having multiple virtual function calls per search.
*/
typedef struct {
searchMax_f searchMax;
} ZSTD_LazyVTable;

#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offBasePtr) \
{ \
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode);\
} \
static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
ZSTD_BtFindBestMatch_##dictMode##_##mls \
};
/* Name builders: paste the template parameters onto the base search
 * function name to form the identifier of one specialized search function,
 * e.g. ZSTD_BT_SEARCH_FN(noDict, 4) -> ZSTD_BtFindBestMatch_noDict_4. */
#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog

/* Attributes applied to every generated search function. They are kept
 * out of line (FORCE_NOINLINE) rather than inlined into each caller. */
#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE

/* Defines one binary-tree search function specialized on (dictMode, mls).
 * The generated function is named by ZSTD_BT_SEARCH_FN(dictMode, mls),
 * asserts that the match state's minMatch (clamped to [4, 6]) equals the
 * mls it was generated for, and forwards to ZSTD_BtFindBestMatch() with
 * mls and ZSTD_<dictMode> passed as compile-time constants. */
#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offBasePtr) \
{ \
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
} \

#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offsetPtr) \
{ \
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
} \
static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
ZSTD_HcFindBestMatch_##dictMode##_##mls \
};

#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \
ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offsetPtr) \
Expand All @@ -1382,9 +1375,6 @@ typedef struct {
assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
} \
static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
};

#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
X(dictMode, mls, 4) \
Expand All @@ -1407,84 +1397,103 @@ typedef struct {
X(__VA_ARGS__, dictMatchState) \
X(__VA_ARGS__, dedicatedDictSearch)

/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)

#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
{ \
&ZSTD_BtVTable_##dictMode##_4, \
&ZSTD_BtVTable_##dictMode##_5, \
&ZSTD_BtVTable_##dictMode##_6 \
}

#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
{ \
&ZSTD_HcVTable_##dictMode##_4, \
&ZSTD_HcVTable_##dictMode##_5, \
&ZSTD_HcVTable_##dictMode##_6 \
}

#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
{ \
&ZSTD_RowVTable_##dictMode##_##mls##_4, \
&ZSTD_RowVTable_##dictMode##_##mls##_5, \
&ZSTD_RowVTable_##dictMode##_##mls##_6 \
}
/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
/* Generate binary tree search fns for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
/* Generate hash chain search fns for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)

#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
{ \
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
}
/* Identifies which family of match-search functions to dispatch to:
 * hash chain, binary tree, or row hash. */
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

#define GEN_ZSTD_VTABLE_ARRAY(X) \
{ \
X(noDict), \
X(extDict), \
X(dictMatchState), \
X(dedicatedDictSearch) \
/* Each GEN_ZSTD_CALL_*_SEARCH_FN macro expands to a single `case` label
 * that returns the result of the matching specialized search function.
 * They must be expanded inside a switch, in a scope where `ms`, `ip`,
 * `iend` and `offsetPtr` are defined. */

/* Binary tree: `case mls:` for a switch over mls. */
#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \
case mls: \
return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
/* Hash chain: `case mls:` for a switch over mls. */
#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \
case mls: \
return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
/* Row hash: `case rowLog:` for a switch over rowLog (mls is already fixed). */
#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \
case rowLog: \
return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);

/* Expands to a switch over the in-scope variable `mls`, with the case
 * bodies produced by applying X to each supported mls value through
 * ZSTD_FOR_EACH_MLS. X must expand to `case` labels. */
#define ZSTD_SWITCH_MLS(X, dictMode) \
switch (mls) { \
ZSTD_FOR_EACH_MLS(X, dictMode) \
}

/* *******************************
* Common parser - lazy strategy
*********************************/
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
/* Row-hash variant of a `case mls:` entry, meant to be passed as X to
 * ZSTD_SWITCH_MLS. It opens a nested switch over the in-scope `rowLog`
 * whose cases each return from the enclosing function. Falling out of
 * the nested switch means rowLog matched none of the generated cases,
 * so that point is marked ZSTD_UNREACHABLE; the trailing break keeps the
 * outer case from falling through. */
#define ZSTD_SWITCH_ROWLOG(dictMode, mls) \
case mls: \
switch (rowLog) { \
ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
} \
ZSTD_UNREACHABLE; \
break;

/* Top-level dispatch for a fixed dictMode: switches over the in-scope
 * `searchMethod` and, for each method, expands the matching mls switch
 * (plus a nested rowLog switch for search_rowHash). Every generated case
 * returns from the enclosing function, so reaching the end of the outer
 * switch means no (searchMethod, mls, rowLog) combination matched and is
 * marked ZSTD_UNREACHABLE. Requires `searchMethod`, `mls`, `rowLog`, `ms`,
 * `ip`, `iend` and `offsetPtr` to be in scope at the expansion site. */
#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \
switch (searchMethod) { \
case search_hashChain: \
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
break; \
case search_binaryTree: \
ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
break; \
case search_rowHash: \
ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \
break; \
} \
ZSTD_UNREACHABLE;

/**
* This table is indexed first by the four ZSTD_dictMode_e values, and then
* by the two searchMethod_e values. NULLs are placed for configurations
* that should never occur (extDict modes go to the other implementation
* below and there is no DDSS for binary tree search yet).
* Searches for the longest match at @p ip.
* Dispatches to the correct implementation function based on the
* (searchMethod, dictMode, mls, rowLog). We use switch statements
* here instead of using an indirect function call through a function
* pointer because after Spectre and Meltdown mitigations, indirect
* function calls can be very costly, especially in the kernel.
*
* NOTE: dictMode and searchMethod should be templated, so those switch
* statements should be optimized out. Only the mls & rowLog switches
* should be left.
*
* @param ms The match state.
* @param ip The position to search at.
* @param iend The end of the input data.
* @param[out] offsetPtr Stores the match offset into this pointer.
* @param mls The minimum search length, in the range [4, 6].
* @param rowLog The row log (if applicable), in the range [4, 6].
* @param searchMethod The search method to use (templated).
* @param dictMode The dictMode (templated).
*
* @returns The length of the longest match found, or < mls if no match is found.
* If a match is found its offset is stored in @p offsetPtr.
*/

static ZSTD_LazyVTable const*
ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
ZSTD_matchState_t* ms,
const BYTE* ip,
const BYTE* iend,
size_t* offsetPtr,
U32 const mls,
U32 const rowLog,
searchMethod_e const searchMethod,
ZSTD_dictMode_e const dictMode)
{
/* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
/* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);

U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
switch (searchMethod) {
case search_hashChain:
return hcVTables[dictMode][mls - 4];
case search_binaryTree:
return btVTables[dictMode][mls - 4];
case search_rowHash:
return rowVTables[dictMode][mls - 4][rowLog - 4];
default:
return NULL;
if (dictMode == ZSTD_noDict) {
ZSTD_SWITCH_SEARCH_METHOD(noDict)
} else if (dictMode == ZSTD_extDict) {
ZSTD_SWITCH_SEARCH_METHOD(extDict)
} else if (dictMode == ZSTD_dictMatchState) {
ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
} else if (dictMode == ZSTD_dedicatedDictSearch) {
ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
}
ZSTD_UNREACHABLE;
return 0;
}

/* *******************************
* Common parser - lazy strategy
*********************************/

FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore,
Expand All @@ -1501,8 +1510,9 @@ ZSTD_compressBlock_lazy_generic(
const BYTE* const base = ms->window.base;
const U32 prefixLowestIndex = ms->window.dictLimit;
const BYTE* const prefixLowest = base + prefixLowestIndex;
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
U32 offset_1 = rep[0], offset_2 = rep[1];
U32 offsetSaved1 = 0, offsetSaved2 = 0;

Expand All @@ -1519,8 +1529,6 @@ ZSTD_compressBlock_lazy_generic(
0;
const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

assert(searchMax != NULL);

DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
ip += (dictAndPrefixLength == 0);
if (dictMode == ZSTD_noDict) {
Expand All @@ -1538,7 +1546,6 @@ ZSTD_compressBlock_lazy_generic(
}

if (searchMethod == search_rowHash) {
const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
ZSTD_row_fillHashCache(ms, base, rowLog,
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
ms->nextToUpdate, ilimit);
Expand Down Expand Up @@ -1579,7 +1586,7 @@ ZSTD_compressBlock_lazy_generic(

/* first search (depth 0) */
{ size_t offbaseFound = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offbaseFound);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
if (ml2 > matchLength)
matchLength = ml2, start = ip, offBase = offbaseFound;
}
Expand Down Expand Up @@ -1618,7 +1625,7 @@ ZSTD_compressBlock_lazy_generic(
}
}
{ size_t ofbCandidate=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
if ((ml2 >= 4) && (gain2 > gain1)) {
Expand Down Expand Up @@ -1654,7 +1661,7 @@ ZSTD_compressBlock_lazy_generic(
}
}
{ size_t ofbCandidate=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
if ((ml2 >= 4) && (gain2 > gain1)) {
Expand Down Expand Up @@ -1899,9 +1906,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const BYTE* const dictEnd = dictBase + dictLimit;
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
const U32 windowLog = ms->cParams.windowLog;
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);

searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
U32 offset_1 = rep[0], offset_2 = rep[1];

DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
Expand Down Expand Up @@ -1943,7 +1950,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(

/* first search (depth 0) */
{ size_t ofbCandidate = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
if (ml2 > matchLength)
matchLength = ml2, start = ip, offBase = ofbCandidate;
}
Expand Down Expand Up @@ -1978,7 +1985,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(

/* search match, depth 1 */
{ size_t ofbCandidate = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
if ((ml2 >= 4) && (gain2 > gain1)) {
Expand Down Expand Up @@ -2010,7 +2017,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(

/* search match, depth 2 */
{ size_t ofbCandidate = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */
int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
if ((ml2 >= 4) && (gain2 > gain1)) {
Expand Down

0 comments on commit 3032050

Please sign in to comment.