Skip to content

Commit

Permalink
ICU-21710 Additional clean up after removing BOYER_MOORE code from us…
Browse files Browse the repository at this point in the history
…earch.cpp

Changes:
- We can completely remove the shift tables and related fields from
  data structs.

- Creation of UStringSearch objects should be faster now,
  as it doesn't waste time computing the unused shift tables.

- The sizeof(UStringSearch) is decreased from 5240 to 3192, so
  this should help to reduce memory for applications that create many string search objects.

Note regarding the comments on initialize(). It actually does not set illegal argument error if pattern is all ignoreables. Added a test case for it.
  • Loading branch information
jefgen committed Aug 27, 2021
1 parent edc62cf commit b05f3b7
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 133 deletions.
138 changes: 12 additions & 126 deletions icu4c/source/i18n/usearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,25 +72,6 @@ inline uint32_t getMask(UCollationStrength strength)
}
}

/**
* @param ce 32-bit collation element
* @return hash code
*/
static
inline int hashFromCE32(uint32_t ce)
{
int hc = (int)(
((((((ce >> 24) * 37) +
(ce >> 16)) * 37) +
(ce >> 8)) * 37) +
ce);
hc %= MAX_TABLE_SIZE_;
if (hc < 0) {
hc += MAX_TABLE_SIZE_;
}
return hc;
}

U_CDECL_BEGIN
static UBool U_CALLCONV
usearch_cleanup(void) {
Expand Down Expand Up @@ -282,11 +263,9 @@ inline int64_t * addTouint64_tArray(int64_t *destination,
* @param strsrch string search data
* @param status output error if any, caller to check status before calling
* method, status assumed to be success when passed in.
* @return total number of expansions
*/
static
inline uint16_t initializePatternCETable(UStringSearch *strsrch,
UErrorCode *status)
inline void initializePatternCETable(UStringSearch *strsrch, UErrorCode *status)
{
UPattern *pattern = &(strsrch->pattern);
uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
Expand All @@ -306,15 +285,14 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch,
ucol_setText(coleiter, pattern->text, pattern->textLength, status);
}
if(U_FAILURE(*status)) {
return 0;
return;
}

if (pattern->ces != cetable && pattern->ces) {
uprv_free(pattern->ces);
}

uint32_t offset = 0;
uint16_t result = 0;
int32_t ce;

while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
Expand All @@ -326,22 +304,19 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch,
patternlength - ucol_getOffset(coleiter) + 1,
status);
if (U_FAILURE(*status)) {
return 0;
return;
}
offset ++;
if (cetable != temp && cetable != pattern->cesBuffer) {
uprv_free(cetable);
}
cetable = temp;
}
result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
}

cetable[offset] = 0;
pattern->ces = cetable;
pattern->cesLength = offset;

return result;
}

/**
Expand All @@ -354,10 +329,9 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch,
* @param strsrch string search data
* @param status output error if any, caller to check status before calling
* method, status assumed to be success when passed in.
* @return total number of expansions
*/
static
inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
inline void initializePatternPCETable(UStringSearch *strsrch,
UErrorCode *status)
{
UPattern *pattern = &(strsrch->pattern);
Expand All @@ -377,15 +351,14 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
ucol_setText(coleiter, pattern->text, pattern->textLength, status);
}
if(U_FAILURE(*status)) {
return 0;
return;
}

if (pattern->pces != pcetable && pattern->pces != NULL) {
uprv_free(pattern->pces);
}

uint32_t offset = 0;
uint16_t result = 0;
int64_t pce;

icu::UCollationPCE iter(coleiter);
Expand All @@ -401,7 +374,7 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
status);

if (U_FAILURE(*status)) {
return 0;
return;
}

offset += 1;
Expand All @@ -411,28 +384,24 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
}

pcetable = temp;
//result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
}

pcetable[offset] = 0;
pattern->pces = pcetable;
pattern->pcesLength = offset;

return result;
}

/**
* Initializes the pattern struct.
* Internal method, status assumed to be success.
* @param strsrch UStringSearch data storage
* @param status output error if any, caller to check status before calling
* method, status assumed to be success when passed in.
* @return expansionsize the total expansion size of the pattern
*/
static
inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
inline void initializePattern(UStringSearch *strsrch, UErrorCode *status)
{
if (U_FAILURE(*status)) { return 0; }
if (U_FAILURE(*status)) { return; }

UPattern *pattern = &(strsrch->pattern);
const UChar *patterntext = pattern->text;
int32_t length = pattern->textLength;
Expand Down Expand Up @@ -460,102 +429,19 @@ inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
strsrch->pattern.pces = NULL;
}

// since intializePattern is an internal method status is a success.
return initializePatternCETable(strsrch, status);
initializePatternCETable(strsrch, status);
}

/**
* Initializing shift tables, with the default values.
* If a corresponding default value is 0, the shift table is not set.
* @param shift table for forwards shift
* @param backshift table for backwards shift
* @param cetable table containing pattern ce
* @param cesize size of the pattern ces
* @param expansionsize total size of the expansions
* @param defaultforward the default forward value
* @param defaultbackward the default backward value
*/
static
inline void setShiftTable(int16_t shift[], int16_t backshift[],
int32_t *cetable, int32_t cesize,
int16_t expansionsize,
int16_t defaultforward,
int16_t defaultbackward)
{
// estimate the value to shift. to do that we estimate the smallest
// number of characters to give the relevant ces, ie approximately
// the number of ces minus their expansion, since expansions can come
// from a character.
int32_t count;
for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
shift[count] = defaultforward;
}
cesize --; // down to the last index
for (count = 0; count < cesize; count ++) {
// number of ces from right of array to the count
int temp = defaultforward - count - 1;
shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1;
}
shift[hashFromCE32(cetable[cesize])] = 1;
// for ignorables we just shift by one. see test examples.
shift[hashFromCE32(0)] = 1;

for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
backshift[count] = defaultbackward;
}
for (count = cesize; count > 0; count --) {
// the original value count does not seem to work
backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
(int16_t)(count - expansionsize) : 1;
}
backshift[hashFromCE32(cetable[0])] = 1;
backshift[hashFromCE32(0)] = 1;
}

/**
* Building of the pattern collation element list and the boyer moore strsrch
* table.
* The canonical match will only be performed after the default match fails.
* For both cases we need to remember the size of the composed and decomposed
* versions of the string. Since the Boyer-Moore shift calculations shifts by
* a number of characters in the text and tries to match the pattern from that
* offset, the shift value can not be too large in case we miss some
* characters. To choose a right shift size, we estimate the NFC form of the
* and use its size as a shift guide. The NFC form should be the small
* possible representation of the pattern. Anyways, we'll err on the smaller
* shift size. Hence the calculation for minlength.
* Canonical match will be performed slightly differently. We'll split the
* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
* the first and last base character (MS), the ending accents (EA). Matches
* will be done on MS first, and only when we match MS then some processing
* will be required for the prefix and end accents in order to determine if
* they match PA and EA. Hence the default shift values
* for the canonical match will take the size of either end's accent into
* consideration. Forwards search will take the end accents into consideration
* for the default shift values and the backwards search will take the prefix
* accents into consideration.
* If pattern has no non-ignorable ce, we return a illegal argument error.
* Internal method, status assumed to be success.
* Initializes the pattern struct and builds the pattern collation element table.
* @param strsrch UStringSearch data storage
* @param status for output errors if it occurs, status is assumed to be a
* success when it is passed in.
*/
static
inline void initialize(UStringSearch *strsrch, UErrorCode *status)
{
int16_t expandlength = initializePattern(strsrch, status);
if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
UPattern *pattern = &strsrch->pattern;
int32_t cesize = pattern->cesLength;

int16_t minlength = cesize > expandlength
? (int16_t)cesize - expandlength : 1;
pattern->defaultShiftSize = minlength;
setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
cesize, expandlength, minlength, minlength);
return;
}
strsrch->pattern.defaultShiftSize = 0;
initializePattern(strsrch, status);
}

/**
Expand Down
6 changes: 0 additions & 6 deletions icu4c/source/i18n/usrchimp.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ class UCollationPCE : public UMemory {
U_NAMESPACE_END

#define INITIAL_ARRAY_SIZE_ 256
#define MAX_TABLE_SIZE_ 257

struct USearch {
// required since collation element iterator does not have a getText API
Expand Down Expand Up @@ -160,9 +159,6 @@ struct UPattern {
int64_t pcesBuffer[INITIAL_ARRAY_SIZE_];
UBool hasPrefixAccents;
UBool hasSuffixAccents;
int16_t defaultShiftSize;
int16_t shift[MAX_TABLE_SIZE_];
int16_t backShift[MAX_TABLE_SIZE_];
};

struct UStringSearch {
Expand All @@ -182,8 +178,6 @@ struct UStringSearch {
uint32_t ceMask;
uint32_t variableTop;
UBool toShift;
UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
};

/**
Expand Down
11 changes: 10 additions & 1 deletion icu4c/source/test/cintltst/usrchtst.c
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ static void TestInitialization(void)
{
UErrorCode status = U_ZERO_ERROR;
UChar pattern[512];
const UChar text[] = {0x61, 0x62, 0x63, 0x64, 0x65, 0x66};
const UChar text[] = {0x61, 0x62, 0x63, 0x64, 0x65, 0x66}; // abcdef
int32_t i = 0;
UStringSearch *result;

Expand Down Expand Up @@ -332,6 +332,15 @@ static void TestInitialization(void)
log_err("Error opening search %s\n", u_errorName(status));
}
usearch_close(result);

/* testing that a pattern with all ignoreables doesn't fail initialization with an error */
UChar patternIgnoreables[] = {0x200b};
result = usearch_openFromCollator(patternIgnoreables, 1, text, 3, EN_US_, NULL, &status);
if (U_FAILURE(status)) {
log_err("Error opening search %s\n", u_errorName(status));
}
usearch_close(result);

close();
}

Expand Down

0 comments on commit b05f3b7

Please sign in to comment.