diff --git a/docs/userguide/icu4j/faq.md b/docs/userguide/icu4j/faq.md index 81e8e30014d6..e9b086ed32a2 100644 --- a/docs/userguide/icu4j/faq.md +++ b/docs/userguide/icu4j/faq.md @@ -195,7 +195,9 @@ determine whether case and accents are ignored during a search. #### What algorithm are you using to perform the search? -StringSearch uses a version of the Boyer-Moore search algorithm that has been +As of ICU 4.0, StringSearch uses a simple linear search algorithm which +locates a match by shifting a cursor in the target text one by one. Previous +versions of ICU used a version of the Boyer-Moore search algorithm which was modified for use with Unicode. Rather than using raw Unicode character values in its comparisons and shift tables, the algorithm uses collation elements that have been "hashed" down to a smaller range to make the tables a reasonable size. diff --git a/icu4c/source/i18n/unicode/usearch.h b/icu4c/source/i18n/unicode/usearch.h index 65747cb1ed45..04a5aee8f88a 100644 --- a/icu4c/source/i18n/unicode/usearch.h +++ b/icu4c/source/i18n/unicode/usearch.h @@ -35,8 +35,9 @@ * See the * "ICU Collation Design Document" for more information. *

- * The implementation may use a linear search or a modified form of the Boyer-Moore - * search; for more information on the latter see + * As of ICU 4.0, the implementation uses a linear search. In previous versions, + * a modified form of the Boyer-Moore searching algorithm was used. For more information + * on the modified Boyer-Moore algorithm see * * "Efficient Text Searching in Java", published in Java Report * in February, 1999. @@ -595,8 +596,8 @@ U_CAPI UCollator * U_EXPORT2 usearch_getCollator( /** * Sets the collator used for the language rules. User retains the ownership * of this collator, thus the responsibility of deletion lies with the user. -* This method causes internal data such as Boyer-Moore shift tables to -* be recalculated, but the iterator's position is unchanged. +* This method causes internal data such as the pattern collation elements +* and shift tables to be recalculated, but the iterator's position is unchanged. * @param strsrch search iterator data struct * @param collator to be used * @param status for errors if it occurs @@ -608,7 +609,7 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, /** * Sets the pattern used for matching. -* Internal data like the Boyer Moore table will be recalculated, but the +* Internal data like the pattern collation elements will be recalculated, but the * iterator's position is unchanged. * * The UStringSearch retains a pointer to the pattern string. The caller must not diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp index 1bc02e77b7bd..3e16f62d2501 100644 --- a/icu4c/source/i18n/usearch.cpp +++ b/icu4c/source/i18n/usearch.cpp @@ -26,10 +26,6 @@ U_NAMESPACE_USE -// don't use Boyer-Moore -// (and if we decide to turn this on again there are several new TODOs that will need to be addressed) -#define BOYER_MOORE 0 - // internal definition --------------------------------------------------- #define LAST_BYTE_MASK_ 0xFF @@ -545,2048 +541,79 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[], * success when it is passed in. */ static -inline void initialize(UStringSearch *strsrch, UErrorCode *status) -{ - int16_t expandlength = initializePattern(strsrch, status); - if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) { - UPattern *pattern = &strsrch->pattern; - int32_t cesize = pattern->cesLength; - - int16_t minlength = cesize > expandlength - ? (int16_t)cesize - expandlength : 1; - pattern->defaultShiftSize = minlength; - setShiftTable(pattern->shift, pattern->backShift, pattern->ces, - cesize, expandlength, minlength, minlength); - return; - } - strsrch->pattern.defaultShiftSize = 0; -} - -#if BOYER_MOORE -/** -* Check to make sure that the match length is at the end of the character by -* using the breakiterator. -* @param strsrch string search data -* @param start target text start offset -* @param end target text end offset -*/ -static -void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, - int32_t *end) -{ -#if !UCONFIG_NO_BREAK_ITERATION - UBreakIterator *breakiterator = strsrch->search->internalBreakIter; - if (breakiterator) { - int32_t matchend = *end; - //int32_t matchstart = *start; - - if (!ubrk_isBoundary(breakiterator, matchend)) { - *end = ubrk_following(breakiterator, matchend); - } - - /* Check the start of the matched text to make sure it doesn't have any accents - * before it. This code may not be necessary and so it is commented out */ - /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { - *start = ubrk_preceding(breakiterator, matchstart); - }*/ - } -#endif -} - -/** -* Determine whether the target text in UStringSearch bounded by the offset -* start and end is one or more whole units of text as -* determined by the breakiterator in UStringSearch. -* @param strsrch string search data -* @param start target text start offset -* @param end target text end offset -*/ -static -UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, - int32_t end) -{ -#if !UCONFIG_NO_BREAK_ITERATION - UBreakIterator *breakiterator = strsrch->search->breakIter; - //TODO: Add here. - if (breakiterator) { - int32_t startindex = ubrk_first(breakiterator); - int32_t endindex = ubrk_last(breakiterator); - - // out-of-range indexes are never boundary positions - if (start < startindex || start > endindex || - end < startindex || end > endindex) { - return FALSE; - } - // otherwise, we can use following() on the position before the - // specified one and return true of the position we get back is the - // one the user specified - UBool result = (start == startindex || - ubrk_following(breakiterator, start - 1) == start) && - (end == endindex || - ubrk_following(breakiterator, end - 1) == end); - if (result) { - // iterates the individual ces - UCollationElements *coleiter = strsrch->utilIter; - const UChar *text = strsrch->search->text + - start; - UErrorCode status = U_ZERO_ERROR; - ucol_setText(coleiter, text, end - start, &status); - for (int32_t count = 0; count < strsrch->pattern.cesLength; - count ++) { - int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); - if (ce == UCOL_IGNORABLE) { - count --; - continue; - } - if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) { - return FALSE; - } - } - int32_t nextce = ucol_next(coleiter, &status); - while (ucol_getOffset(coleiter) == (end - start) - && getCE(strsrch, nextce) == UCOL_IGNORABLE) { - nextce = ucol_next(coleiter, &status); - } - if (ucol_getOffset(coleiter) == (end - start) - && nextce != UCOL_NULLORDER) { - // extra collation elements at the end of the match - return FALSE; - } - } - return result; - } -#endif - return TRUE; -} - -/** -* Getting the next base character offset if current offset is an accent, -* or the current offset if the current character contains a base character. -* accents the following base character will be returned -* @param text string -* @param textoffset current offset -* @param textlength length of text string -* @return the next base character or the current offset -* if the current character is contains a base character. -*/ -static -inline int32_t getNextBaseOffset(const UChar *text, - int32_t textoffset, - int32_t textlength) -{ - if (textoffset < textlength) { - int32_t temp = textoffset; - if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { - while (temp < textlength) { - int32_t result = temp; - if ((getFCD(text, &temp, textlength) >> - SECOND_LAST_BYTE_SHIFT_) == 0) { - return result; - } - } - return textlength; - } - } - return textoffset; -} - -/** -* Gets the next base character offset depending on the string search pattern -* data -* @param strsrch string search data -* @param textoffset current offset, one offset away from the last character -* to search for. -* @return start index of the next base character or the current offset -* if the current character is contains a base character. -*/ -static -inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, - int32_t textoffset) -{ - int32_t textlength = strsrch->search->textLength; - if (strsrch->pattern.hasSuffixAccents && - textoffset < textlength) { - int32_t temp = textoffset; - const UChar *text = strsrch->search->text; - U16_BACK_1(text, 0, temp); - if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { - return getNextBaseOffset(text, textoffset, textlength); - } - } - return textoffset; -} - -/** -* Shifting the collation element iterator position forward to prepare for -* a following match. If the last character is a unsafe character, we'll only -* shift by 1 to capture contractions, normalization etc. -* Internal method, status assumed to be success. -* @param text strsrch string search data -* @param textoffset start text position to do search -* @param ce the text ce which failed the match. -* @param patternceindex index of the ce within the pattern ce buffer which -* failed the match -* @return final offset -*/ -static -inline int32_t shiftForward(UStringSearch *strsrch, - int32_t textoffset, - int32_t ce, - int32_t patternceindex) -{ - UPattern *pattern = &(strsrch->pattern); - if (ce != UCOL_NULLORDER) { - int32_t shift = pattern->shift[hashFromCE32(ce)]; - // this is to adjust for characters in the middle of the - // substring for matching that failed. - int32_t adjust = pattern->cesLength - patternceindex; - if (adjust > 1 && shift >= adjust) { - shift -= adjust - 1; - } - textoffset += shift; - } - else { - textoffset += pattern->defaultShiftSize; - } - - textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); - // check for unsafe characters - // * if it is the start or middle of a contraction: to be done after - // a initial match is found - // * thai or lao base consonant character: similar to contraction - // * high surrogate character: similar to contraction - // * next character is a accent: shift to the next base character - return textoffset; -} -#endif // #if BOYER_MOORE - -/** -* sets match not found -* @param strsrch string search data -*/ -static -inline void setMatchNotFound(UStringSearch *strsrch) -{ - // this method resets the match result regardless of the error status. - strsrch->search->matchedIndex = USEARCH_DONE; - strsrch->search->matchedLength = 0; - if (strsrch->search->isForwardSearching) { - setColEIterOffset(strsrch->textIter, strsrch->search->textLength); - } - else { - setColEIterOffset(strsrch->textIter, 0); - } -} - -#if BOYER_MOORE -/** -* Gets the offset to the next safe point in text. -* ie. not the middle of a contraction, swappable characters or supplementary -* characters. -* @param collator collation sata -* @param text string to work with -* @param textoffset offset in string -* @param textlength length of text string -* @return offset to the next safe character -*/ -static -inline int32_t getNextSafeOffset(const UCollator *collator, - const UChar *text, - int32_t textoffset, - int32_t textlength) -{ - int32_t result = textoffset; // first contraction character - while (result != textlength && ucol_unsafeCP(text[result], collator)) { - result ++; - } - return result; -} - -/** -* This checks for accents in the potential match started with a . -* composite character. -* This is really painful... we have to check that composite character do not -* have any extra accents. We have to normalize the potential match and find -* the immediate decomposed character before the match. -* The first composite character would have been taken care of by the fcd -* checks in checkForwardExactMatch. -* This is the slow path after the fcd of the first character and -* the last character has been checked by checkForwardExactMatch and we -* determine that the potential match has extra non-ignorable preceding -* ces. -* E.g. looking for \u0301 acute in \u01FA A ring above and acute, -* checkExtraMatchAccent should fail since there is a middle ring in \u01FA -* Note here that accents checking are slow and cautioned in the API docs. -* Internal method, status assumed to be a success, caller should check status -* before calling this method -* @param strsrch string search data -* @param start index of the potential unfriendly composite character -* @param end index of the potential unfriendly composite character -* @param status output error status if any. -* @return TRUE if there is non-ignorable accents before at the beginning -* of the match, FALSE otherwise. -*/ - -static -UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, - int32_t end, - UErrorCode *status) -{ - UBool result = FALSE; - if (strsrch->pattern.hasPrefixAccents) { - int32_t length = end - start; - int32_t offset = 0; - const UChar *text = strsrch->search->text + start; - - U16_FWD_1(text, offset, length); - // we are only concerned with the first composite character - if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { - int32_t safeoffset = getNextSafeOffset(strsrch->collator, - text, 0, length); - if (safeoffset != length) { - safeoffset ++; - } - UChar *norm = NULL; - UChar buffer[INITIAL_ARRAY_SIZE_]; - int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, - buffer, INITIAL_ARRAY_SIZE_, - status); - if (U_FAILURE(*status)) { - return FALSE; - } - if (size >= INITIAL_ARRAY_SIZE_) { - norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), - status); - // if allocation failed, status will be set to - // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally - // checks for it. - size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, - size, status); - if (U_FAILURE(*status) && norm != NULL) { - uprv_free(norm); - return FALSE; - } - } - else { - norm = buffer; - } - - UCollationElements *coleiter = strsrch->utilIter; - ucol_setText(coleiter, norm, size, status); - uint32_t firstce = strsrch->pattern.ces[0]; - UBool ignorable = TRUE; - uint32_t ce = UCOL_IGNORABLE; - while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { - offset = ucol_getOffset(coleiter); - if (ce != firstce && ce != UCOL_IGNORABLE) { - ignorable = FALSE; - } - ce = ucol_next(coleiter, status); - } - UChar32 codepoint; - U16_PREV(norm, 0, offset, codepoint); - result = !ignorable && (u_getCombiningClass(codepoint) != 0); - - if (norm != buffer) { - uprv_free(norm); - } - } - } - - return result; -} - -/** -* Used by exact matches, checks if there are accents before the match. -* This is really painful... we have to check that composite characters at -* the start of the matches have to not have any extra accents. -* We check the FCD of the character first, if it starts with an accent and -* the first pattern ce does not match the first ce of the character, we bail. -* Otherwise we try normalizing the first composite -* character and find the immediate decomposed character before the match to -* see if it is an non-ignorable accent. -* Now normalizing the first composite character is enough because we ensure -* that when the match is passed in here with extra beginning ces, the -* first or last ce that match has to occur within the first character. -* E.g. looking for \u0301 acute in \u01FA A ring above and acute, -* checkExtraMatchAccent should fail since there is a middle ring in \u01FA -* Note here that accents checking are slow and cautioned in the API docs. -* @param strsrch string search data -* @param start offset -* @param end offset -* @return TRUE if there are accents on either side of the match, -* FALSE otherwise -*/ -static -UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, - int32_t end) -{ - if (strsrch->pattern.hasPrefixAccents) { - UCollationElements *coleiter = strsrch->textIter; - UErrorCode status = U_ZERO_ERROR; - // we have been iterating forwards previously - uint32_t ignorable = TRUE; - int32_t firstce = strsrch->pattern.ces[0]; - - setColEIterOffset(coleiter, start); - int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); - if (U_FAILURE(status)) { - return TRUE; - } - while (ce != firstce) { - if (ce != UCOL_IGNORABLE) { - ignorable = FALSE; - } - ce = getCE(strsrch, ucol_next(coleiter, &status)); - if (U_FAILURE(status) || ce == UCOL_NULLORDER) { - return TRUE; - } - } - if (!ignorable && inNormBuf(coleiter)) { - // within normalization buffer, discontiguous handled here - return TRUE; - } - - // within text - int32_t temp = start; - // original code - // accent = (getFCD(strsrch->search->text, &temp, - // strsrch->search->textLength) - // >> SECOND_LAST_BYTE_SHIFT_); - // however this code does not work well with VC7 .net in release mode. - // maybe the inlines for getFCD combined with shifting has bugs in - // VC7. anyways this is a work around. - UBool accent = getFCD(strsrch->search->text, &temp, - strsrch->search->textLength) > 0xFF; - if (!accent) { - return checkExtraMatchAccents(strsrch, start, end, &status); - } - if (!ignorable) { - return TRUE; - } - if (start > 0) { - temp = start; - U16_BACK_1(strsrch->search->text, 0, temp); - if (getFCD(strsrch->search->text, &temp, - strsrch->search->textLength) & LAST_BYTE_MASK_) { - setColEIterOffset(coleiter, start); - ce = ucol_previous(coleiter, &status); - if (U_FAILURE(status) || - (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { - return TRUE; - } - } - } - } - - return FALSE; -} - -/** -* Used by exact matches, checks if there are accents bounding the match. -* Note this is the initial boundary check. If the potential match -* starts or ends with composite characters, the accents in those -* characters will be determined later. -* Not doing backwards iteration here, since discontiguous contraction for -* backwards collation element iterator, use up too many characters. -* E.g. looking for \u030A ring in \u01FA A ring above and acute, -* should fail since there is a acute at the end of \u01FA -* Note here that accents checking are slow and cautioned in the API docs. -* @param strsrch string search data -* @param start offset of match -* @param end end offset of the match -* @return TRUE if there are accents on either side of the match, -* FALSE otherwise -*/ -static -UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, - int32_t end) -{ - if (strsrch->pattern.hasSuffixAccents) { - const UChar *text = strsrch->search->text; - int32_t temp = end; - int32_t textlength = strsrch->search->textLength; - U16_BACK_1(text, 0, temp); - if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { - int32_t firstce = strsrch->pattern.ces[0]; - UCollationElements *coleiter = strsrch->textIter; - UErrorCode status = U_ZERO_ERROR; - int32_t ce; - setColEIterOffset(coleiter, start); - while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { - if (U_FAILURE(status) || ce == UCOL_NULLORDER) { - return TRUE; - } - } - int32_t count = 1; - while (count < strsrch->pattern.cesLength) { - if (getCE(strsrch, ucol_next(coleiter, &status)) - == UCOL_IGNORABLE) { - // Thai can give an ignorable here. - count --; - } - if (U_FAILURE(status)) { - return TRUE; - } - count ++; - } - - ce = ucol_next(coleiter, &status); - if (U_FAILURE(status)) { - return TRUE; - } - if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { - ce = getCE(strsrch, ce); - } - if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { - if (ucol_getOffset(coleiter) <= end) { - return TRUE; - } - if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { - return TRUE; - } - } - } - } - return FALSE; -} -#endif // #if BOYER_MOORE - -/** -* Checks if the offset runs out of the text string -* @param offset -* @param textlength of the text string -* @return TRUE if offset is out of bounds, FALSE otherwise -*/ -static -inline UBool isOutOfBounds(int32_t textlength, int32_t offset) -{ - return offset < 0 || offset > textlength; -} - -/** -* Checks for identical match -* @param strsrch string search data -* @param start offset of possible match -* @param end offset of possible match -* @return TRUE if identical match is found -*/ -static -inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, - int32_t end) -{ - if (strsrch->strength != UCOL_IDENTICAL) { - return TRUE; - } - - // Note: We could use Normalizer::compare() or similar, but for short strings - // which may not be in FCD it might be faster to just NFD them. - UErrorCode status = U_ZERO_ERROR; - UnicodeString t2, p2; - strsrch->nfd->normalize( - UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); - strsrch->nfd->normalize( - UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); - // return FALSE if NFD failed - return U_SUCCESS(status) && t2 == p2; -} - -#if BOYER_MOORE -/** -* Checks to see if the match is repeated -* @param strsrch string search data -* @param start new match start index -* @param end new match end index -* @return TRUE if the the match is repeated, FALSE otherwise -*/ -static -inline UBool checkRepeatedMatch(UStringSearch *strsrch, - int32_t start, - int32_t end) -{ - int32_t lastmatchindex = strsrch->search->matchedIndex; - UBool result; - if (lastmatchindex == USEARCH_DONE) { - return FALSE; - } - if (strsrch->search->isForwardSearching) { - result = start <= lastmatchindex; - } - else { - result = start >= lastmatchindex; - } - if (!result && !strsrch->search->isOverlap) { - if (strsrch->search->isForwardSearching) { - result = start < lastmatchindex + strsrch->search->matchedLength; - } - else { - result = end > lastmatchindex; - } - } - return result; -} - -/** -* Gets the collation element iterator's current offset. -* @param coleiter collation element iterator -* @param forwards flag TRUE if we are moving in th forwards direction -* @return current offset -*/ -static -inline int32_t getColElemIterOffset(const UCollationElements *coleiter, - UBool forwards) -{ - int32_t result = ucol_getOffset(coleiter); - // intricacies of the the backwards collation element iterator - if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { - result ++; - } - return result; -} - -/** -* Checks match for contraction. -* If the match ends with a partial contraction we fail. -* If the match starts too far off (because of backwards iteration) we try to -* chip off the extra characters depending on whether a breakiterator has -* been used. -* Internal method, error assumed to be success, caller has to check status -* before calling this method. -* @param strsrch string search data -* @param start offset of potential match, to be modified if necessary -* @param end offset of potential match, to be modified if necessary -* @param status output error status if any -* @return TRUE if match passes the contraction test, FALSE otherwise -*/ - -static -UBool checkNextExactContractionMatch(UStringSearch *strsrch, - int32_t *start, - int32_t *end, UErrorCode *status) -{ - UCollationElements *coleiter = strsrch->textIter; - int32_t textlength = strsrch->search->textLength; - int32_t temp = *start; - const UCollator *collator = strsrch->collator; - const UChar *text = strsrch->search->text; - // This part checks if either ends of the match contains potential - // contraction. If so we'll have to iterate through them - // The start contraction needs to be checked since ucol_previous dumps - // all characters till the first safe character into the buffer. - // *start + 1 is used to test for the unsafe characters instead of *start - // because ucol_prev takes all unsafe characters till the first safe - // character ie *start. so by testing *start + 1, we can estimate if - // excess prefix characters has been included in the potential search - // results. - if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || - (*start + 1 < textlength - && ucol_unsafeCP(text[*start + 1], collator))) { - int32_t expansion = getExpansionPrefix(coleiter); - UBool expandflag = expansion > 0; - setColEIterOffset(coleiter, *start); - while (expansion > 0) { - // getting rid of the redundant ce, caused by setOffset. - // since backward contraction/expansion may have extra ces if we - // are in the normalization buffer, hasAccentsBeforeMatch would - // have taken care of it. - // E.g. the character \u01FA will have an expansion of 3, but if - // we are only looking for acute and ring \u030A and \u0301, we'll - // have to skip the first ce in the expansion buffer. - ucol_next(coleiter, status); - if (U_FAILURE(*status)) { - return FALSE; - } - if (ucol_getOffset(coleiter) != temp) { - *start = temp; - temp = ucol_getOffset(coleiter); - } - expansion --; - } - - int32_t *patternce = strsrch->pattern.ces; - int32_t patterncelength = strsrch->pattern.cesLength; - int32_t count = 0; - while (count < patterncelength) { - int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); - if (ce == UCOL_IGNORABLE) { - continue; - } - if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { - *start = temp; - temp = ucol_getOffset(coleiter); - } - if (U_FAILURE(*status) || ce != patternce[count]) { - (*end) ++; - *end = getNextUStringSearchBaseOffset(strsrch, *end); - return FALSE; - } - count ++; - } - } - return TRUE; -} - -/** -* Checks and sets the match information if found. -* Checks -*