diff --git a/docs/userguide/icu4j/faq.md b/docs/userguide/icu4j/faq.md
index 81e8e30014d6..e9b086ed32a2 100644
--- a/docs/userguide/icu4j/faq.md
+++ b/docs/userguide/icu4j/faq.md
@@ -195,7 +195,9 @@ determine whether case and accents are ignored during a search.
#### What algorithm are you using to perform the search?
-StringSearch uses a version of the Boyer-Moore search algorithm that has been
+As of ICU 4.0, StringSearch uses a simple linear search algorithm which
+locates a match by shifting a cursor in the target text one by one. Previous
+versions of ICU used a version of the Boyer-Moore search algorithm which was
modified for use with Unicode. Rather than using raw Unicode character values in
its comparisons and shift tables, the algorithm uses collation elements that
have been "hashed" down to a smaller range to make the tables a reasonable size.
diff --git a/icu4c/source/i18n/unicode/usearch.h b/icu4c/source/i18n/unicode/usearch.h
index 65747cb1ed45..04a5aee8f88a 100644
--- a/icu4c/source/i18n/unicode/usearch.h
+++ b/icu4c/source/i18n/unicode/usearch.h
@@ -35,8 +35,9 @@
* See the
* "ICU Collation Design Document" for more information.
*
- * The implementation may use a linear search or a modified form of the Boyer-Moore
- * search; for more information on the latter see
+ * As of ICU 4.0, the implementation uses a linear search. In previous versions,
+ * a modified form of the Boyer-Moore searching algorithm was used. For more information
+ * on the modified Boyer-Moore algorithm see
*
* "Efficient Text Searching in Java", published in Java Report
* in February, 1999.
@@ -595,8 +596,8 @@ U_CAPI UCollator * U_EXPORT2 usearch_getCollator(
/**
* Sets the collator used for the language rules. User retains the ownership
* of this collator, thus the responsibility of deletion lies with the user.
-* This method causes internal data such as Boyer-Moore shift tables to
-* be recalculated, but the iterator's position is unchanged.
+* This method causes internal data such as the pattern collation elements
+* and shift tables to be recalculated, but the iterator's position is unchanged.
* @param strsrch search iterator data struct
* @param collator to be used
* @param status for errors if it occurs
@@ -608,7 +609,7 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
/**
* Sets the pattern used for matching.
-* Internal data like the Boyer Moore table will be recalculated, but the
+* Internal data like the pattern collation elements will be recalculated, but the
* iterator's position is unchanged.
*
* The UStringSearch retains a pointer to the pattern string. The caller must not
diff --git a/icu4c/source/i18n/usearch.cpp b/icu4c/source/i18n/usearch.cpp
index 1bc02e77b7bd..3e16f62d2501 100644
--- a/icu4c/source/i18n/usearch.cpp
+++ b/icu4c/source/i18n/usearch.cpp
@@ -26,10 +26,6 @@
U_NAMESPACE_USE
-// don't use Boyer-Moore
-// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
-#define BOYER_MOORE 0
-
// internal definition ---------------------------------------------------
#define LAST_BYTE_MASK_ 0xFF
@@ -545,2048 +541,79 @@ inline void setShiftTable(int16_t shift[], int16_t backshift[],
* success when it is passed in.
*/
static
-inline void initialize(UStringSearch *strsrch, UErrorCode *status)
-{
- int16_t expandlength = initializePattern(strsrch, status);
- if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
- UPattern *pattern = &strsrch->pattern;
- int32_t cesize = pattern->cesLength;
-
- int16_t minlength = cesize > expandlength
- ? (int16_t)cesize - expandlength : 1;
- pattern->defaultShiftSize = minlength;
- setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
- cesize, expandlength, minlength, minlength);
- return;
- }
- strsrch->pattern.defaultShiftSize = 0;
-}
-
-#if BOYER_MOORE
-/**
-* Check to make sure that the match length is at the end of the character by
-* using the breakiterator.
-* @param strsrch string search data
-* @param start target text start offset
-* @param end target text end offset
-*/
-static
-void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
- int32_t *end)
-{
-#if !UCONFIG_NO_BREAK_ITERATION
- UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
- if (breakiterator) {
- int32_t matchend = *end;
- //int32_t matchstart = *start;
-
- if (!ubrk_isBoundary(breakiterator, matchend)) {
- *end = ubrk_following(breakiterator, matchend);
- }
-
- /* Check the start of the matched text to make sure it doesn't have any accents
- * before it. This code may not be necessary and so it is commented out */
- /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
- *start = ubrk_preceding(breakiterator, matchstart);
- }*/
- }
-#endif
-}
-
-/**
-* Determine whether the target text in UStringSearch bounded by the offset
-* start and end is one or more whole units of text as
-* determined by the breakiterator in UStringSearch.
-* @param strsrch string search data
-* @param start target text start offset
-* @param end target text end offset
-*/
-static
-UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
- int32_t end)
-{
-#if !UCONFIG_NO_BREAK_ITERATION
- UBreakIterator *breakiterator = strsrch->search->breakIter;
- //TODO: Add here.
- if (breakiterator) {
- int32_t startindex = ubrk_first(breakiterator);
- int32_t endindex = ubrk_last(breakiterator);
-
- // out-of-range indexes are never boundary positions
- if (start < startindex || start > endindex ||
- end < startindex || end > endindex) {
- return FALSE;
- }
- // otherwise, we can use following() on the position before the
- // specified one and return true of the position we get back is the
- // one the user specified
- UBool result = (start == startindex ||
- ubrk_following(breakiterator, start - 1) == start) &&
- (end == endindex ||
- ubrk_following(breakiterator, end - 1) == end);
- if (result) {
- // iterates the individual ces
- UCollationElements *coleiter = strsrch->utilIter;
- const UChar *text = strsrch->search->text +
- start;
- UErrorCode status = U_ZERO_ERROR;
- ucol_setText(coleiter, text, end - start, &status);
- for (int32_t count = 0; count < strsrch->pattern.cesLength;
- count ++) {
- int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
- if (ce == UCOL_IGNORABLE) {
- count --;
- continue;
- }
- if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
- return FALSE;
- }
- }
- int32_t nextce = ucol_next(coleiter, &status);
- while (ucol_getOffset(coleiter) == (end - start)
- && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
- nextce = ucol_next(coleiter, &status);
- }
- if (ucol_getOffset(coleiter) == (end - start)
- && nextce != UCOL_NULLORDER) {
- // extra collation elements at the end of the match
- return FALSE;
- }
- }
- return result;
- }
-#endif
- return TRUE;
-}
-
-/**
-* Getting the next base character offset if current offset is an accent,
-* or the current offset if the current character contains a base character.
-* accents the following base character will be returned
-* @param text string
-* @param textoffset current offset
-* @param textlength length of text string
-* @return the next base character or the current offset
-* if the current character is contains a base character.
-*/
-static
-inline int32_t getNextBaseOffset(const UChar *text,
- int32_t textoffset,
- int32_t textlength)
-{
- if (textoffset < textlength) {
- int32_t temp = textoffset;
- if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
- while (temp < textlength) {
- int32_t result = temp;
- if ((getFCD(text, &temp, textlength) >>
- SECOND_LAST_BYTE_SHIFT_) == 0) {
- return result;
- }
- }
- return textlength;
- }
- }
- return textoffset;
-}
-
-/**
-* Gets the next base character offset depending on the string search pattern
-* data
-* @param strsrch string search data
-* @param textoffset current offset, one offset away from the last character
-* to search for.
-* @return start index of the next base character or the current offset
-* if the current character is contains a base character.
-*/
-static
-inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
- int32_t textoffset)
-{
- int32_t textlength = strsrch->search->textLength;
- if (strsrch->pattern.hasSuffixAccents &&
- textoffset < textlength) {
- int32_t temp = textoffset;
- const UChar *text = strsrch->search->text;
- U16_BACK_1(text, 0, temp);
- if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
- return getNextBaseOffset(text, textoffset, textlength);
- }
- }
- return textoffset;
-}
-
-/**
-* Shifting the collation element iterator position forward to prepare for
-* a following match. If the last character is a unsafe character, we'll only
-* shift by 1 to capture contractions, normalization etc.
-* Internal method, status assumed to be success.
-* @param text strsrch string search data
-* @param textoffset start text position to do search
-* @param ce the text ce which failed the match.
-* @param patternceindex index of the ce within the pattern ce buffer which
-* failed the match
-* @return final offset
-*/
-static
-inline int32_t shiftForward(UStringSearch *strsrch,
- int32_t textoffset,
- int32_t ce,
- int32_t patternceindex)
-{
- UPattern *pattern = &(strsrch->pattern);
- if (ce != UCOL_NULLORDER) {
- int32_t shift = pattern->shift[hashFromCE32(ce)];
- // this is to adjust for characters in the middle of the
- // substring for matching that failed.
- int32_t adjust = pattern->cesLength - patternceindex;
- if (adjust > 1 && shift >= adjust) {
- shift -= adjust - 1;
- }
- textoffset += shift;
- }
- else {
- textoffset += pattern->defaultShiftSize;
- }
-
- textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
- // check for unsafe characters
- // * if it is the start or middle of a contraction: to be done after
- // a initial match is found
- // * thai or lao base consonant character: similar to contraction
- // * high surrogate character: similar to contraction
- // * next character is a accent: shift to the next base character
- return textoffset;
-}
-#endif // #if BOYER_MOORE
-
-/**
-* sets match not found
-* @param strsrch string search data
-*/
-static
-inline void setMatchNotFound(UStringSearch *strsrch)
-{
- // this method resets the match result regardless of the error status.
- strsrch->search->matchedIndex = USEARCH_DONE;
- strsrch->search->matchedLength = 0;
- if (strsrch->search->isForwardSearching) {
- setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
- }
- else {
- setColEIterOffset(strsrch->textIter, 0);
- }
-}
-
-#if BOYER_MOORE
-/**
-* Gets the offset to the next safe point in text.
-* ie. not the middle of a contraction, swappable characters or supplementary
-* characters.
-* @param collator collation sata
-* @param text string to work with
-* @param textoffset offset in string
-* @param textlength length of text string
-* @return offset to the next safe character
-*/
-static
-inline int32_t getNextSafeOffset(const UCollator *collator,
- const UChar *text,
- int32_t textoffset,
- int32_t textlength)
-{
- int32_t result = textoffset; // first contraction character
- while (result != textlength && ucol_unsafeCP(text[result], collator)) {
- result ++;
- }
- return result;
-}
-
-/**
-* This checks for accents in the potential match started with a .
-* composite character.
-* This is really painful... we have to check that composite character do not
-* have any extra accents. We have to normalize the potential match and find
-* the immediate decomposed character before the match.
-* The first composite character would have been taken care of by the fcd
-* checks in checkForwardExactMatch.
-* This is the slow path after the fcd of the first character and
-* the last character has been checked by checkForwardExactMatch and we
-* determine that the potential match has extra non-ignorable preceding
-* ces.
-* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
-* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
-* Note here that accents checking are slow and cautioned in the API docs.
-* Internal method, status assumed to be a success, caller should check status
-* before calling this method
-* @param strsrch string search data
-* @param start index of the potential unfriendly composite character
-* @param end index of the potential unfriendly composite character
-* @param status output error status if any.
-* @return TRUE if there is non-ignorable accents before at the beginning
-* of the match, FALSE otherwise.
-*/
-
-static
-UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
- int32_t end,
- UErrorCode *status)
-{
- UBool result = FALSE;
- if (strsrch->pattern.hasPrefixAccents) {
- int32_t length = end - start;
- int32_t offset = 0;
- const UChar *text = strsrch->search->text + start;
-
- U16_FWD_1(text, offset, length);
- // we are only concerned with the first composite character
- if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
- int32_t safeoffset = getNextSafeOffset(strsrch->collator,
- text, 0, length);
- if (safeoffset != length) {
- safeoffset ++;
- }
- UChar *norm = NULL;
- UChar buffer[INITIAL_ARRAY_SIZE_];
- int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
- buffer, INITIAL_ARRAY_SIZE_,
- status);
- if (U_FAILURE(*status)) {
- return FALSE;
- }
- if (size >= INITIAL_ARRAY_SIZE_) {
- norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
- status);
- // if allocation failed, status will be set to
- // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
- // checks for it.
- size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
- size, status);
- if (U_FAILURE(*status) && norm != NULL) {
- uprv_free(norm);
- return FALSE;
- }
- }
- else {
- norm = buffer;
- }
-
- UCollationElements *coleiter = strsrch->utilIter;
- ucol_setText(coleiter, norm, size, status);
- uint32_t firstce = strsrch->pattern.ces[0];
- UBool ignorable = TRUE;
- uint32_t ce = UCOL_IGNORABLE;
- while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
- offset = ucol_getOffset(coleiter);
- if (ce != firstce && ce != UCOL_IGNORABLE) {
- ignorable = FALSE;
- }
- ce = ucol_next(coleiter, status);
- }
- UChar32 codepoint;
- U16_PREV(norm, 0, offset, codepoint);
- result = !ignorable && (u_getCombiningClass(codepoint) != 0);
-
- if (norm != buffer) {
- uprv_free(norm);
- }
- }
- }
-
- return result;
-}
-
-/**
-* Used by exact matches, checks if there are accents before the match.
-* This is really painful... we have to check that composite characters at
-* the start of the matches have to not have any extra accents.
-* We check the FCD of the character first, if it starts with an accent and
-* the first pattern ce does not match the first ce of the character, we bail.
-* Otherwise we try normalizing the first composite
-* character and find the immediate decomposed character before the match to
-* see if it is an non-ignorable accent.
-* Now normalizing the first composite character is enough because we ensure
-* that when the match is passed in here with extra beginning ces, the
-* first or last ce that match has to occur within the first character.
-* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
-* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
-* Note here that accents checking are slow and cautioned in the API docs.
-* @param strsrch string search data
-* @param start offset
-* @param end offset
-* @return TRUE if there are accents on either side of the match,
-* FALSE otherwise
-*/
-static
-UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
- int32_t end)
-{
- if (strsrch->pattern.hasPrefixAccents) {
- UCollationElements *coleiter = strsrch->textIter;
- UErrorCode status = U_ZERO_ERROR;
- // we have been iterating forwards previously
- uint32_t ignorable = TRUE;
- int32_t firstce = strsrch->pattern.ces[0];
-
- setColEIterOffset(coleiter, start);
- int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
- if (U_FAILURE(status)) {
- return TRUE;
- }
- while (ce != firstce) {
- if (ce != UCOL_IGNORABLE) {
- ignorable = FALSE;
- }
- ce = getCE(strsrch, ucol_next(coleiter, &status));
- if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
- return TRUE;
- }
- }
- if (!ignorable && inNormBuf(coleiter)) {
- // within normalization buffer, discontiguous handled here
- return TRUE;
- }
-
- // within text
- int32_t temp = start;
- // original code
- // accent = (getFCD(strsrch->search->text, &temp,
- // strsrch->search->textLength)
- // >> SECOND_LAST_BYTE_SHIFT_);
- // however this code does not work well with VC7 .net in release mode.
- // maybe the inlines for getFCD combined with shifting has bugs in
- // VC7. anyways this is a work around.
- UBool accent = getFCD(strsrch->search->text, &temp,
- strsrch->search->textLength) > 0xFF;
- if (!accent) {
- return checkExtraMatchAccents(strsrch, start, end, &status);
- }
- if (!ignorable) {
- return TRUE;
- }
- if (start > 0) {
- temp = start;
- U16_BACK_1(strsrch->search->text, 0, temp);
- if (getFCD(strsrch->search->text, &temp,
- strsrch->search->textLength) & LAST_BYTE_MASK_) {
- setColEIterOffset(coleiter, start);
- ce = ucol_previous(coleiter, &status);
- if (U_FAILURE(status) ||
- (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
- return TRUE;
- }
- }
- }
- }
-
- return FALSE;
-}
-
-/**
-* Used by exact matches, checks if there are accents bounding the match.
-* Note this is the initial boundary check. If the potential match
-* starts or ends with composite characters, the accents in those
-* characters will be determined later.
-* Not doing backwards iteration here, since discontiguous contraction for
-* backwards collation element iterator, use up too many characters.
-* E.g. looking for \u030A ring in \u01FA A ring above and acute,
-* should fail since there is a acute at the end of \u01FA
-* Note here that accents checking are slow and cautioned in the API docs.
-* @param strsrch string search data
-* @param start offset of match
-* @param end end offset of the match
-* @return TRUE if there are accents on either side of the match,
-* FALSE otherwise
-*/
-static
-UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
- int32_t end)
-{
- if (strsrch->pattern.hasSuffixAccents) {
- const UChar *text = strsrch->search->text;
- int32_t temp = end;
- int32_t textlength = strsrch->search->textLength;
- U16_BACK_1(text, 0, temp);
- if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
- int32_t firstce = strsrch->pattern.ces[0];
- UCollationElements *coleiter = strsrch->textIter;
- UErrorCode status = U_ZERO_ERROR;
- int32_t ce;
- setColEIterOffset(coleiter, start);
- while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
- if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
- return TRUE;
- }
- }
- int32_t count = 1;
- while (count < strsrch->pattern.cesLength) {
- if (getCE(strsrch, ucol_next(coleiter, &status))
- == UCOL_IGNORABLE) {
- // Thai can give an ignorable here.
- count --;
- }
- if (U_FAILURE(status)) {
- return TRUE;
- }
- count ++;
- }
-
- ce = ucol_next(coleiter, &status);
- if (U_FAILURE(status)) {
- return TRUE;
- }
- if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
- ce = getCE(strsrch, ce);
- }
- if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
- if (ucol_getOffset(coleiter) <= end) {
- return TRUE;
- }
- if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
- return TRUE;
- }
- }
- }
- }
- return FALSE;
-}
-#endif // #if BOYER_MOORE
-
-/**
-* Checks if the offset runs out of the text string
-* @param offset
-* @param textlength of the text string
-* @return TRUE if offset is out of bounds, FALSE otherwise
-*/
-static
-inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
-{
- return offset < 0 || offset > textlength;
-}
-
-/**
-* Checks for identical match
-* @param strsrch string search data
-* @param start offset of possible match
-* @param end offset of possible match
-* @return TRUE if identical match is found
-*/
-static
-inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
- int32_t end)
-{
- if (strsrch->strength != UCOL_IDENTICAL) {
- return TRUE;
- }
-
- // Note: We could use Normalizer::compare() or similar, but for short strings
- // which may not be in FCD it might be faster to just NFD them.
- UErrorCode status = U_ZERO_ERROR;
- UnicodeString t2, p2;
- strsrch->nfd->normalize(
- UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
- strsrch->nfd->normalize(
- UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
- // return FALSE if NFD failed
- return U_SUCCESS(status) && t2 == p2;
-}
-
-#if BOYER_MOORE
-/**
-* Checks to see if the match is repeated
-* @param strsrch string search data
-* @param start new match start index
-* @param end new match end index
-* @return TRUE if the the match is repeated, FALSE otherwise
-*/
-static
-inline UBool checkRepeatedMatch(UStringSearch *strsrch,
- int32_t start,
- int32_t end)
-{
- int32_t lastmatchindex = strsrch->search->matchedIndex;
- UBool result;
- if (lastmatchindex == USEARCH_DONE) {
- return FALSE;
- }
- if (strsrch->search->isForwardSearching) {
- result = start <= lastmatchindex;
- }
- else {
- result = start >= lastmatchindex;
- }
- if (!result && !strsrch->search->isOverlap) {
- if (strsrch->search->isForwardSearching) {
- result = start < lastmatchindex + strsrch->search->matchedLength;
- }
- else {
- result = end > lastmatchindex;
- }
- }
- return result;
-}
-
-/**
-* Gets the collation element iterator's current offset.
-* @param coleiter collation element iterator
-* @param forwards flag TRUE if we are moving in th forwards direction
-* @return current offset
-*/
-static
-inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
- UBool forwards)
-{
- int32_t result = ucol_getOffset(coleiter);
- // intricacies of the the backwards collation element iterator
- if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
- result ++;
- }
- return result;
-}
-
-/**
-* Checks match for contraction.
-* If the match ends with a partial contraction we fail.
-* If the match starts too far off (because of backwards iteration) we try to
-* chip off the extra characters depending on whether a breakiterator has
-* been used.
-* Internal method, error assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param start offset of potential match, to be modified if necessary
-* @param end offset of potential match, to be modified if necessary
-* @param status output error status if any
-* @return TRUE if match passes the contraction test, FALSE otherwise
-*/
-
-static
-UBool checkNextExactContractionMatch(UStringSearch *strsrch,
- int32_t *start,
- int32_t *end, UErrorCode *status)
-{
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t temp = *start;
- const UCollator *collator = strsrch->collator;
- const UChar *text = strsrch->search->text;
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- // The start contraction needs to be checked since ucol_previous dumps
- // all characters till the first safe character into the buffer.
- // *start + 1 is used to test for the unsafe characters instead of *start
- // because ucol_prev takes all unsafe characters till the first safe
- // character ie *start. so by testing *start + 1, we can estimate if
- // excess prefix characters has been included in the potential search
- // results.
- if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
- (*start + 1 < textlength
- && ucol_unsafeCP(text[*start + 1], collator))) {
- int32_t expansion = getExpansionPrefix(coleiter);
- UBool expandflag = expansion > 0;
- setColEIterOffset(coleiter, *start);
- while (expansion > 0) {
- // getting rid of the redundant ce, caused by setOffset.
- // since backward contraction/expansion may have extra ces if we
- // are in the normalization buffer, hasAccentsBeforeMatch would
- // have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but if
- // we are only looking for acute and ring \u030A and \u0301, we'll
- // have to skip the first ce in the expansion buffer.
- ucol_next(coleiter, status);
- if (U_FAILURE(*status)) {
- return FALSE;
- }
- if (ucol_getOffset(coleiter) != temp) {
- *start = temp;
- temp = ucol_getOffset(coleiter);
- }
- expansion --;
- }
-
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t count = 0;
- while (count < patterncelength) {
- int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
- if (ce == UCOL_IGNORABLE) {
- continue;
- }
- if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
- *start = temp;
- temp = ucol_getOffset(coleiter);
- }
- if (U_FAILURE(*status) || ce != patternce[count]) {
- (*end) ++;
- *end = getNextUStringSearchBaseOffset(strsrch, *end);
- return FALSE;
- }
- count ++;
- }
- }
- return TRUE;
-}
-
-/**
-* Checks and sets the match information if found.
-* Checks
-*
-* - the potential match does not repeat the previous match
-*
- boundaries are correct
-*
- exact matches has no extra accents
-*
- identical matchesb
-*
- potential match does not end in the middle of a contraction
-* <\ul>
-* Otherwise the offset will be shifted to the next character.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param textoffset offset in the collation element text. the returned value
-* will be the truncated end offset of the match or the new start
-* search offset.
-* @param status output error status if any
-* @return TRUE if the match is valid, FALSE otherwise
-*/
-static
-inline UBool checkNextExactMatch(UStringSearch *strsrch,
- int32_t *textoffset, UErrorCode *status)
-{
- UCollationElements *coleiter = strsrch->textIter;
- int32_t start = getColElemIterOffset(coleiter, FALSE);
-
- if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
- return FALSE;
- }
-
- // this totally matches, however we need to check if it is repeating
- if (!isBreakUnit(strsrch, start, *textoffset) ||
- checkRepeatedMatch(strsrch, start, *textoffset) ||
- hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
- !checkIdentical(strsrch, start, *textoffset) ||
- hasAccentsAfterMatch(strsrch, start, *textoffset)) {
-
- (*textoffset) ++;
- *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
- return FALSE;
- }
-
- //Add breakiterator boundary check for primary strength search.
- if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
- checkBreakBoundary(strsrch, &start, textoffset);
- }
-
- // totally match, we will get rid of the ending ignorables.
- strsrch->search->matchedIndex = start;
- strsrch->search->matchedLength = *textoffset - start;
- return TRUE;
-}
-
-/**
-* Getting the previous base character offset, or the current offset if the
-* current character is a base character
-* @param text string
-* @param textoffset one offset after the current character
-* @return the offset of the next character after the base character or the first
-* composed character with accents
-*/
-static
-inline int32_t getPreviousBaseOffset(const UChar *text,
- int32_t textoffset)
-{
- if (textoffset > 0) {
- for (;;) {
- int32_t result = textoffset;
- U16_BACK_1(text, 0, textoffset);
- int32_t temp = textoffset;
- uint16_t fcd = getFCD(text, &temp, result);
- if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- if (fcd & LAST_BYTE_MASK_) {
- return textoffset;
- }
- return result;
- }
- if (textoffset == 0) {
- return 0;
- }
- }
- }
- return textoffset;
-}
-
-/**
-* Getting the indexes of the accents that are not blocked in the argument
-* accent array
-* @param accents array of accents in nfd terminated by a 0.
-* @param accentsindex array of indexes of the accents that are not blocked
-*/
-static
-inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
-{
- int32_t index = 0;
- int32_t length = u_strlen(accents);
- UChar32 codepoint = 0;
- int cclass = 0;
- int result = 0;
- int32_t temp;
- while (index < length) {
- temp = index;
- U16_NEXT(accents, index, length, codepoint);
- if (u_getCombiningClass(codepoint) != cclass) {
- cclass = u_getCombiningClass(codepoint);
- accentsindex[result] = temp;
- result ++;
- }
- }
- accentsindex[result] = length;
- return result;
-}
-
-/**
-* Appends 3 UChar arrays to a destination array.
-* Creates a new array if we run out of space. The caller will have to
-* manually deallocate the newly allocated array.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method. destination not to be NULL and has at least
-* size destinationlength.
-* @param destination target array
-* @param destinationlength target array size, returning the appended length
-* @param source1 null-terminated first array
-* @param source2 second array
-* @param source2length length of second array
-* @param source3 null-terminated third array
-* @param status error status if any
-* @return new destination array, destination if there was no new allocation
-*/
-static
-inline UChar * addToUCharArray( UChar *destination,
- int32_t *destinationlength,
- const UChar *source1,
- const UChar *source2,
- int32_t source2length,
- const UChar *source3,
- UErrorCode *status)
-{
- int32_t source1length = source1 ? u_strlen(source1) : 0;
- int32_t source3length = source3 ? u_strlen(source3) : 0;
- if (*destinationlength < source1length + source2length + source3length +
- 1)
- {
- destination = (UChar *)allocateMemory(
- (source1length + source2length + source3length + 1) * sizeof(UChar),
- status);
- // if error allocating memory, status will be
- // U_MEMORY_ALLOCATION_ERROR
- if (U_FAILURE(*status)) {
- *destinationlength = 0;
- return NULL;
- }
- }
- if (source1length != 0) {
- u_memcpy(destination, source1, source1length);
- }
- if (source2length != 0) {
- uprv_memcpy(destination + source1length, source2,
- sizeof(UChar) * source2length);
- }
- if (source3length != 0) {
- uprv_memcpy(destination + source1length + source2length, source3,
- sizeof(UChar) * source3length);
- }
- *destinationlength = source1length + source2length + source3length;
- return destination;
-}
-
-/**
-* Running through a collation element iterator to see if the contents matches
-* pattern in string search data
-* @param strsrch string search data
-* @param coleiter collation element iterator
-* @return TRUE if a match if found, FALSE otherwise
-*/
-static
-inline UBool checkCollationMatch(const UStringSearch *strsrch,
- UCollationElements *coleiter)
-{
- int patternceindex = strsrch->pattern.cesLength;
- int32_t *patternce = strsrch->pattern.ces;
- UErrorCode status = U_ZERO_ERROR;
- while (patternceindex > 0) {
- int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
- if (ce == UCOL_IGNORABLE) {
- continue;
- }
- if (U_FAILURE(status) || ce != *patternce) {
- return FALSE;
- }
- patternce ++;
- patternceindex --;
- }
- return TRUE;
-}
-
-/**
-* Rearranges the front accents to try matching.
-* Prefix accents in the text will be grouped according to their combining
-* class and the groups will be mixed and matched to try find the perfect
-* match with the pattern.
-* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
-* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
-* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
-* "\u0301\u0325".
-* step 2: check if any of the generated substrings matches the pattern.
-* Internal method, status is assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search match
-* @param start first offset of the accents to start searching
-* @param end start of the last accent set
-* @param status output error status if any
-* @return USEARCH_DONE if a match is not found, otherwise return the starting
-* offset of the match. Note this start includes all preceding accents.
-*/
-static
-int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
- int32_t start,
- int32_t end,
- UErrorCode *status)
-{
- const UChar *text = strsrch->search->text;
- int32_t textlength = strsrch->search->textLength;
- int32_t tempstart = start;
-
- if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
- // die... failed at a base character
- return USEARCH_DONE;
- }
-
- int32_t offset = getNextBaseOffset(text, tempstart, textlength);
- start = getPreviousBaseOffset(text, tempstart);
-
- UChar accents[INITIAL_ARRAY_SIZE_];
- // normalizing the offensive string
- unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
- INITIAL_ARRAY_SIZE_, status);
- if (U_FAILURE(*status)) {
- return USEARCH_DONE;
- }
-
- int32_t accentsindex[INITIAL_ARRAY_SIZE_];
- int32_t accentsize = getUnblockedAccentIndex(accents,
- accentsindex);
- int32_t count = (2 << (accentsize - 1)) - 1;
- UChar buffer[INITIAL_ARRAY_SIZE_];
- UCollationElements *coleiter = strsrch->utilIter;
- while (U_SUCCESS(*status) && count > 0) {
- UChar *rearrange = strsrch->canonicalPrefixAccents;
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- *rearrange ++ = accents[k];
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= accentsize - 1; i ++) {
- int32_t mask = 1 << (accentsize - i - 1);
- if (count & mask) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
- *rearrange ++ = accents[j];
- }
- }
- }
- *rearrange = 0;
- int32_t matchsize = INITIAL_ARRAY_SIZE_;
- UChar *match = addToUCharArray(buffer, &matchsize,
- strsrch->canonicalPrefixAccents,
- strsrch->search->text + offset,
- end - offset,
- strsrch->canonicalSuffixAccents,
- status);
-
- // if status is a failure, ucol_setText does nothing.
- // run the collator iterator through this match
- ucol_setText(coleiter, match, matchsize, status);
- if (U_SUCCESS(*status)) {
- if (checkCollationMatch(strsrch, coleiter)) {
- if (match != buffer) {
- uprv_free(match);
- }
- return start;
- }
- }
- count --;
- }
- return USEARCH_DONE;
-}
-
-/**
-* Gets the offset to the safe point in text before textoffset.
-* ie. not the middle of a contraction, swappable characters or supplementary
-* characters.
-* @param collator collation sata
-* @param text string to work with
-* @param textoffset offset in string
-* @param textlength length of text string
-* @return offset to the previous safe character
-*/
-static
-inline uint32_t getPreviousSafeOffset(const UCollator *collator,
- const UChar *text,
- int32_t textoffset)
-{
- int32_t result = textoffset; // first contraction character
- while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
- result --;
- }
- if (result != 0) {
- // the first contraction character is consider unsafe here
- result --;
- }
- return result;
-}
-
-/**
-* Cleaning up after we passed the safe zone
-* @param strsrch string search data
-* @param safetext safe text array
-* @param safebuffer safe text buffer
-* @param coleiter collation element iterator for safe text
-*/
-static
-inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
- UChar *safebuffer)
-{
- if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
- {
- uprv_free(safetext);
- }
-}
-
-/**
-* Take the rearranged end accents and tries matching. If match failed at
-* a separate preceding set of accents (separated from the rearranged on by
-* at least a base character) then we rearrange the preceding accents and
-* tries matching again.
-* We allow skipping of the ends of the accent set if the ces do not match.
-* However if the failure is found before the accent set, it fails.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param textoffset of the start of the rearranged accent
-* @param status output error status if any
-* @return USEARCH_DONE if a match is not found, otherwise return the starting
-* offset of the match. Note this start includes all preceding accents.
-*/
-static
-int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
- int32_t textoffset,
- UErrorCode *status)
-{
- const UChar *text = strsrch->search->text;
- const UCollator *collator = strsrch->collator;
- int32_t safelength = 0;
- UChar *safetext;
- int32_t safetextlength;
- UChar safebuffer[INITIAL_ARRAY_SIZE_];
- UCollationElements *coleiter = strsrch->utilIter;
- int32_t safeoffset = textoffset;
-
- if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
- collator)) {
- safeoffset = getPreviousSafeOffset(collator, text, textoffset);
- safelength = textoffset - safeoffset;
- safetextlength = INITIAL_ARRAY_SIZE_;
- safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
- text + safeoffset, safelength,
- strsrch->canonicalSuffixAccents,
- status);
- }
- else {
- safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
- safetext = strsrch->canonicalSuffixAccents;
- }
-
- // if status is a failure, ucol_setText does nothing
- ucol_setText(coleiter, safetext, safetextlength, status);
- // status checked in loop below
-
- int32_t *ce = strsrch->pattern.ces;
- int32_t celength = strsrch->pattern.cesLength;
- int ceindex = celength - 1;
- UBool isSafe = TRUE; // indication flag for position in safe zone
-
- while (ceindex >= 0) {
- int32_t textce = ucol_previous(coleiter, status);
- if (U_FAILURE(*status)) {
- if (isSafe) {
- cleanUpSafeText(strsrch, safetext, safebuffer);
- }
- return USEARCH_DONE;
- }
- if (textce == UCOL_NULLORDER) {
- // check if we have passed the safe buffer
- if (coleiter == strsrch->textIter) {
- cleanUpSafeText(strsrch, safetext, safebuffer);
- return USEARCH_DONE;
- }
- cleanUpSafeText(strsrch, safetext, safebuffer);
- safetext = safebuffer;
- coleiter = strsrch->textIter;
- setColEIterOffset(coleiter, safeoffset);
- // status checked at the start of the loop
- isSafe = FALSE;
- continue;
- }
- textce = getCE(strsrch, textce);
- if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
- // do the beginning stuff
- int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
- if (isSafe && failedoffset >= safelength) {
- // alas... no hope. failed at rearranged accent set
- cleanUpSafeText(strsrch, safetext, safebuffer);
- return USEARCH_DONE;
- }
- else {
- if (isSafe) {
- failedoffset += safeoffset;
- cleanUpSafeText(strsrch, safetext, safebuffer);
- }
-
- // try rearranging the front accents
- int32_t result = doNextCanonicalPrefixMatch(strsrch,
- failedoffset, textoffset, status);
- if (result != USEARCH_DONE) {
- // if status is a failure, ucol_setOffset does nothing
- setColEIterOffset(strsrch->textIter, result);
- }
- if (U_FAILURE(*status)) {
- return USEARCH_DONE;
- }
- return result;
- }
- }
- if (textce == ce[ceindex]) {
- ceindex --;
- }
- }
- // set offset here
- if (isSafe) {
- int32_t result = getColElemIterOffset(coleiter, FALSE);
- // sets the text iterator here with the correct expansion and offset
- int32_t leftoverces = getExpansionPrefix(coleiter);
- cleanUpSafeText(strsrch, safetext, safebuffer);
- if (result >= safelength) {
- result = textoffset;
- }
- else {
- result += safeoffset;
- }
- setColEIterOffset(strsrch->textIter, result);
- strsrch->textIter->iteratordata_.toReturn =
- setExpansionPrefix(strsrch->textIter, leftoverces);
- return result;
- }
-
- return ucol_getOffset(coleiter);
-}
-
-/**
-* Trying out the substring and sees if it can be a canonical match.
-* This will try normalizing the end accents and arranging them into canonical
-* equivalents and check their corresponding ces with the pattern ce.
-* Suffix accents in the text will be grouped according to their combining
-* class and the groups will be mixed and matched to try find the perfect
-* match with the pattern.
-* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
-* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
-* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
-* "\u0301\u0325".
-* step 2: check if any of the generated substrings matches the pattern.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param textoffset end offset in the collation element text that ends with
-* the accents to be rearranged
-* @param status error status if any
-* @return TRUE if the match is valid, FALSE otherwise
-*/
-static
-UBool doNextCanonicalMatch(UStringSearch *strsrch,
- int32_t textoffset,
- UErrorCode *status)
-{
- const UChar *text = strsrch->search->text;
- int32_t temp = textoffset;
- U16_BACK_1(text, 0, temp);
- if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
- UCollationElements *coleiter = strsrch->textIter;
- int32_t offset = getColElemIterOffset(coleiter, FALSE);
- if (strsrch->pattern.hasPrefixAccents) {
- offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
- status);
- if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
- setColEIterOffset(coleiter, offset);
- return TRUE;
- }
- }
- return FALSE;
- }
-
- if (!strsrch->pattern.hasSuffixAccents) {
- return FALSE;
- }
-
- UChar accents[INITIAL_ARRAY_SIZE_];
- // offset to the last base character in substring to search
- int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
- // normalizing the offensive string
- unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
- 0, accents, INITIAL_ARRAY_SIZE_, status);
- // status checked in loop below
-
- int32_t accentsindex[INITIAL_ARRAY_SIZE_];
- int32_t size = getUnblockedAccentIndex(accents, accentsindex);
-
- // 2 power n - 1 plus the full set of accents
- int32_t count = (2 << (size - 1)) - 1;
- while (U_SUCCESS(*status) && count > 0) {
- UChar *rearrange = strsrch->canonicalSuffixAccents;
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- *rearrange ++ = accents[k];
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= size - 1; i ++) {
- int32_t mask = 1 << (size - i - 1);
- if (count & mask) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
- *rearrange ++ = accents[j];
- }
- }
- }
- *rearrange = 0;
- int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
- status);
- if (offset != USEARCH_DONE) {
- return TRUE; // match found
- }
- count --;
- }
- return FALSE;
-}
-
-/**
-* Gets the previous base character offset depending on the string search
-* pattern data
-* @param strsrch string search data
-* @param textoffset current offset, current character
-* @return the offset of the next character after this base character or itself
-* if it is a composed character with accents
-*/
-static
-inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
- int32_t textoffset)
-{
- if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
- const UChar *text = strsrch->search->text;
- int32_t offset = textoffset;
- if (getFCD(text, &offset, strsrch->search->textLength) >>
- SECOND_LAST_BYTE_SHIFT_) {
- return getPreviousBaseOffset(text, textoffset);
- }
- }
- return textoffset;
-}
-
-/**
-* Checks match for contraction.
-* If the match ends with a partial contraction we fail.
-* If the match starts too far off (because of backwards iteration) we try to
-* chip off the extra characters
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param start offset of potential match, to be modified if necessary
-* @param end offset of potential match, to be modified if necessary
-* @param status output error status if any
-* @return TRUE if match passes the contraction test, FALSE otherwise
-*/
-static
-UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
- int32_t *start,
- int32_t *end,
- UErrorCode *status)
-{
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t temp = *start;
- const UCollator *collator = strsrch->collator;
- const UChar *text = strsrch->search->text;
- // This part checks if either ends of the match contains potential
- // contraction. If so we'll have to iterate through them
- if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
- (*start + 1 < textlength
- && ucol_unsafeCP(text[*start + 1], collator))) {
- int32_t expansion = getExpansionPrefix(coleiter);
- UBool expandflag = expansion > 0;
- setColEIterOffset(coleiter, *start);
- while (expansion > 0) {
- // getting rid of the redundant ce, caused by setOffset.
- // since backward contraction/expansion may have extra ces if we
- // are in the normalization buffer, hasAccentsBeforeMatch would
- // have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but if
- // we are only looking for acute and ring \u030A and \u0301, we'll
- // have to skip the first ce in the expansion buffer.
- ucol_next(coleiter, status);
- if (U_FAILURE(*status)) {
- return FALSE;
- }
- if (ucol_getOffset(coleiter) != temp) {
- *start = temp;
- temp = ucol_getOffset(coleiter);
- }
- expansion --;
- }
-
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t count = 0;
- int32_t textlength = strsrch->search->textLength;
- while (count < patterncelength) {
- int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
- // status checked below, note that if status is a failure
- // ucol_next returns UCOL_NULLORDER
- if (ce == UCOL_IGNORABLE) {
- continue;
- }
- if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
- *start = temp;
- temp = ucol_getOffset(coleiter);
- }
-
- if (count == 0 && ce != patternce[0]) {
- // accents may have extra starting ces, this occurs when a
- // pure accent pattern is matched without rearrangement
- // text \u0325\u0300 and looking for \u0300
- int32_t expected = patternce[0];
- if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
- ce = getCE(strsrch, ucol_next(coleiter, status));
- while (U_SUCCESS(*status) && ce != expected &&
- ce != UCOL_NULLORDER &&
- ucol_getOffset(coleiter) <= *end) {
- ce = getCE(strsrch, ucol_next(coleiter, status));
- }
- }
- }
- if (U_FAILURE(*status) || ce != patternce[count]) {
- (*end) ++;
- *end = getNextUStringSearchBaseOffset(strsrch, *end);
- return FALSE;
- }
- count ++;
- }
- }
- return TRUE;
-}
-
-/**
-* Checks and sets the match information if found.
-* Checks
-*
-* - the potential match does not repeat the previous match
-*
- boundaries are correct
-*
- potential match does not end in the middle of a contraction
-*
- identical matches
-* <\ul>
-* Otherwise the offset will be shifted to the next character.
-* Internal method, status assumed to be success, caller has to check the
-* status before calling this method.
-* @param strsrch string search data
-* @param textoffset offset in the collation element text. the returned value
-* will be the truncated end offset of the match or the new start
-* search offset.
-* @param status output error status if any
-* @return TRUE if the match is valid, FALSE otherwise
-*/
-static
-inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
- int32_t *textoffset,
- UErrorCode *status)
-{
- // to ensure that the start and ends are not composite characters
- UCollationElements *coleiter = strsrch->textIter;
- // if we have a canonical accent match
- if ((strsrch->pattern.hasSuffixAccents &&
- strsrch->canonicalSuffixAccents[0]) ||
- (strsrch->pattern.hasPrefixAccents &&
- strsrch->canonicalPrefixAccents[0])) {
- strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
- strsrch,
- ucol_getOffset(coleiter));
- strsrch->search->matchedLength = *textoffset -
- strsrch->search->matchedIndex;
- return TRUE;
- }
-
- int32_t start = getColElemIterOffset(coleiter, FALSE);
- if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
- status) || U_FAILURE(*status)) {
- return FALSE;
- }
-
- start = getPreviousUStringSearchBaseOffset(strsrch, start);
- // this totally matches, however we need to check if it is repeating
- if (checkRepeatedMatch(strsrch, start, *textoffset) ||
- !isBreakUnit(strsrch, start, *textoffset) ||
- !checkIdentical(strsrch, start, *textoffset)) {
- (*textoffset) ++;
- *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
- strsrch->search->textLength);
- return FALSE;
- }
-
- strsrch->search->matchedIndex = start;
- strsrch->search->matchedLength = *textoffset - start;
- return TRUE;
-}
-
-/**
-* Shifting the collation element iterator position forward to prepare for
-* a preceding match. If the first character is a unsafe character, we'll only
-* shift by 1 to capture contractions, normalization etc.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param text strsrch string search data
-* @param textoffset start text position to do search
-* @param ce the text ce which failed the match.
-* @param patternceindex index of the ce within the pattern ce buffer which
-* failed the match
-* @return final offset
-*/
-static
-inline int32_t reverseShift(UStringSearch *strsrch,
- int32_t textoffset,
- int32_t ce,
- int32_t patternceindex)
-{
- if (strsrch->search->isOverlap) {
- if (textoffset != strsrch->search->textLength) {
- textoffset --;
- }
- else {
- textoffset -= strsrch->pattern.defaultShiftSize;
- }
- }
- else {
- if (ce != UCOL_NULLORDER) {
- int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
-
- // this is to adjust for characters in the middle of the substring
- // for matching that failed.
- int32_t adjust = patternceindex;
- if (adjust > 1 && shift > adjust) {
- shift -= adjust - 1;
- }
- textoffset -= shift;
- }
- else {
- textoffset -= strsrch->pattern.defaultShiftSize;
- }
- }
- textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
- return textoffset;
-}
-
-/**
-* Checks match for contraction.
-* If the match starts with a partial contraction we fail.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param start offset of potential match, to be modified if necessary
-* @param end offset of potential match, to be modified if necessary
-* @param status output error status if any
-* @return TRUE if match passes the contraction test, FALSE otherwise
-*/
-static
-UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
- int32_t *start,
- int32_t *end, UErrorCode *status)
-{
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t temp = *end;
- const UCollator *collator = strsrch->collator;
- const UChar *text = strsrch->search->text;
- // This part checks if either if the start of the match contains potential
- // contraction. If so we'll have to iterate through them
- // Since we used ucol_next while previously looking for the potential
- // match, this guarantees that our end will not be a partial contraction,
- // or a partial supplementary character.
- if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
- int32_t expansion = getExpansionSuffix(coleiter);
- UBool expandflag = expansion > 0;
- setColEIterOffset(coleiter, *end);
- while (U_SUCCESS(*status) && expansion > 0) {
- // getting rid of the redundant ce
- // since forward contraction/expansion may have extra ces
- // if we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but if
- // we are only looking for A ring A\u030A, we'll have to skip the
- // last ce in the expansion buffer
- ucol_previous(coleiter, status);
- if (U_FAILURE(*status)) {
- return FALSE;
- }
- if (ucol_getOffset(coleiter) != temp) {
- *end = temp;
- temp = ucol_getOffset(coleiter);
- }
- expansion --;
- }
-
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t count = patterncelength;
- while (count > 0) {
- int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
- // status checked below, note that if status is a failure
- // ucol_previous returns UCOL_NULLORDER
- if (ce == UCOL_IGNORABLE) {
- continue;
- }
- if (expandflag && count == 0 &&
- getColElemIterOffset(coleiter, FALSE) != temp) {
- *end = temp;
- temp = ucol_getOffset(coleiter);
- }
- if (U_FAILURE(*status) || ce != patternce[count - 1]) {
- (*start) --;
- *start = getPreviousBaseOffset(text, *start);
- return FALSE;
- }
- count --;
- }
- }
- return TRUE;
-}
-
-/**
-* Checks and sets the match information if found.
-* Checks
-*
-* - the current match does not repeat the last match
-*
- boundaries are correct
-*
- exact matches has no extra accents
-*
- identical matches
-* <\ul>
-* Otherwise the offset will be shifted to the preceding character.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param collator
-* @param coleiter collation element iterator
-* @param text string
-* @param textoffset offset in the collation element text. the returned value
-* will be the truncated start offset of the match or the new start
-* search offset.
-* @param status output error status if any
-* @return TRUE if the match is valid, FALSE otherwise
-*/
-static
-inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
- int32_t *textoffset,
- UErrorCode *status)
-{
- // to ensure that the start and ends are not composite characters
- int32_t end = ucol_getOffset(strsrch->textIter);
- if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
- || U_FAILURE(*status)) {
- return FALSE;
- }
-
- // this totally matches, however we need to check if it is repeating
- // the old match
- if (checkRepeatedMatch(strsrch, *textoffset, end) ||
- !isBreakUnit(strsrch, *textoffset, end) ||
- hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
- !checkIdentical(strsrch, *textoffset, end) ||
- hasAccentsAfterMatch(strsrch, *textoffset, end)) {
- (*textoffset) --;
- *textoffset = getPreviousBaseOffset(strsrch->search->text,
- *textoffset);
- return FALSE;
- }
-
- //Add breakiterator boundary check for primary strength search.
- if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
- checkBreakBoundary(strsrch, textoffset, &end);
- }
-
- strsrch->search->matchedIndex = *textoffset;
- strsrch->search->matchedLength = end - *textoffset;
- return TRUE;
-}
-
-/**
-* Rearranges the end accents to try matching.
-* Suffix accents in the text will be grouped according to their combining
-* class and the groups will be mixed and matched to try find the perfect
-* match with the pattern.
-* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
-* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
-* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
-* "\u0301\u0325".
-* step 2: check if any of the generated substrings matches the pattern.
-* Internal method, status assumed to be success, user has to check status
-* before calling this method.
-* @param strsrch string search match
-* @param start offset of the first base character
-* @param end start of the last accent set
-* @param status only error status if any
-* @return USEARCH_DONE if a match is not found, otherwise return the ending
-* offset of the match. Note this start includes all following accents.
-*/
-static
-int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
- int32_t start,
- int32_t end,
- UErrorCode *status)
-{
- const UChar *text = strsrch->search->text;
- int32_t tempend = end;
-
- U16_BACK_1(text, 0, tempend);
- if (!(getFCD(text, &tempend, strsrch->search->textLength) &
- LAST_BYTE_MASK_)) {
- // die... failed at a base character
- return USEARCH_DONE;
- }
- end = getNextBaseOffset(text, end, strsrch->search->textLength);
-
- if (U_SUCCESS(*status)) {
- UChar accents[INITIAL_ARRAY_SIZE_];
- int32_t offset = getPreviousBaseOffset(text, end);
- // normalizing the offensive string
- unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
- INITIAL_ARRAY_SIZE_, status);
-
- int32_t accentsindex[INITIAL_ARRAY_SIZE_];
- int32_t accentsize = getUnblockedAccentIndex(accents,
- accentsindex);
- int32_t count = (2 << (accentsize - 1)) - 1;
- UChar buffer[INITIAL_ARRAY_SIZE_];
- UCollationElements *coleiter = strsrch->utilIter;
- while (U_SUCCESS(*status) && count > 0) {
- UChar *rearrange = strsrch->canonicalSuffixAccents;
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- *rearrange ++ = accents[k];
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= accentsize - 1; i ++) {
- int32_t mask = 1 << (accentsize - i - 1);
- if (count & mask) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
- *rearrange ++ = accents[j];
- }
- }
- }
- *rearrange = 0;
- int32_t matchsize = INITIAL_ARRAY_SIZE_;
- UChar *match = addToUCharArray(buffer, &matchsize,
- strsrch->canonicalPrefixAccents,
- strsrch->search->text + start,
- offset - start,
- strsrch->canonicalSuffixAccents,
- status);
-
- // run the collator iterator through this match
- // if status is a failure ucol_setText does nothing
- ucol_setText(coleiter, match, matchsize, status);
- if (U_SUCCESS(*status)) {
- if (checkCollationMatch(strsrch, coleiter)) {
- if (match != buffer) {
- uprv_free(match);
- }
- return end;
- }
- }
- count --;
- }
- }
- return USEARCH_DONE;
-}
-
-/**
-* Take the rearranged start accents and tries matching. If match failed at
-* a separate following set of accents (separated from the rearranged on by
-* at least a base character) then we rearrange the preceding accents and
-* tries matching again.
-* We allow skipping of the ends of the accent set if the ces do not match.
-* However if the failure is found before the accent set, it fails.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param textoffset of the ends of the rearranged accent
-* @param status output error status if any
-* @return USEARCH_DONE if a match is not found, otherwise return the ending
-* offset of the match. Note this start includes all following accents.
-*/
-static
-int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
- int32_t textoffset,
- UErrorCode *status)
-{
- const UChar *text = strsrch->search->text;
- const UCollator *collator = strsrch->collator;
- int32_t safelength = 0;
- UChar *safetext;
- int32_t safetextlength;
- UChar safebuffer[INITIAL_ARRAY_SIZE_];
- int32_t safeoffset = textoffset;
-
- if (textoffset &&
- ucol_unsafeCP(strsrch->canonicalPrefixAccents[
- u_strlen(strsrch->canonicalPrefixAccents) - 1
- ], collator)) {
- safeoffset = getNextSafeOffset(collator, text, textoffset,
- strsrch->search->textLength);
- safelength = safeoffset - textoffset;
- safetextlength = INITIAL_ARRAY_SIZE_;
- safetext = addToUCharArray(safebuffer, &safetextlength,
- strsrch->canonicalPrefixAccents,
- text + textoffset, safelength,
- NULL, status);
- }
- else {
- safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
- safetext = strsrch->canonicalPrefixAccents;
- }
-
- UCollationElements *coleiter = strsrch->utilIter;
- // if status is a failure, ucol_setText does nothing
- ucol_setText(coleiter, safetext, safetextlength, status);
- // status checked in loop below
-
- int32_t *ce = strsrch->pattern.ces;
- int32_t celength = strsrch->pattern.cesLength;
- int ceindex = 0;
- UBool isSafe = TRUE; // safe zone indication flag for position
- int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
-
- while (ceindex < celength) {
- int32_t textce = ucol_next(coleiter, status);
- if (U_FAILURE(*status)) {
- if (isSafe) {
- cleanUpSafeText(strsrch, safetext, safebuffer);
- }
- return USEARCH_DONE;
- }
- if (textce == UCOL_NULLORDER) {
- // check if we have passed the safe buffer
- if (coleiter == strsrch->textIter) {
- cleanUpSafeText(strsrch, safetext, safebuffer);
- return USEARCH_DONE;
- }
- cleanUpSafeText(strsrch, safetext, safebuffer);
- safetext = safebuffer;
- coleiter = strsrch->textIter;
- setColEIterOffset(coleiter, safeoffset);
- // status checked at the start of the loop
- isSafe = FALSE;
- continue;
- }
- textce = getCE(strsrch, textce);
- if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
- // do the beginning stuff
- int32_t failedoffset = ucol_getOffset(coleiter);
- if (isSafe && failedoffset <= prefixlength) {
- // alas... no hope. failed at rearranged accent set
- cleanUpSafeText(strsrch, safetext, safebuffer);
- return USEARCH_DONE;
- }
- else {
- if (isSafe) {
- failedoffset = safeoffset - failedoffset;
- cleanUpSafeText(strsrch, safetext, safebuffer);
- }
+inline void initialize(UStringSearch *strsrch, UErrorCode *status)
+{
+ int16_t expandlength = initializePattern(strsrch, status);
+ if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
+ UPattern *pattern = &strsrch->pattern;
+ int32_t cesize = pattern->cesLength;
- // try rearranging the end accents
- int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
- textoffset, failedoffset, status);
- if (result != USEARCH_DONE) {
- // if status is a failure, ucol_setOffset does nothing
- setColEIterOffset(strsrch->textIter, result);
- }
- if (U_FAILURE(*status)) {
- return USEARCH_DONE;
- }
- return result;
- }
- }
- if (textce == ce[ceindex]) {
- ceindex ++;
- }
- }
- // set offset here
- if (isSafe) {
- int32_t result = ucol_getOffset(coleiter);
- // sets the text iterator here with the correct expansion and offset
- int32_t leftoverces = getExpansionSuffix(coleiter);
- cleanUpSafeText(strsrch, safetext, safebuffer);
- if (result <= prefixlength) {
- result = textoffset;
- }
- else {
- result = textoffset + (safeoffset - result);
- }
- setColEIterOffset(strsrch->textIter, result);
- setExpansionSuffix(strsrch->textIter, leftoverces);
- return result;
+ int16_t minlength = cesize > expandlength
+ ? (int16_t)cesize - expandlength : 1;
+ pattern->defaultShiftSize = minlength;
+ setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
+ cesize, expandlength, minlength, minlength);
+ return;
}
-
- return ucol_getOffset(coleiter);
+ strsrch->pattern.defaultShiftSize = 0;
}
/**
-* Trying out the substring and sees if it can be a canonical match.
-* This will try normalizing the starting accents and arranging them into
-* canonical equivalents and check their corresponding ces with the pattern ce.
-* Prefix accents in the text will be grouped according to their combining
-* class and the groups will be mixed and matched to try find the perfect
-* match with the pattern.
-* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
-* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
-* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
-* "\u0301\u0325".
-* step 2: check if any of the generated substrings matches the pattern.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
+* sets match not found
* @param strsrch string search data
-* @param textoffset start offset in the collation element text that starts
-* with the accents to be rearranged
-* @param status output error status if any
-* @return TRUE if the match is valid, FALSE otherwise
*/
static
-UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
- int32_t textoffset,
- UErrorCode *status)
+inline void setMatchNotFound(UStringSearch *strsrch)
{
- const UChar *text = strsrch->search->text;
- int32_t temp = textoffset;
- int32_t textlength = strsrch->search->textLength;
- if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
- UCollationElements *coleiter = strsrch->textIter;
- int32_t offset = ucol_getOffset(coleiter);
- if (strsrch->pattern.hasSuffixAccents) {
- offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
- offset, status);
- if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
- setColEIterOffset(coleiter, offset);
- return TRUE;
- }
- }
- return FALSE;
- }
-
- if (!strsrch->pattern.hasPrefixAccents) {
- return FALSE;
+ // this method resets the match result regardless of the error status.
+ strsrch->search->matchedIndex = USEARCH_DONE;
+ strsrch->search->matchedLength = 0;
+ if (strsrch->search->isForwardSearching) {
+ setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
}
-
- UChar accents[INITIAL_ARRAY_SIZE_];
- // offset to the last base character in substring to search
- int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
- // normalizing the offensive string
- unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
- 0, accents, INITIAL_ARRAY_SIZE_, status);
- // status checked in loop
-
- int32_t accentsindex[INITIAL_ARRAY_SIZE_];
- int32_t size = getUnblockedAccentIndex(accents, accentsindex);
-
- // 2 power n - 1 plus the full set of accents
- int32_t count = (2 << (size - 1)) - 1;
- while (U_SUCCESS(*status) && count > 0) {
- UChar *rearrange = strsrch->canonicalPrefixAccents;
- // copy the base characters
- for (int k = 0; k < accentsindex[0]; k ++) {
- *rearrange ++ = accents[k];
- }
- // forming all possible canonical rearrangement by dropping
- // sets of accents
- for (int i = 0; i <= size - 1; i ++) {
- int32_t mask = 1 << (size - i - 1);
- if (count & mask) {
- for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
- *rearrange ++ = accents[j];
- }
- }
- }
- *rearrange = 0;
- int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
- baseoffset, status);
- if (offset != USEARCH_DONE) {
- return TRUE; // match found
- }
- count --;
+ else {
+ setColEIterOffset(strsrch->textIter, 0);
}
- return FALSE;
}
/**
-* Checks match for contraction.
-* If the match starts with a partial contraction we fail.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
-* @param strsrch string search data
-* @param start offset of potential match, to be modified if necessary
-* @param end offset of potential match, to be modified if necessary
-* @param status only error status if any
-* @return TRUE if match passes the contraction test, FALSE otherwise
+* Checks if the offset runs out of the text string
+* @param offset
+* @param textlength of the text string
+* @return TRUE if offset is out of bounds, FALSE otherwise
*/
static
-UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
- int32_t *start,
- int32_t *end, UErrorCode *status)
+inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
{
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t temp = *end;
- const UCollator *collator = strsrch->collator;
- const UChar *text = strsrch->search->text;
- // This part checks if either if the start of the match contains potential
- // contraction. If so we'll have to iterate through them
- // Since we used ucol_next while previously looking for the potential
- // match, this guarantees that our end will not be a partial contraction,
- // or a partial supplementary character.
- if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
- int32_t expansion = getExpansionSuffix(coleiter);
- UBool expandflag = expansion > 0;
- setColEIterOffset(coleiter, *end);
- while (expansion > 0) {
- // getting rid of the redundant ce
- // since forward contraction/expansion may have extra ces
- // if we are in the normalization buffer, hasAccentsBeforeMatch
- // would have taken care of it.
- // E.g. the character \u01FA will have an expansion of 3, but if
- // we are only looking for A ring A\u030A, we'll have to skip the
- // last ce in the expansion buffer
- ucol_previous(coleiter, status);
- if (U_FAILURE(*status)) {
- return FALSE;
- }
- if (ucol_getOffset(coleiter) != temp) {
- *end = temp;
- temp = ucol_getOffset(coleiter);
- }
- expansion --;
- }
-
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t count = patterncelength;
- while (count > 0) {
- int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
- // status checked below, note that if status is a failure
- // ucol_previous returns UCOL_NULLORDER
- if (ce == UCOL_IGNORABLE) {
- continue;
- }
- if (expandflag && count == 0 &&
- getColElemIterOffset(coleiter, FALSE) != temp) {
- *end = temp;
- temp = ucol_getOffset(coleiter);
- }
- if (count == patterncelength &&
- ce != patternce[patterncelength - 1]) {
- // accents may have extra starting ces, this occurs when a
- // pure accent pattern is matched without rearrangement
- int32_t expected = patternce[patterncelength - 1];
- U16_BACK_1(text, 0, *end);
- if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
- ce = getCE(strsrch, ucol_previous(coleiter, status));
- while (U_SUCCESS(*status) && ce != expected &&
- ce != UCOL_NULLORDER &&
- ucol_getOffset(coleiter) <= *start) {
- ce = getCE(strsrch, ucol_previous(coleiter, status));
- }
- }
- }
- if (U_FAILURE(*status) || ce != patternce[count - 1]) {
- (*start) --;
- *start = getPreviousBaseOffset(text, *start);
- return FALSE;
- }
- count --;
- }
- }
- return TRUE;
+ return offset < 0 || offset > textlength;
}
/**
-* Checks and sets the match information if found.
-* Checks
-*
-* - the potential match does not repeat the previous match
-*
- boundaries are correct
-*
- potential match does not end in the middle of a contraction
-*
- identical matches
-* <\ul>
-* Otherwise the offset will be shifted to the next character.
-* Internal method, status assumed to be success, caller has to check status
-* before calling this method.
+* Checks for identical match
* @param strsrch string search data
-* @param textoffset offset in the collation element text. the returned value
-* will be the truncated start offset of the match or the new start
-* search offset.
-* @param status only error status if any
-* @return TRUE if the match is valid, FALSE otherwise
+* @param start offset of possible match
+* @param end offset of possible match
+* @return TRUE if identical match is found
*/
static
-inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
- int32_t *textoffset,
- UErrorCode *status)
+inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
+ int32_t end)
{
- // to ensure that the start and ends are not composite characters
- UCollationElements *coleiter = strsrch->textIter;
- // if we have a canonical accent match
- if ((strsrch->pattern.hasSuffixAccents &&
- strsrch->canonicalSuffixAccents[0]) ||
- (strsrch->pattern.hasPrefixAccents &&
- strsrch->canonicalPrefixAccents[0])) {
- strsrch->search->matchedIndex = *textoffset;
- strsrch->search->matchedLength =
- getNextUStringSearchBaseOffset(strsrch,
- getColElemIterOffset(coleiter, FALSE))
- - *textoffset;
+ if (strsrch->strength != UCOL_IDENTICAL) {
return TRUE;
}
- int32_t end = ucol_getOffset(coleiter);
- if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
- status) ||
- U_FAILURE(*status)) {
- return FALSE;
- }
-
- end = getNextUStringSearchBaseOffset(strsrch, end);
- // this totally matches, however we need to check if it is repeating
- if (checkRepeatedMatch(strsrch, *textoffset, end) ||
- !isBreakUnit(strsrch, *textoffset, end) ||
- !checkIdentical(strsrch, *textoffset, end)) {
- (*textoffset) --;
- *textoffset = getPreviousBaseOffset(strsrch->search->text,
- *textoffset);
- return FALSE;
- }
-
- strsrch->search->matchedIndex = *textoffset;
- strsrch->search->matchedLength = end - *textoffset;
- return TRUE;
+ // Note: We could use Normalizer::compare() or similar, but for short strings
+ // which may not be in FCD it might be faster to just NFD them.
+ UErrorCode status = U_ZERO_ERROR;
+ UnicodeString t2, p2;
+ strsrch->nfd->normalize(
+ UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
+ strsrch->nfd->normalize(
+ UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
+ // return FALSE if NFD failed
+ return U_SUCCESS(status) && t2 == p2;
}
-#endif // #if BOYER_MOORE
// constructors and destructor -------------------------------------------
@@ -3198,17 +1225,6 @@ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
search->reset = FALSE;
int32_t textlength = search->textLength;
if (search->isForwardSearching) {
-#if BOYER_MOORE
- if (offset == textlength
- || (!search->isOverlap &&
- (offset + strsrch->pattern.defaultShiftSize > textlength ||
- (search->matchedIndex != USEARCH_DONE &&
- offset + search->matchedLength >= textlength)))) {
- // not enough characters to match
- setMatchNotFound(strsrch);
- return USEARCH_DONE;
- }
-#else
if (offset == textlength ||
(! search->isOverlap &&
(search->matchedIndex != USEARCH_DONE &&
@@ -3217,7 +1233,6 @@ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
setMatchNotFound(strsrch);
return USEARCH_DONE;
}
-#endif
}
else {
// switching direction.
@@ -3281,13 +1296,11 @@ U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
return USEARCH_DONE;
}
-#if !BOYER_MOORE
if (search->matchedIndex == USEARCH_DONE) {
ucol_setOffset(strsrch->textIter, search->textLength, status);
} else {
ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
}
-#endif
return search->matchedIndex;
}
@@ -3324,24 +1337,13 @@ U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
}
}
else {
-#if BOYER_MOORE
- if (offset == 0 || matchedindex == 0 ||
- (!search->isOverlap &&
- (offset < strsrch->pattern.defaultShiftSize ||
- (matchedindex != USEARCH_DONE &&
- matchedindex < strsrch->pattern.defaultShiftSize)))) {
- // not enough characters to match
- setMatchNotFound(strsrch);
- return USEARCH_DONE;
- }
-#else
+
// Could check pattern length, but the
// linear search will do the right thing
if (offset == 0 || matchedindex == 0) {
setMatchNotFound(strsrch);
return USEARCH_DONE;
}
-#endif
}
if (U_SUCCESS(*status)) {
@@ -3813,10 +1815,6 @@ static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t com
return U_CE_MATCH;
}
-#if BOYER_MOORE
-// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
-#endif
-
namespace {
UChar32 codePointAt(const USearch &search, int32_t index) {
@@ -4431,98 +2429,6 @@ UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
return FALSE;
}
-#if BOYER_MOORE
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t textoffset = ucol_getOffset(coleiter);
-
- // status used in setting coleiter offset, since offset is checked in
- // shiftForward before setting the coleiter offset, status never
- // a failure
- textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
- patterncelength);
- while (textoffset <= textlength)
- {
- uint32_t patternceindex = patterncelength - 1;
- int32_t targetce;
- UBool found = FALSE;
- int32_t lastce = UCOL_NULLORDER;
-
- setColEIterOffset(coleiter, textoffset);
-
- for (;;) {
- // finding the last pattern ce match, imagine composite characters
- // for example: search for pattern A in text \u00C0
- // we'll have to skip \u0300 the grave first before we get to A
- targetce = ucol_previous(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
- // this is for the text \u0315\u0300 that requires
- // normalization and pattern \u0300, where \u0315 is ignorable
- continue;
- }
- if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
- lastce = targetce;
- }
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- if (targetce == patternce[patternceindex]) {
- // the first ce can be a contraction
- found = TRUE;
- break;
- }
- if (!hasExpansion(coleiter)) {
- found = FALSE;
- break;
- }
- }
-
- //targetce = lastce;
-
- while (found && patternceindex > 0) {
- lastce = targetce;
- targetce = ucol_previous(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (targetce == UCOL_IGNORABLE) {
- continue;
- }
-
- patternceindex --;
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- found = found && targetce == patternce[patternceindex];
- }
-
- targetce = lastce;
-
- if (!found) {
- if (U_FAILURE(*status)) {
- break;
- }
- textoffset = shiftForward(strsrch, textoffset, lastce,
- patternceindex);
- // status checked at loop.
- patternceindex = patterncelength;
- continue;
- }
-
- if (checkNextExactMatch(strsrch, &textoffset, status)) {
- // status checked in ucol_setOffset
- setColEIterOffset(coleiter, strsrch->search->matchedIndex);
- return TRUE;
- }
- }
- setMatchNotFound(strsrch);
- return FALSE;
-#else
int32_t textOffset = ucol_getOffset(strsrch->textIter);
int32_t start = -1;
int32_t end = -1;
@@ -4535,7 +2441,6 @@ UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
setMatchNotFound(strsrch);
return FALSE;
}
-#endif
}
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
@@ -4545,99 +2450,6 @@ UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
return FALSE;
}
-#if BOYER_MOORE
- UCollationElements *coleiter = strsrch->textIter;
- int32_t textlength = strsrch->search->textLength;
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t textoffset = ucol_getOffset(coleiter);
- UBool hasPatternAccents =
- strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
-
- textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
- patterncelength);
- strsrch->canonicalPrefixAccents[0] = 0;
- strsrch->canonicalSuffixAccents[0] = 0;
-
- while (textoffset <= textlength)
- {
- int32_t patternceindex = patterncelength - 1;
- int32_t targetce;
- UBool found = FALSE;
- int32_t lastce = UCOL_NULLORDER;
-
- setColEIterOffset(coleiter, textoffset);
-
- for (;;) {
- // finding the last pattern ce match, imagine composite characters
- // for example: search for pattern A in text \u00C0
- // we'll have to skip \u0300 the grave first before we get to A
- targetce = ucol_previous(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
- lastce = targetce;
- }
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- if (targetce == patternce[patternceindex]) {
- // the first ce can be a contraction
- found = TRUE;
- break;
- }
- if (!hasExpansion(coleiter)) {
- found = FALSE;
- break;
- }
- }
-
- while (found && patternceindex > 0) {
- targetce = ucol_previous(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (targetce == UCOL_IGNORABLE) {
- continue;
- }
-
- patternceindex --;
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- found = found && targetce == patternce[patternceindex];
- }
-
- // initializing the rearranged accent array
- if (hasPatternAccents && !found) {
- strsrch->canonicalPrefixAccents[0] = 0;
- strsrch->canonicalSuffixAccents[0] = 0;
- if (U_FAILURE(*status)) {
- break;
- }
- found = doNextCanonicalMatch(strsrch, textoffset, status);
- }
-
- if (!found) {
- if (U_FAILURE(*status)) {
- break;
- }
- textoffset = shiftForward(strsrch, textoffset, lastce,
- patternceindex);
- // status checked at loop
- patternceindex = patterncelength;
- continue;
- }
-
- if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
- setColEIterOffset(coleiter, strsrch->search->matchedIndex);
- return TRUE;
- }
- }
- setMatchNotFound(strsrch);
- return FALSE;
-#else
int32_t textOffset = ucol_getOffset(strsrch->textIter);
int32_t start = -1;
int32_t end = -1;
@@ -4650,7 +2462,6 @@ UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
setMatchNotFound(strsrch);
return FALSE;
}
-#endif
}
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
@@ -4660,101 +2471,6 @@ UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
return FALSE;
}
-#if BOYER_MOORE
- UCollationElements *coleiter = strsrch->textIter;
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t textoffset = ucol_getOffset(coleiter);
-
- // shifting it check for setting offset
- // if setOffset is called previously or there was no previous match, we
- // leave the offset as it is.
- if (strsrch->search->matchedIndex != USEARCH_DONE) {
- textoffset = strsrch->search->matchedIndex;
- }
-
- textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
- patterncelength);
-
- while (textoffset >= 0)
- {
- int32_t patternceindex = 1;
- int32_t targetce;
- UBool found = FALSE;
- int32_t firstce = UCOL_NULLORDER;
-
- // if status is a failure, ucol_setOffset does nothing
- setColEIterOffset(coleiter, textoffset);
-
- for (;;) {
- // finding the first pattern ce match, imagine composite
- // characters. for example: search for pattern \u0300 in text
- // \u00C0, we'll have to skip A first before we get to
- // \u0300 the grave accent
- targetce = ucol_next(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
- firstce = targetce;
- }
- if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
- continue;
- }
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- if (targetce == patternce[0]) {
- found = TRUE;
- break;
- }
- if (!hasExpansion(coleiter)) {
- // checking for accents in composite character
- found = FALSE;
- break;
- }
- }
-
- //targetce = firstce;
-
- while (found && (patternceindex < patterncelength)) {
- firstce = targetce;
- targetce = ucol_next(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (targetce == UCOL_IGNORABLE) {
- continue;
- }
-
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- found = found && targetce == patternce[patternceindex];
- patternceindex ++;
- }
-
- targetce = firstce;
-
- if (!found) {
- if (U_FAILURE(*status)) {
- break;
- }
-
- textoffset = reverseShift(strsrch, textoffset, targetce,
- patternceindex);
- patternceindex = 0;
- continue;
- }
-
- if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
- setColEIterOffset(coleiter, textoffset);
- return TRUE;
- }
- }
- setMatchNotFound(strsrch);
- return FALSE;
-#else
int32_t textOffset;
if (strsrch->search->isOverlap) {
@@ -4795,7 +2511,6 @@ UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
setMatchNotFound(strsrch);
return FALSE;
}
-#endif
}
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
@@ -4806,108 +2521,6 @@ UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
return FALSE;
}
-#if BOYER_MOORE
- UCollationElements *coleiter = strsrch->textIter;
- int32_t *patternce = strsrch->pattern.ces;
- int32_t patterncelength = strsrch->pattern.cesLength;
- int32_t textoffset = ucol_getOffset(coleiter);
- UBool hasPatternAccents =
- strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
-
- // shifting it check for setting offset
- // if setOffset is called previously or there was no previous match, we
- // leave the offset as it is.
- if (strsrch->search->matchedIndex != USEARCH_DONE) {
- textoffset = strsrch->search->matchedIndex;
- }
-
- textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
- patterncelength);
- strsrch->canonicalPrefixAccents[0] = 0;
- strsrch->canonicalSuffixAccents[0] = 0;
-
- while (textoffset >= 0)
- {
- int32_t patternceindex = 1;
- int32_t targetce;
- UBool found = FALSE;
- int32_t firstce = UCOL_NULLORDER;
-
- setColEIterOffset(coleiter, textoffset);
- for (;;) {
- // finding the first pattern ce match, imagine composite
- // characters. for example: search for pattern \u0300 in text
- // \u00C0, we'll have to skip A first before we get to
- // \u0300 the grave accent
- targetce = ucol_next(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
- firstce = targetce;
- }
-
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- if (targetce == patternce[0]) {
- // the first ce can be a contraction
- found = TRUE;
- break;
- }
- if (!hasExpansion(coleiter)) {
- // checking for accents in composite character
- found = FALSE;
- break;
- }
- }
-
- targetce = firstce;
-
- while (found && patternceindex < patterncelength) {
- targetce = ucol_next(coleiter, status);
- if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
- found = FALSE;
- break;
- }
- targetce = getCE(strsrch, targetce);
- if (targetce == UCOL_IGNORABLE) {
- continue;
- }
-
- // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
- found = found && targetce == patternce[patternceindex];
- patternceindex ++;
- }
-
- // initializing the rearranged accent array
- if (hasPatternAccents && !found) {
- strsrch->canonicalPrefixAccents[0] = 0;
- strsrch->canonicalSuffixAccents[0] = 0;
- if (U_FAILURE(*status)) {
- break;
- }
- found = doPreviousCanonicalMatch(strsrch, textoffset, status);
- }
-
- if (!found) {
- if (U_FAILURE(*status)) {
- break;
- }
- textoffset = reverseShift(strsrch, textoffset, targetce,
- patternceindex);
- patternceindex = 0;
- continue;
- }
-
- if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
- setColEIterOffset(coleiter, textoffset);
- return TRUE;
- }
- }
- setMatchNotFound(strsrch);
- return FALSE;
-#else
int32_t textOffset;
if (strsrch->search->isOverlap) {
@@ -4948,7 +2561,6 @@ UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
setMatchNotFound(strsrch);
return FALSE;
}
-#endif
}
#endif /* #if !UCONFIG_NO_COLLATION */