Skip to content

Commit

Permalink
fix infinite loop in replace with AI collations (babelfish-for-postgr…
Browse files Browse the repository at this point in the history
…esql#2849) (babelfish-for-postgresql#2867)

ICU's usearch_next() goes into an infinite loop when the pattern to search for starts with a surrogate pair.
To get around this, we check whether the output of usearch_next() is stuck and not moving forward,
and if so we set the offset for the next search ourselves.
The next offset is simply the character that follows the current character in the source string.

SRC STRING - 'abc🙂defghi🙂🙂'    PATTERN TO FIND = '🙂def'

usearch_next() gets stuck on "🙂" (idx = 3) and repeatedly returns this index.
We intervene and set the offset to "d" (idx = 4),
so that usearch_next() only starts looking from that character.

Task: BABEL-5167

Signed-off-by: Tanzeel Khan <[email protected]>
  • Loading branch information
tanscorpio7 authored Aug 20, 2024
1 parent eecb8f2 commit 8607687
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 4 deletions.
65 changes: 61 additions & 4 deletions contrib/babelfishpg_tsql/src/collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@
#define MAX_BYTES_PER_CHAR 4
#define MAX_INPUT_LENGTH_TO_REMOVE_ACCENTS 250 * 1024 * 1024

/*
 * Check whether a UTF-16 code unit is a lead (high) surrogate, i.e. whether
 * it lies in the range 0xD800 - 0xDBFF and therefore opens a surrogate pair.
 *
 * Masking with 0xFC00 keeps the top six bits, which are 110110 for every
 * lead surrogate. Trail surrogates (0xDC00 - 0xDFFF) are deliberately NOT
 * matched. The argument is parenthesized so that expressions such as
 * `a | b` are evaluated as a whole before the mask is applied.
 */
#define UCHAR_IS_SURROGATE(c) (((c) & 0xFC00) == 0xD800)

/*
 * Number of UTF-16 code units occupied by the character that starts at the
 * given code unit: 2 when it is a lead surrogate, otherwise 1.
 */
#define UCHAR_LENGTH(c) (UCHAR_IS_SURROGATE(c) ? 2 : 1)

Oid server_collation_oid = InvalidOid;
collation_callbacks *collation_callbacks_ptr = NULL;
extern bool babelfish_dump_restore;
Expand Down Expand Up @@ -1475,12 +1484,13 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
int32_t src_len_utf8 = VARSIZE_ANY_EXHDR(src_text);
int32_t substr_len_utf8 = VARSIZE_ANY_EXHDR(substr_text);
int32_t src_ulen, substr_ulen;
int32_t u8_pos = -1;
int32_t u8_pos = -1, pos_prev_loop = -1;
UErrorCode status = U_ZERO_ERROR;
UStringSearch *usearch;
UChar *src_uchar, *substr_uchar;
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
bool is_CS_AI = false;
bool is_substr_starts_with_surrogate;

if (OidIsValid(coll_info_of_inputcollid.oid) &&
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
Expand All @@ -1491,6 +1501,8 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len_utf8);
substr_ulen = icu_to_uchar(&substr_uchar, VARDATA_ANY(substr_text), substr_len_utf8);

is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(substr_uchar[0]);

usearch = usearch_openFromCollator(substr_uchar,
substr_ulen,
src_uchar,
Expand All @@ -1507,7 +1519,7 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

for (int u16_pos = usearch_first(usearch, &status);
for (int32_t u16_pos = usearch_first(usearch, &status);
u16_pos != USEARCH_DONE;
u16_pos = usearch_next(usearch, &status))
{
Expand All @@ -1517,6 +1529,27 @@ pltsql_strpos_non_determinstic(text *src_text, text *substr_text, Oid collid, in
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

/* ICU bug, When pattern start with a surrogate pair ICU usearch_next stops moving forward entering an infinite loop */
if (u16_pos == pos_prev_loop)
{
int32_t next_char_idx = u16_pos + UCHAR_LENGTH(src_uchar[u16_pos]);

if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
{
usearch_setOffset(usearch, next_char_idx, &status);

if (U_FAILURE(status))
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));

continue;
}
else
break;
}

pos_prev_loop = u16_pos;

/* for CS_AI collations usearch can give false positives so we double check the results here */
if (!(is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), substr_uchar, substr_ulen, false) != 0))
{
Expand Down Expand Up @@ -1564,7 +1597,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
int32_t src_len = VARSIZE_ANY_EXHDR(src_text);
int32_t from_str_len = VARSIZE_ANY_EXHDR(from_text);
int32_t to_str_len = VARSIZE_ANY_EXHDR(to_text);
int32_t previous_pos;
int32_t previous_pos, pos_prev_loop = -1;
int32_t src_ulen, from_ulen; /* in utf-16 units */
UErrorCode status = U_ZERO_ERROR;
UStringSearch *usearch;
Expand All @@ -1573,6 +1606,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
StringInfoData resbuf;
coll_info_t coll_info_of_inputcollid = tsql_lookup_collation_table_internal(collid);
bool is_CS_AI = false;
bool is_substr_starts_with_surrogate;

if (OidIsValid(coll_info_of_inputcollid.oid) &&
coll_info_of_inputcollid.collateflags == 0x000e /* CS_AI */ )
Expand All @@ -1583,6 +1617,8 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
src_ulen = icu_to_uchar(&src_uchar, VARDATA_ANY(src_text), src_len);
from_ulen = icu_to_uchar(&from_uchar, VARDATA_ANY(from_text), from_str_len);

is_substr_starts_with_surrogate = UCHAR_IS_SURROGATE(from_uchar[0]);

usearch = usearch_openFromCollator(from_uchar, /* needle */
from_ulen,
src_uchar, /* haystack */
Expand All @@ -1596,7 +1632,7 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
initStringInfo(&resbuf);
previous_pos = 0;

for (int pos = usearch_first(usearch, &status);
for (int32_t pos = usearch_first(usearch, &status);
pos != USEARCH_DONE;
pos = usearch_next(usearch, &status))
{
Expand All @@ -1609,6 +1645,27 @@ pltsql_replace_non_determinstic(text *src_text, text *from_text, text *to_text,
errmsg("failed to perform ICU search: %s",
u_errorName(status))));

/* ICU bug, When pattern start with a surrogate pair ICU usearch_next stops moving forward entering an infinite loop */
if (pos == pos_prev_loop)
{
int32_t next_char_idx = pos + UCHAR_LENGTH(src_uchar[pos]);

if (is_substr_starts_with_surrogate && next_char_idx < src_ulen)
{
usearch_setOffset(usearch, next_char_idx, &status);

if (U_FAILURE(status))
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
errmsg("failed to set offset in ICU search: %s", u_errorName(status))));

continue;
}
else
break;
}

pos_prev_loop = pos;

/* for CS_AI collations usearch can give false positives so we double check the results here */
if (is_CS_AI && icu_compare_utf8_coll(mylocale->info.icu.ucol, &src_uchar[usearch_getMatchedStart(usearch)], usearch_getMatchedLength(usearch), from_uchar, from_ulen, false) != 0)
continue;
Expand Down
60 changes: 60 additions & 0 deletions test/JDBC/expected/charindex_and_replace_CIAI_collations.out
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,66 @@ AAAAAABBBBBBBEEEEEEAAAAA
DROP TABLE BABEL_4850_T
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT CHARINDEX(N'๐Ÿ™‚dEf', N'abc๐Ÿ™‚def๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚D', N'abc๐Ÿ™‚d๐Ÿ™‚d๐Ÿ™‚D' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚dEf', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
SELECT CHARINDEX(N'๐Ÿ™‚', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
GO
~~START~~
int
14
~~END~~

~~START~~
int
8
~~END~~

~~START~~
int
4
~~END~~

~~START~~
int
4
~~END~~

~~START~~
int
4
~~END~~


/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT REPLACE(N'abc๐Ÿ™‚defghi๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚def', N'jhi๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚defghi๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'๐Ÿ™‚abc๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
GO
~~START~~
nvarchar
abcjhi๐Ÿ™‚ghi๐Ÿ™‚๐Ÿ™‚
~~END~~

~~START~~
nvarchar
abc<----><----><----><----><---->defghi<----><---->
~~END~~

~~START~~
nvarchar
abc<----><----><----><---->
~~END~~

~~START~~
nvarchar
<---->abc<---->
~~END~~


-- psql
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);
Expand Down
15 changes: 15 additions & 0 deletions test/JDBC/input/charindex_and_replace_CIAI_collations.mix
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,21 @@ GO
DROP TABLE BABEL_4850_T
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT CHARINDEX(N'๐Ÿ™‚dEf', N'abc๐Ÿ™‚def๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚D', N'abc๐Ÿ™‚d๐Ÿ™‚d๐Ÿ™‚D' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚dEf', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
SELECT CHARINDEX(N'๐Ÿ™‚', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CS_AI)
SELECT CHARINDEX(N'๐Ÿ™‚', N'abc๐Ÿ™‚defgh๐Ÿ™‚dEfi๐Ÿ™‚๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
GO

/* Substring to find starts with surrogate pair BABEL-5169 */
SELECT REPLACE(N'abc๐Ÿ™‚defghi๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚def', N'jhi๐Ÿ™‚' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚defghi๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'abc๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
SELECT REPLACE(N'๐Ÿ™‚abc๐Ÿ™‚', N'๐Ÿ™‚', N'<---->' COLLATE Latin1_General_CI_AI)
GO

-- psql
CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
CREATE COLLATION ignore_accents (provider = icu, locale = 'nd-u-kc-true-ks-level1', deterministic = false);
Expand Down

0 comments on commit 8607687

Please sign in to comment.