-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathicu_search.c
385 lines (328 loc) · 9.33 KB
/
icu_search.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
/*
* icu_search.c
*
* Part of icu_ext: a PostgreSQL extension to expose functionality from ICU
* (see http://icu-project.org)
*
* By Daniel Vérité, 2018-2023. See LICENSE.md
*/
#include "icu_ext.h"
/* Postgres includes */
#include "funcapi.h"
#include "lib/stringinfo.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
/* ICU includes */
#include "unicode/ucol.h"
#include "unicode/usearch.h"
PG_FUNCTION_INFO_V1(icu_strpos);
PG_FUNCTION_INFO_V1(icu_strpos_coll);
PG_FUNCTION_INFO_V1(icu_replace);
PG_FUNCTION_INFO_V1(icu_replace_coll);
/*
 * Convert @u16_pos, a 0-based offset counted in UTF-16 code units inside
 * @str_utf16, into the matching 0-based character offset inside @str,
 * which holds the same text in the database encoding.
 * The two offsets differ as soon as a surrogate pair is crossed, since a
 * pair is two code units but a single character.
 *
 * @str_len is the length of @str in bytes and @u16_len is the length of
 * @str_utf16 in 16-bit code units.
 *
 * If @p_str is not NULL, *p_str is set to the address of the byte in @str
 * that corresponds to @u16_pos.
 *
 * Returns the character offset.
 */
static int32_t
translate_char_pos(const char* str,
				   int32_t str_len,
				   const UChar* str_utf16,
				   int32_t u16_len,	/* in 16-bit code units */
				   int32_t u16_pos,
				   const char **p_str)
{
	const int	encoding = GetDatabaseEncoding();
	const char *cursor = str;	/* byte in @str matching the current offset */
	int32_t		units_seen = 0;	/* UTF-16 code units consumed so far */
	int32_t		nchars = 0;		/* characters consumed so far */
	UChar32		cp;				/* scratch code point filled by the ICU macros */

	if (encoding == PG_UTF8)
	{
		/* UTF-8: step through both strings with the ICU macros, no pg_mblen() */
		int32_t		byte_off = 0;

		for (; units_seen < u16_pos; nchars++)
		{
			U16_NEXT(str_utf16, units_seen, u16_len, cp);
			U8_NEXT(str, byte_off, str_len, cp);
		}
		cursor = str + byte_off;
	}
	else if (pg_encoding_max_length(encoding) == 1)
	{
		/*
		 * Single-byte encodings: bytes and UTF-16 code units are assumed to
		 * map 1:1, as such encodings should not hold characters outside of
		 * the BMP.
		 */
		nchars = u16_pos;
		cursor = str + u16_pos;
	}
	else
	{
		/* other multi-byte encodings: advance in @str with pg_mblen() */
		for (; units_seen < u16_pos; nchars++)
		{
			U16_NEXT(str_utf16, units_seen, u16_len, cp);
			cursor += pg_mblen(cursor);
		}
	}

	if (p_str != NULL)
		*p_str = cursor;

	return nchars;
}
/*
 * Shared implementation of icu_strpos and icu_strpos_coll: look for the
 * first occurrence of txt2 (needle) inside txt1 (haystack) with an ICU
 * string search driven by @collator.
 *
 * Returns 0 when txt2 is not found, otherwise the 1-based position, in
 * characters, of the match inside txt1.
 * Raises an ERROR if the ICU search cannot be opened or run.
 */
static int32_t
internal_strpos(text *txt1, text *txt2, UCollator *collator)
{
	const int32_t haystack_bytes = VARSIZE_ANY_EXHDR(txt1);
	const int32_t needle_bytes = VARSIZE_ANY_EXHDR(txt2);
	UErrorCode	status = U_ZERO_ERROR;
	UStringSearch *search;
	UChar	   *haystack_u16;
	UChar	   *needle_u16;
	int32_t		haystack_units;		/* in UTF-16 code units */
	int32_t		needle_units;		/* in UTF-16 code units */
	int32_t		match;
	int32_t		char_pos = -1;		/* 0-based; -1 means "not found" */

	/*
	 * An empty needle is reported at position 1, even inside an empty
	 * haystack, for consistency with strpos() in core.
	 */
	if (needle_bytes == 0)
		return 1;

	/* A non-empty needle cannot be found in an empty haystack. */
	if (haystack_bytes == 0)
		return 0;

	haystack_units = icu_to_uchar(&haystack_u16, VARDATA_ANY(txt1), haystack_bytes);
	needle_units = icu_to_uchar(&needle_u16, VARDATA_ANY(txt2), needle_bytes);

	search = usearch_openFromCollator(needle_u16,
									  needle_units,
									  haystack_u16,
									  haystack_units,
									  collator,
									  NULL,
									  &status);
	if (U_FAILURE(status))
		elog(ERROR, "failed to start search: %s", u_errorName(status));

	match = usearch_first(search, &status);
	if (!U_FAILURE(status) && match != USEARCH_DONE)
	{
		/*
		 * ICU reports the match in UTF-16 code units, where a surrogate
		 * pair counts for two; convert that into a character position in
		 * the original string.
		 */
		char_pos = translate_char_pos(VARDATA_ANY(txt1),
									  haystack_bytes,
									  haystack_u16,
									  haystack_units,
									  match,
									  NULL);
	}

	/* release everything before possibly reporting a search failure */
	usearch_close(search);
	pfree(needle_u16);
	pfree(haystack_u16);

	if (U_FAILURE(status))
		elog(ERROR, "failed to perform ICU search: %s", u_errorName(status));

	/* shift to 1-based; the "not found" value -1 becomes 0 */
	return char_pos + 1;
}
/*
* Equivalent of strpos(haystack, needle) using ICU search
*/
Datum
icu_strpos(PG_FUNCTION_ARGS)
{
UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION());
PG_RETURN_INT32(internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
PG_GETARG_TEXT_PP(1), /* needle */
collator));
}
/*
 * Equivalent of strpos(haystack, needle) using ICU search, with the
 * collator designated by name in the third argument (opened with
 * ucol_open) instead of the collation of the function call.
 *
 * The collator is opened for the duration of the call only and is closed
 * on every exit path, including when the search raises an error.
 */
Datum
icu_strpos_coll(PG_FUNCTION_ARGS)
{
	const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(2));
	UCollator  *collator = NULL;
	UErrorCode	status = U_ZERO_ERROR;
	int32_t		pos = 0;

	collator = ucol_open(collname, &status);
	if (!collator || U_FAILURE(status)) {
		elog(ERROR, "failed to open collation: %s", u_errorName(status));
	}

	/*
	 * The collator is allocated by ICU, so it is not released by the
	 * memory context cleanup when an error is thrown: close it explicitly
	 * before propagating the error.
	 */
	PG_TRY();
	{
		pos = internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
							  PG_GETARG_TEXT_PP(1), /* needle */
							  collator);
	}
	PG_CATCH();
	{
		ucol_close(collator);
		PG_RE_THROW();
	}
	PG_END_TRY();

	ucol_close(collator);

	PG_RETURN_INT32(pos);
}
/*
 * Search for @txt2 in @txt1 with the ICU @collator and replace the
 * matched substrings with @txt3.
 *
 * The replacement text is always txt3, but the replaced text may not
 * be exactly txt2, and its length in bytes may differ too, depending on
 * the collation rules. For example in utf-8 with an accent-insensitive
 * collation, {LATIN SMALL LETTER E WITH ACUTE} (2 bytes) will match
 * {LATIN SMALL LETTER E} (1 byte).
 *
 * Matches do not overlap: "nana" in "nananana" is replaced 2 times,
 * not 3 times.
 *
 * Returns @txt1 itself (not a copy) when @txt1 or @txt2 is empty or when
 * there is no match; otherwise returns a newly built text.
 * Raises an ERROR if the ICU search cannot be opened or run.
 */
static text *
internal_str_replace(text *txt1, /* not const because it may be returned */
					 const text *txt2, /* search for txt2 with collator */
					 const text *txt3, /* replace the matched substrings by txt3 */
					 UCollator *collator)
{
	int32_t		len1 = VARSIZE_ANY_EXHDR(txt1);
	int32_t		len2 = VARSIZE_ANY_EXHDR(txt2);
	int32_t		len3 = VARSIZE_ANY_EXHDR(txt3);
	UErrorCode	status = U_ZERO_ERROR;
	UStringSearch *usearch;
	UChar	   *uchar1, *uchar2;
	int32_t		ulen1, ulen2;	/* in utf-16 units */
	text	   *result = txt1;	/* returned as is when nothing is replaced */
	int32_t		pos;

	if (len1 == 0 || len2 == 0)
		return txt1;

	ulen1 = icu_to_uchar(&uchar1, VARDATA_ANY(txt1), len1);
	ulen2 = icu_to_uchar(&uchar2, VARDATA_ANY(txt2), len2);

	usearch = usearch_openFromCollator(uchar2, /* needle */
									   ulen2,
									   uchar1, /* haystack */
									   ulen1,
									   collator,
									   NULL,
									   &status);
	if (U_FAILURE(status))
	{
		/* same report as internal_strpos() when the search cannot be opened */
		if (usearch != NULL)
			usearch_close(usearch);
		elog(ERROR, "failed to start search: %s", u_errorName(status));
	}

	/* "nana" in "nananana" must be found 2 times, not 3 times. */
	usearch_setAttribute(usearch, USEARCH_OVERLAP, USEARCH_OFF, &status);

	pos = usearch_first(usearch, &status);

	if (!U_FAILURE(status) && pos != USEARCH_DONE)
	{
		const char *txt1_startptr = VARDATA_ANY(txt1);
		/* first byte of txt1 that has not been copied or skipped yet */
		const char *txt1_currptr = txt1_startptr;
		/* UTF-16 offset just past the previous match (0 before the first) */
		int32_t		previous_pos = 0;
		StringInfoData resbuf;

		initStringInfo(&resbuf);

		do {
			const char *txt1_matchptr;
			int32_t		matched_len = usearch_getMatchedLength(usearch);

			/*
			 * Locate the start of the match in txt1 and copy the segment
			 * that precedes it. The UTF-16 bound is what remains of uchar1
			 * after previous_pos, so it is derived from ulen1 (code units)
			 * and not from len1 (bytes).
			 */
			translate_char_pos(
				txt1_currptr,
				len1 - (txt1_currptr - txt1_startptr),
				uchar1 + previous_pos,
				ulen1 - previous_pos,
				pos - previous_pos,
				&txt1_matchptr);

			appendBinaryStringInfo(&resbuf,
								   txt1_currptr,
								   txt1_matchptr - txt1_currptr);

			/* append the replacement text */
			appendBinaryStringInfo(&resbuf, VARDATA_ANY(txt3), len3);

			/* skip the replaced text in txt1 */
			translate_char_pos(
				txt1_matchptr,
				len1 - (txt1_matchptr - txt1_startptr),
				uchar1 + pos,
				matched_len,
				matched_len,
				&txt1_currptr);

			previous_pos = pos + matched_len;

			CHECK_FOR_INTERRUPTS();

			pos = usearch_next(usearch, &status);
		} while (!U_FAILURE(status) && pos != USEARCH_DONE);

		if (!U_FAILURE(status))
		{
			/* copy the segment after the last match */
			if (len1 - (txt1_currptr - txt1_startptr) > 0)
			{
				appendBinaryStringInfo(&resbuf,
									   txt1_currptr,
									   len1 - (txt1_currptr - txt1_startptr));
			}
			result = cstring_to_text_with_len(resbuf.data, resbuf.len);
		}
		pfree(resbuf.data);
	}

	/*
	 * Release the buffers and the ICU search object before reporting any
	 * search failure, so that the error path does not leak them.
	 */
	pfree(uchar1);
	pfree(uchar2);

	if (usearch != NULL)
		usearch_close(usearch);

	if (U_FAILURE(status))
		elog(ERROR, "failed to perform ICU search: %s", u_errorName(status));

	return result;
}
/*
 * SQL-callable replace(haystack, needle, replacement) based on an ICU
 * search. The collator is the one ucollator_from_coll_id() provides for
 * the collation of the function call.
 */
Datum
icu_replace(PG_FUNCTION_ARGS)
{
	UCollator  *coll = ucollator_from_coll_id(PG_GET_COLLATION());
	text	   *replaced = internal_str_replace(PG_GETARG_TEXT_PP(0), /* haystack */
												PG_GETARG_TEXT_PP(1), /* needle */
												PG_GETARG_TEXT_PP(2), /* replacement */
												coll);

	PG_RETURN_TEXT_P(replaced);
}
Datum
icu_replace_coll(PG_FUNCTION_ARGS)
{
const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(3));
UCollator *collator = NULL;
UErrorCode status = U_ZERO_ERROR;
collator = ucol_open(collname, &status);
if (!collator || U_FAILURE(status)) {
elog(ERROR, "failed to open collation: %s", u_errorName(status));
}
PG_RETURN_TEXT_P(
internal_str_replace(
PG_GETARG_TEXT_PP(0), /* haystack */
PG_GETARG_TEXT_PP(1), /* needle */
PG_GETARG_TEXT_PP(2), /* replacement */
collator)
);
}