forked from DBD-SQLite/DBD-SQLite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdbdimp_tokenizer.inc
316 lines (262 loc) · 9.18 KB
/
dbdimp_tokenizer.inc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
/*
** A tokenizer instance.
** `base` has to remain the first member: the tokenizer callbacks receive
** a sqlite3_tokenizer* and cast it straight to perl_tokenizer*.
*/
typedef struct perl_tokenizer {
sqlite3_tokenizer base;
SV *coderef; /* the perl tokenizer: a coderef that takes a string
and returns a cursor coderef; this struct owns a copy of it */
} perl_tokenizer;
/*
** A tokenization cursor over one input string.
** `base` has to remain the first member: the cursor callbacks receive a
** sqlite3_tokenizer_cursor* and cast it straight to perl_tokenizer_cursor*.
*/
typedef struct perl_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
SV *coderef; /* ref to the closure that returns terms */
char *pToken; /* storage for a copy of the last token (sqlite3 heap) */
int nTokenAllocated; /* space allocated to pToken buffer, in bytes */
/* members below are only used if the input string is in utf8;
pInput stays NULL otherwise, which is how the utf8 case is detected */
const char *pInput; /* input we are tokenizing (borrowed, not copied) */
const char *currentByte; /* pointer into pInput */
int currentChar; /* char position corresponding to currentByte */
} perl_tokenizer_cursor;
/*
** Build a new tokenizer instance.
** Invoked each time an FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** argv[0] is the fully qualified name of a perl function; that function
** is called without arguments and the scalar it returns (expected to be
** a coderef) is copied into the new perl_tokenizer.
**
** Returns SQLITE_ERROR when no function name was supplied, SQLITE_NOMEM
** when the instance cannot be allocated, SQLITE_OK otherwise.
*/
static int perl_tokenizer_Create(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  dTHX;
  dSP;
  perl_tokenizer *instance;
  SV *factory_result;
  int returned_count;

  /* the perl function name is mandatory */
  if (argc == 0) return SQLITE_ERROR;

  instance = (perl_tokenizer *) sqlite3_malloc(sizeof(perl_tokenizer));
  if (!instance) return SQLITE_NOMEM;
  memset(instance, 0, sizeof(perl_tokenizer));

  ENTER;
  SAVETMPS;

  /* invoke qualified::function::name with an empty argument list */
  PUSHMARK(SP);
  PUTBACK;
  returned_count = call_pv(argv[0], G_SCALAR);
  SPAGAIN;

  if (returned_count != 1) warn("tokenizer_Create returned %d arguments", returned_count);

  /* keep our own copy: the returned value is released by FREETMPS below */
  factory_result = POPs;
  instance->coderef = newSVsv(factory_result);
  *ppTokenizer = &instance->base;

  PUTBACK;
  FREETMPS;
  LEAVE;

  return SQLITE_OK;
}
/*
** Dispose of a tokenizer instance: drop the reference held on the perl
** tokenizer coderef, then release the structure itself.
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
  dTHX;
  perl_tokenizer *instance = (perl_tokenizer *) pTokenizer;
  SV *callback = instance->coderef;

  sv_free(callback);
  sqlite3_free(instance);

  return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input string to
** be tokenized is supposed to be pInput[0..nBytes-1]; a negative nBytes
** (fts3 has been seen to pass -1) means "NUL-terminated", so the length
** is then measured with strlen().
**
** A perl copy of the input is passed to the tokenizer coderef, which
** returns a closure implementing the cursor (so the cursor is again a
** coderef). A copy of that closure is stored in the new cursor.
**
** Returns SQLITE_NOMEM if the cursor cannot be allocated, else SQLITE_OK.
*/
static int perl_tokenizer_Open(
  sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
  const char *pInput, int nBytes,      /* Input buffer */
  sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
  dTHX;
  dSP;
  dMY_CXT;
  perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
  perl_tokenizer_cursor *c;
  SV *perl_string;
  int n_retval;

  if (nBytes < 0) { /* we get -1 from fts3 */
    nBytes = (int) strlen(pInput);
  }

  /* allocate and initialize the cursor struct; fail cleanly on OOM
     instead of handing a NULL pointer to memset() */
  c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
  if (c == NULL) return SQLITE_NOMEM;
  memset(c, 0, sizeof(*c));
  *ppCursor = &c->base;

  /* build a Perl copy of the input string; SVs_TEMP will call sv_2mortal */
  perl_string = newSVpvn_flags(pInput, nBytes, SVs_TEMP);

  /* Decoding is done by this macro alone. A switch on the string mode
     used to precede it, but its branches were written without the `case`
     keyword, so they were ordinary labels and never executed. */
  DBD_SQLITE_UTF8_DECODE_IF_NEEDED(perl_string, MY_CXT.last_dbh_string_mode);

  /* special handling if working with utf8 strings */
  if (MY_CXT.last_dbh_string_mode & DBD_SQLITE_STRING_MODE_UNICODE_ANY) {
    /* data to keep track of byte positions */
    c->currentByte = c->pInput = pInput;
    c->currentChar = 0;
  }

  ENTER;
  SAVETMPS;

  /* call the tokenizer coderef */
  PUSHMARK(SP);
  XPUSHs(perl_string);
  PUTBACK;
  n_retval = call_sv(t->coderef, G_SCALAR);
  SPAGAIN;

  /* store the cursor coderef returned by the tokenizer */
  if (n_retval != 1) {
    warn("tokenizer returned %d arguments, expected 1", n_retval);
  }
  c->coderef = newSVsv(POPs);

  PUTBACK;
  FREETMPS;
  LEAVE;

  return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above: drop the cursor closure, then release the
** token buffer and the cursor structure.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
  dTHX;
  perl_tokenizer_cursor *cursor = (perl_tokenizer_cursor *) pCursor;

  sv_free(cursor->coderef);

  /* sqlite3_free() is a harmless no-op on NULL, so no guard is needed
     when no token was ever copied */
  sqlite3_free(cursor->pToken);
  sqlite3_free(cursor);

  return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
**
** The cursor closure is called in list context. An empty list means
** there are no more tokens (SQLITE_DONE). Otherwise it must return
** exactly 5 values: token, length, start offset, end offset, position.
** When the input is utf8 the offsets given by perl are character
** offsets; they are converted here to byte offsets into the original
** input, and the token length is recomputed in bytes.
**
** Returns SQLITE_OK with the OUT parameters filled in, SQLITE_DONE at
** end of input, SQLITE_ERROR if the closure returned a malformed result,
** or SQLITE_NOMEM if the token buffer cannot be grown. The perl scope
** opened here is closed on every path.
*/
static int perl_tokenizer_Next(
  sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */
  const char **ppToken,              /* OUT: Normalized text for token */
  int *pnBytes,                      /* OUT: Number of bytes in normalized text */
  int *piStartOffset,                /* Starting offset of token. IN : char offset; OUT : byte offset */
  int *piEndOffset,                  /* Ending offset of token. IN : char offset; OUT : byte offset */
  int *piPosition                    /* OUT: Number of tokens returned before this one */
){
  perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
  int result;
  int n_retval;
  char *token;
  char *nextByte;
  STRLEN n_a; /* this is required for older perls < 5.8.8 */
  I32 hop;
  dTHX;
  dSP;

  ENTER;
  SAVETMPS;

  /* call the cursor */
  PUSHMARK(SP);
  PUTBACK;
  n_retval = call_sv(c->coderef, G_ARRAY);
  SPAGAIN;

  if (n_retval == 0) {
    /* if we get back an empty list, there is no more token */
    result = SQLITE_DONE;
  }
  else if (n_retval != 5) {
    /* Popping five values from a list of another size would read stack
       garbage or leave items behind: discard what was returned and fail. */
    warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
    SP -= n_retval;
    result = SQLITE_ERROR;
  }
  else {
    /* get token details from the return list (popped in reverse order) */
    *piPosition    = POPi;
    *piEndOffset   = POPi;
    *piStartOffset = POPi;
    *pnBytes       = POPi;
    token          = POPpx;

    if (c->pInput) { /* if working with utf8 data */

      /* compute first hop : nb of chars from last position to the start of the token */
      hop = *piStartOffset - c->currentChar;

      /* hop: advance to the first byte in token */
      nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);

      /* compute 2nd hop : nb of chars from start of the token to end of token */
      hop = *piEndOffset - *piStartOffset;

      /* now recompute the start offset in bytes, not in chars */
      *piStartOffset = nextByte - c->pInput;

      /* 2nd hop: advance past to the last byte in token */
      nextByte = (char*)utf8_hop((U8*)nextByte, hop);

      /* remember current position (useful for the next invocation) */
      c->currentChar = *piEndOffset;
      c->currentByte = nextByte;

      /* now recompute the end offset in bytes, not in chars */
      *piEndOffset = nextByte - c->pInput;

      /* compute the size of the normalized token in bytes, not in chars */
      *pnBytes = (int) strlen(token);
    }

    if (*pnBytes < 0) {
      /* a negative length would become a huge size_t in memcpy() */
      result = SQLITE_ERROR;
    }
    else {
      result = SQLITE_OK;

      /* make sure we have enough storage for copying the token */
      if (*pnBytes > c->nTokenAllocated) {
        int nNew = *pnBytes + 20;
        char *pNew = sqlite3_realloc(c->pToken, nNew);
        if (pNew) {
          /* record the new capacity only once the buffer really grew */
          c->pToken = pNew;
          c->nTokenAllocated = nNew;
        }
        else {
          /* old buffer and old capacity are still valid; fall through to
             the common exit so the perl scope gets closed */
          result = SQLITE_NOMEM;
        }
      }

      if (result == SQLITE_OK) {
        /* need to copy the token into the C cursor before perl frees that memory */
        if (*pnBytes > 0) {
          memcpy(c->pToken, token, *pnBytes);
        }
        *ppToken = c->pToken;
      }
    }
  }

  PUTBACK;
  FREETMPS;
  LEAVE;

  return result;
}
/*
** The set of routines that implement the perl tokenizer.
** The initializer is positional, so the order of the entries below must
** match the member order of sqlite3_tokenizer_module.
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
0, /* NOTE(review): presumably the module structure version -- confirm against fts3_tokenizer.h */
perl_tokenizer_Create, /* new tokenizer instance */
perl_tokenizer_Destroy, /* free a tokenizer instance */
perl_tokenizer_Open, /* new cursor on an input string */
perl_tokenizer_Close, /* free a cursor */
perl_tokenizer_Next /* fetch the next token from a cursor */
};
/*
** Register the perl tokenizer with FTS3 under the name "perl", by running
**   SELECT fts3_tokenizer(?, ?)
** with the name and a pointer to perl_tokenizer_Module as bound values.
**
** Returns FALSE (after reporting through sqlite_error) when the database
** handle is inactive; otherwise returns the SQLite result code of the
** first step that failed, or of sqlite3_finalize() when all went well.
** NOTE(review): the inactive-handle path returns FALSE while success
** returns SQLITE_OK -- confirm callers can tell the two apart.
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
  D_imp_dbh(dbh);

  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
  sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

  if (!DBIc_ACTIVE(imp_dbh)) {
    sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
    return FALSE;
  }

#if SQLITE_VERSION_NUMBER >= 3012000
  rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }
#endif

  rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* The blob is the module pointer itself (the bytes of `p`). SQLITE_STATIC
     is fine for this local because the statement is finalized before
     this function returns. */
  rc = sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
  if( rc==SQLITE_OK ){
    rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  }
  if( rc!=SQLITE_OK ){
    /* do not run the statement with a missing argument */
    sqlite3_finalize(pStmt);
    return rc;
  }

  /* the outcome of the step is reported by sqlite3_finalize() */
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}