Annotation of embedaddon/sqlite3/ext/fts3/fts3_tokenizer.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2007 June 22
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: ******************************************************************************
12: **
13: ** This is part of an SQLite module implementing full-text search.
14: ** This particular file implements the generic tokenizer interface.
15: */
16:
17: /*
18: ** The code in this file is only compiled if:
19: **
20: ** * The FTS3 module is being built as an extension
21: ** (in which case SQLITE_CORE is not defined), or
22: **
23: ** * The FTS3 module is being built into the core of
24: ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
25: */
26: #include "fts3Int.h"
27: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
28:
29: #include <assert.h>
30: #include <string.h>
31:
32: /*
33: ** Implementation of the SQL scalar function for accessing the underlying
34: ** hash table. This function may be called as follows:
35: **
36: ** SELECT <function-name>(<key-name>);
37: ** SELECT <function-name>(<key-name>, <pointer>);
38: **
39: ** where <function-name> is the name passed as the second argument
40: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
41: **
42: ** If the <pointer> argument is specified, it must be a blob value
43: ** containing a pointer to be stored as the hash data corresponding
44: ** to the string <key-name>. If <pointer> is not specified, then
45: ** the string <key-name> must already exist in the has table. Otherwise,
46: ** an error is returned.
47: **
48: ** Whether or not the <pointer> argument is specified, the value returned
49: ** is a blob containing the pointer stored as the hash data corresponding
50: ** to string <key-name> (after the hash-table is updated, if applicable).
51: */
52: static void scalarFunc(
53: sqlite3_context *context,
54: int argc,
55: sqlite3_value **argv
56: ){
57: Fts3Hash *pHash;
58: void *pPtr = 0;
59: const unsigned char *zName;
60: int nName;
61:
62: assert( argc==1 || argc==2 );
63:
64: pHash = (Fts3Hash *)sqlite3_user_data(context);
65:
66: zName = sqlite3_value_text(argv[0]);
67: nName = sqlite3_value_bytes(argv[0])+1;
68:
69: if( argc==2 ){
70: void *pOld;
71: int n = sqlite3_value_bytes(argv[1]);
72: if( n!=sizeof(pPtr) ){
73: sqlite3_result_error(context, "argument type mismatch", -1);
74: return;
75: }
76: pPtr = *(void **)sqlite3_value_blob(argv[1]);
77: pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
78: if( pOld==pPtr ){
79: sqlite3_result_error(context, "out of memory", -1);
80: return;
81: }
82: }else{
83: pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
84: if( !pPtr ){
85: char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
86: sqlite3_result_error(context, zErr, -1);
87: sqlite3_free(zErr);
88: return;
89: }
90: }
91:
92: sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
93: }
94:
/*
** Return 1 if byte c is treated as part of an unquoted identifier token,
** or 0 otherwise. Identifier bytes are:
**
**   * any byte with the 0x80 bit set,
**   * the digits 0x30..0x39 ('0'-'9'),
**   * the letters 0x41..0x5A ('A'-'Z') and 0x61..0x7A ('a'-'z'),
**   * 0x24 ('$') and 0x5F ('_').
**
** Numeric code points are used (rather than character literals) so the
** classification does not depend on the execution character set.
*/
int sqlite3Fts3IsIdChar(char c){
  int code = (int)c;

  if( code & 0x80 ){
    return 1;
  }
  if( code>=0x30 && code<=0x39 ){
    return 1;
  }
  if( code>=0x41 && code<=0x5A ){
    return 1;
  }
  if( code>=0x61 && code<=0x7A ){
    return 1;
  }
  return (code==0x24 || code==0x5F);
}
108:
/*
** Find the next token in the nul-terminated string zStr.
**
** Bytes that neither open a quoted token nor satisfy
** sqlite3Fts3IsIdChar() are skipped. A token is one of:
**
**   * a run starting with ', " or ` and extending through the matching
**     closing quote (a doubled quote character inside the run does not
**     close it), or to the end of the string if there is no closing quote;
**   * a run starting with [ and extending through the next ], or to the
**     end of the string if there is none;
**   * a maximal run of bytes for which sqlite3Fts3IsIdChar() is true.
**
** On success a pointer to the first byte of the token is returned and
** *pn is set to the token length in bytes (delimiters included). If the
** end of the string is reached first, 0 is returned and *pn is left
** unmodified.
*/
const char *sqlite3Fts3NextToken(const char *zStr, int *pn){
  const char *zStart = zStr;      /* Candidate first byte of the token */
  const char *zStop = 0;          /* One past the last byte, once known */

  while( zStop==0 ){
    char c = *zStart;

    if( c=='\0' ){
      return 0;                   /* No more tokens here */
    }

    if( c=='\'' || c=='"' || c=='`' ){
      /* Quoted token: scan for the closing quote, stepping over any
      ** doubled quote characters. */
      zStop = zStart+1;
      while( *zStop ){
        if( *zStop==c ){
          zStop++;
          if( *zStop!=c ){
            break;                /* Was the closing quote */
          }
        }
        zStop++;
      }
    }else if( c=='[' ){
      /* Bracketed token: runs through the next ']' if there is one. */
      zStop = zStart+1;
      while( *zStop && *zStop!=']' ){
        zStop++;
      }
      if( *zStop ){
        zStop++;
      }
    }else if( sqlite3Fts3IsIdChar(c) ){
      /* Bare identifier: consume every following identifier byte. */
      zStop = zStart+1;
      while( sqlite3Fts3IsIdChar(*zStop) ){
        zStop++;
      }
    }else{
      /* Separator byte - keep looking. */
      zStart++;
    }
  }

  *pn = (int)(zStop-zStart);
  return zStart;
}
145:
146: int sqlite3Fts3InitTokenizer(
147: Fts3Hash *pHash, /* Tokenizer hash table */
148: const char *zArg, /* Tokenizer name */
149: sqlite3_tokenizer **ppTok, /* OUT: Tokenizer (if applicable) */
150: char **pzErr /* OUT: Set to malloced error message */
151: ){
152: int rc;
153: char *z = (char *)zArg;
154: int n = 0;
155: char *zCopy;
156: char *zEnd; /* Pointer to nul-term of zCopy */
157: sqlite3_tokenizer_module *m;
158:
159: zCopy = sqlite3_mprintf("%s", zArg);
160: if( !zCopy ) return SQLITE_NOMEM;
161: zEnd = &zCopy[strlen(zCopy)];
162:
163: z = (char *)sqlite3Fts3NextToken(zCopy, &n);
164: z[n] = '\0';
165: sqlite3Fts3Dequote(z);
166:
167: m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash,z,(int)strlen(z)+1);
168: if( !m ){
169: *pzErr = sqlite3_mprintf("unknown tokenizer: %s", z);
170: rc = SQLITE_ERROR;
171: }else{
172: char const **aArg = 0;
173: int iArg = 0;
174: z = &z[n+1];
175: while( z<zEnd && (NULL!=(z = (char *)sqlite3Fts3NextToken(z, &n))) ){
176: int nNew = sizeof(char *)*(iArg+1);
177: char const **aNew = (const char **)sqlite3_realloc((void *)aArg, nNew);
178: if( !aNew ){
179: sqlite3_free(zCopy);
180: sqlite3_free((void *)aArg);
181: return SQLITE_NOMEM;
182: }
183: aArg = aNew;
184: aArg[iArg++] = z;
185: z[n] = '\0';
186: sqlite3Fts3Dequote(z);
187: z = &z[n+1];
188: }
189: rc = m->xCreate(iArg, aArg, ppTok);
190: assert( rc!=SQLITE_OK || *ppTok );
191: if( rc!=SQLITE_OK ){
192: *pzErr = sqlite3_mprintf("unknown tokenizer");
193: }else{
194: (*ppTok)->pModule = m;
195: }
196: sqlite3_free((void *)aArg);
197: }
198:
199: sqlite3_free(zCopy);
200: return rc;
201: }
202:
203:
204: #ifdef SQLITE_TEST
205:
206: #include <tcl.h>
207: #include <string.h>
208:
209: /*
210: ** Implementation of a special SQL scalar function for testing tokenizers
211: ** designed to be used in concert with the Tcl testing framework. This
212: ** function must be called with two arguments:
213: **
214: ** SELECT <function-name>(<key-name>, <input-string>);
215: ** SELECT <function-name>(<key-name>, <pointer>);
216: **
217: ** where <function-name> is the name passed as the second argument
218: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
219: ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
220: **
221: ** The return value is a string that may be interpreted as a Tcl
222: ** list. For each token in the <input-string>, three elements are
223: ** added to the returned list. The first is the token position, the
224: ** second is the token text (folded, stemmed, etc.) and the third is the
225: ** substring of <input-string> associated with the token. For example,
226: ** using the built-in "simple" tokenizer:
227: **
228: ** SELECT fts_tokenizer_test('simple', 'I don't see how');
229: **
230: ** will return the string:
231: **
232: ** "{0 i I 1 dont don't 2 see see 3 how how}"
233: **
234: */
235: static void testFunc(
236: sqlite3_context *context,
237: int argc,
238: sqlite3_value **argv
239: ){
240: Fts3Hash *pHash;
241: sqlite3_tokenizer_module *p;
242: sqlite3_tokenizer *pTokenizer = 0;
243: sqlite3_tokenizer_cursor *pCsr = 0;
244:
245: const char *zErr = 0;
246:
247: const char *zName;
248: int nName;
249: const char *zInput;
250: int nInput;
251:
252: const char *zArg = 0;
253:
254: const char *zToken;
255: int nToken;
256: int iStart;
257: int iEnd;
258: int iPos;
259:
260: Tcl_Obj *pRet;
261:
262: assert( argc==2 || argc==3 );
263:
264: nName = sqlite3_value_bytes(argv[0]);
265: zName = (const char *)sqlite3_value_text(argv[0]);
266: nInput = sqlite3_value_bytes(argv[argc-1]);
267: zInput = (const char *)sqlite3_value_text(argv[argc-1]);
268:
269: if( argc==3 ){
270: zArg = (const char *)sqlite3_value_text(argv[1]);
271: }
272:
273: pHash = (Fts3Hash *)sqlite3_user_data(context);
274: p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
275:
276: if( !p ){
277: char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
278: sqlite3_result_error(context, zErr, -1);
279: sqlite3_free(zErr);
280: return;
281: }
282:
283: pRet = Tcl_NewObj();
284: Tcl_IncrRefCount(pRet);
285:
286: if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
287: zErr = "error in xCreate()";
288: goto finish;
289: }
290: pTokenizer->pModule = p;
291: if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
292: zErr = "error in xOpen()";
293: goto finish;
294: }
295: pCsr->pTokenizer = pTokenizer;
296:
297: while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
298: Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
299: Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
300: zToken = &zInput[iStart];
301: nToken = iEnd-iStart;
302: Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
303: }
304:
305: if( SQLITE_OK!=p->xClose(pCsr) ){
306: zErr = "error in xClose()";
307: goto finish;
308: }
309: if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
310: zErr = "error in xDestroy()";
311: goto finish;
312: }
313:
314: finish:
315: if( zErr ){
316: sqlite3_result_error(context, zErr, -1);
317: }else{
318: sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
319: }
320: Tcl_DecrRefCount(pRet);
321: }
322:
323: static
324: int registerTokenizer(
325: sqlite3 *db,
326: char *zName,
327: const sqlite3_tokenizer_module *p
328: ){
329: int rc;
330: sqlite3_stmt *pStmt;
331: const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
332:
333: rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
334: if( rc!=SQLITE_OK ){
335: return rc;
336: }
337:
338: sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
339: sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
340: sqlite3_step(pStmt);
341:
342: return sqlite3_finalize(pStmt);
343: }
344:
345: static
346: int queryTokenizer(
347: sqlite3 *db,
348: char *zName,
349: const sqlite3_tokenizer_module **pp
350: ){
351: int rc;
352: sqlite3_stmt *pStmt;
353: const char zSql[] = "SELECT fts3_tokenizer(?)";
354:
355: *pp = 0;
356: rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
357: if( rc!=SQLITE_OK ){
358: return rc;
359: }
360:
361: sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
362: if( SQLITE_ROW==sqlite3_step(pStmt) ){
363: if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
364: memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
365: }
366: }
367:
368: return sqlite3_finalize(pStmt);
369: }
370:
371: void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
372:
373: /*
374: ** Implementation of the scalar function fts3_tokenizer_internal_test().
375: ** This function is used for testing only, it is not included in the
376: ** build unless SQLITE_TEST is defined.
377: **
378: ** The purpose of this is to test that the fts3_tokenizer() function
379: ** can be used as designed by the C-code in the queryTokenizer and
380: ** registerTokenizer() functions above. These two functions are repeated
381: ** in the README.tokenizer file as an example, so it is important to
382: ** test them.
383: **
384: ** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
385: ** function with no arguments. An assert() will fail if a problem is
386: ** detected. i.e.:
387: **
388: ** SELECT fts3_tokenizer_internal_test();
389: **
390: */
391: static void intTestFunc(
392: sqlite3_context *context,
393: int argc,
394: sqlite3_value **argv
395: ){
396: int rc;
397: const sqlite3_tokenizer_module *p1;
398: const sqlite3_tokenizer_module *p2;
399: sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
400:
401: UNUSED_PARAMETER(argc);
402: UNUSED_PARAMETER(argv);
403:
404: /* Test the query function */
405: sqlite3Fts3SimpleTokenizerModule(&p1);
406: rc = queryTokenizer(db, "simple", &p2);
407: assert( rc==SQLITE_OK );
408: assert( p1==p2 );
409: rc = queryTokenizer(db, "nosuchtokenizer", &p2);
410: assert( rc==SQLITE_ERROR );
411: assert( p2==0 );
412: assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
413:
414: /* Test the storage function */
415: rc = registerTokenizer(db, "nosuchtokenizer", p1);
416: assert( rc==SQLITE_OK );
417: rc = queryTokenizer(db, "nosuchtokenizer", &p2);
418: assert( rc==SQLITE_OK );
419: assert( p2==p1 );
420:
421: sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
422: }
423:
424: #endif
425:
426: /*
427: ** Set up SQL objects in database db used to access the contents of
428: ** the hash table pointed to by argument pHash. The hash table must
429: ** been initialised to use string keys, and to take a private copy
430: ** of the key when a value is inserted. i.e. by a call similar to:
431: **
432: ** sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
433: **
434: ** This function adds a scalar function (see header comment above
435: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
436: ** defined at compilation time, a temporary virtual table (see header
437: ** comment above struct HashTableVtab) to the database schema. Both
438: ** provide read/write access to the contents of *pHash.
439: **
440: ** The third argument to this function, zName, is used as the name
441: ** of both the scalar and, if created, the virtual table.
442: */
443: int sqlite3Fts3InitHashTable(
444: sqlite3 *db,
445: Fts3Hash *pHash,
446: const char *zName
447: ){
448: int rc = SQLITE_OK;
449: void *p = (void *)pHash;
450: const int any = SQLITE_ANY;
451:
452: #ifdef SQLITE_TEST
453: char *zTest = 0;
454: char *zTest2 = 0;
455: void *pdb = (void *)db;
456: zTest = sqlite3_mprintf("%s_test", zName);
457: zTest2 = sqlite3_mprintf("%s_internal_test", zName);
458: if( !zTest || !zTest2 ){
459: rc = SQLITE_NOMEM;
460: }
461: #endif
462:
463: if( SQLITE_OK==rc ){
464: rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0);
465: }
466: if( SQLITE_OK==rc ){
467: rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0);
468: }
469: #ifdef SQLITE_TEST
470: if( SQLITE_OK==rc ){
471: rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
472: }
473: if( SQLITE_OK==rc ){
474: rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
475: }
476: if( SQLITE_OK==rc ){
477: rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
478: }
479: #endif
480:
481: #ifdef SQLITE_TEST
482: sqlite3_free(zTest);
483: sqlite3_free(zTest2);
484: #endif
485:
486: return rc;
487: }
488:
489: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>