Annotation of embedaddon/sqlite3/ext/fts3/fts3_tokenizer.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2007 June 22
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: ******************************************************************************
                     12: **
                     13: ** This is part of an SQLite module implementing full-text search.
                     14: ** This particular file implements the generic tokenizer interface.
                     15: */
                     16: 
                     17: /*
                     18: ** The code in this file is only compiled if:
                     19: **
                     20: **     * The FTS3 module is being built as an extension
                     21: **       (in which case SQLITE_CORE is not defined), or
                     22: **
                     23: **     * The FTS3 module is being built into the core of
                     24: **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
                     25: */
                     26: #include "fts3Int.h"
                     27: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
                     28: 
                     29: #include <assert.h>
                     30: #include <string.h>
                     31: 
                     32: /*
                     33: ** Implementation of the SQL scalar function for accessing the underlying 
                     34: ** hash table. This function may be called as follows:
                     35: **
                     36: **   SELECT <function-name>(<key-name>);
                     37: **   SELECT <function-name>(<key-name>, <pointer>);
                     38: **
                     39: ** where <function-name> is the name passed as the second argument
                     40: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
                     41: **
                     42: ** If the <pointer> argument is specified, it must be a blob value
                     43: ** containing a pointer to be stored as the hash data corresponding
                     44: ** to the string <key-name>. If <pointer> is not specified, then
                     45: ** the string <key-name> must already exist in the has table. Otherwise,
                     46: ** an error is returned.
                     47: **
                     48: ** Whether or not the <pointer> argument is specified, the value returned
                     49: ** is a blob containing the pointer stored as the hash data corresponding
                     50: ** to string <key-name> (after the hash-table is updated, if applicable).
                     51: */
                     52: static void scalarFunc(
                     53:   sqlite3_context *context,
                     54:   int argc,
                     55:   sqlite3_value **argv
                     56: ){
                     57:   Fts3Hash *pHash;
                     58:   void *pPtr = 0;
                     59:   const unsigned char *zName;
                     60:   int nName;
                     61: 
                     62:   assert( argc==1 || argc==2 );
                     63: 
                     64:   pHash = (Fts3Hash *)sqlite3_user_data(context);
                     65: 
                     66:   zName = sqlite3_value_text(argv[0]);
                     67:   nName = sqlite3_value_bytes(argv[0])+1;
                     68: 
                     69:   if( argc==2 ){
                     70:     void *pOld;
                     71:     int n = sqlite3_value_bytes(argv[1]);
                     72:     if( n!=sizeof(pPtr) ){
                     73:       sqlite3_result_error(context, "argument type mismatch", -1);
                     74:       return;
                     75:     }
                     76:     pPtr = *(void **)sqlite3_value_blob(argv[1]);
                     77:     pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
                     78:     if( pOld==pPtr ){
                     79:       sqlite3_result_error(context, "out of memory", -1);
                     80:       return;
                     81:     }
                     82:   }else{
                     83:     pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
                     84:     if( !pPtr ){
                     85:       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
                     86:       sqlite3_result_error(context, zErr, -1);
                     87:       sqlite3_free(zErr);
                     88:       return;
                     89:     }
                     90:   }
                     91: 
                     92:   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
                     93: }
                     94: 
                     /*
                     ** Return 1 if byte c is treated as part of an unquoted token, else 0.
                     **
                     ** Any byte with the high bit set qualifies. Within the 7-bit range the
                     ** accepted codes are 0x24 ('$'), 0x30-0x39 (digits), 0x41-0x5A (upper
                     ** case letters), 0x5F ('_') and 0x61-0x7A (lower case letters).
                     */
                     int sqlite3Fts3IsIdChar(char c){
                       if( c & 0x80 ){
                         return 1;
                       }
                       if( c>=0x30 && c<=0x39 ){
                         return 1;
                       }
                       if( (c>=0x41 && c<=0x5A) || (c>=0x61 && c<=0x7A) ){
                         return 1;
                       }
                       return (c==0x24 || c==0x5F) ? 1 : 0;
                     }
                    108: 
                    /*
                    ** Find the first token in the nul-terminated string zStr.
                    **
                    ** A token is one of:
                    **
                    **   * a run of bytes accepted by sqlite3Fts3IsIdChar(),
                    **   * a string opened by ', " or ` and running to the matching close
                    **     quote (a doubled quote character does not close the string) or to
                    **     the end of the input if no close quote exists,
                    **   * a string opened by [ and running to the first ] (inclusive) or to
                    **     the end of the input.
                    **
                    ** Any other bytes before the token are skipped. On success the return
                    ** value points at the first byte of the token and *pn is set to its
                    ** length in bytes. If the end of the string is reached without finding a
                    ** token, 0 is returned and *pn is left untouched.
                    */
                    const char *sqlite3Fts3NextToken(const char *zStr, int *pn){
                      const char *zTok = zStr;      /* Candidate start of the token */
                      const char *zTokEnd = 0;      /* First byte following the token */

                      for(;;){
                        char c = *zTok;

                        if( c=='\0' ){
                          return 0;                 /* No more tokens here */
                        }

                        if( c=='\'' || c=='"' || c=='`' ){
                          zTokEnd = zTok + 1;
                          while( *zTokEnd ){
                            if( *zTokEnd==c ){
                              /* Either the closing quote or the first half of a doubled one */
                              zTokEnd++;
                              if( *zTokEnd!=c ) break;
                            }
                            zTokEnd++;
                          }
                          break;
                        }

                        if( c=='[' ){
                          zTokEnd = zTok + 1;
                          while( *zTokEnd && *zTokEnd!=']' ){
                            zTokEnd++;
                          }
                          if( *zTokEnd ){
                            zTokEnd++;              /* Include the closing bracket */
                          }
                          break;
                        }

                        if( sqlite3Fts3IsIdChar(c) ){
                          zTokEnd = zTok + 1;
                          while( sqlite3Fts3IsIdChar(*zTokEnd) ){
                            zTokEnd++;
                          }
                          break;
                        }

                        /* Not the start of a token: skip this byte */
                        zTok++;
                      }

                      *pn = (int)(zTokEnd - zTok);
                      return zTok;
                    }
                    145: 
                    146: int sqlite3Fts3InitTokenizer(
                    147:   Fts3Hash *pHash,                /* Tokenizer hash table */
                    148:   const char *zArg,               /* Tokenizer name */
                    149:   sqlite3_tokenizer **ppTok,      /* OUT: Tokenizer (if applicable) */
                    150:   char **pzErr                    /* OUT: Set to malloced error message */
                    151: ){
                    152:   int rc;
                    153:   char *z = (char *)zArg;
                    154:   int n = 0;
                    155:   char *zCopy;
                    156:   char *zEnd;                     /* Pointer to nul-term of zCopy */
                    157:   sqlite3_tokenizer_module *m;
                    158: 
                    159:   zCopy = sqlite3_mprintf("%s", zArg);
                    160:   if( !zCopy ) return SQLITE_NOMEM;
                    161:   zEnd = &zCopy[strlen(zCopy)];
                    162: 
                    163:   z = (char *)sqlite3Fts3NextToken(zCopy, &n);
                    164:   z[n] = '\0';
                    165:   sqlite3Fts3Dequote(z);
                    166: 
                    167:   m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash,z,(int)strlen(z)+1);
                    168:   if( !m ){
                    169:     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", z);
                    170:     rc = SQLITE_ERROR;
                    171:   }else{
                    172:     char const **aArg = 0;
                    173:     int iArg = 0;
                    174:     z = &z[n+1];
                    175:     while( z<zEnd && (NULL!=(z = (char *)sqlite3Fts3NextToken(z, &n))) ){
                    176:       int nNew = sizeof(char *)*(iArg+1);
                    177:       char const **aNew = (const char **)sqlite3_realloc((void *)aArg, nNew);
                    178:       if( !aNew ){
                    179:         sqlite3_free(zCopy);
                    180:         sqlite3_free((void *)aArg);
                    181:         return SQLITE_NOMEM;
                    182:       }
                    183:       aArg = aNew;
                    184:       aArg[iArg++] = z;
                    185:       z[n] = '\0';
                    186:       sqlite3Fts3Dequote(z);
                    187:       z = &z[n+1];
                    188:     }
                    189:     rc = m->xCreate(iArg, aArg, ppTok);
                    190:     assert( rc!=SQLITE_OK || *ppTok );
                    191:     if( rc!=SQLITE_OK ){
                    192:       *pzErr = sqlite3_mprintf("unknown tokenizer");
                    193:     }else{
                    194:       (*ppTok)->pModule = m; 
                    195:     }
                    196:     sqlite3_free((void *)aArg);
                    197:   }
                    198: 
                    199:   sqlite3_free(zCopy);
                    200:   return rc;
                    201: }
                    202: 
                    203: 
                    204: #ifdef SQLITE_TEST
                    205: 
                    206: #include <tcl.h>
                    207: #include <string.h>
                    208: 
                    209: /*
                    210: ** Implementation of a special SQL scalar function for testing tokenizers 
                    211: ** designed to be used in concert with the Tcl testing framework. This
                    212: ** function must be called with two arguments:
                    213: **
                    214: **   SELECT <function-name>(<key-name>, <input-string>);
                    215: **   SELECT <function-name>(<key-name>, <pointer>);
                    216: **
                    217: ** where <function-name> is the name passed as the second argument
                    218: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
                    219: ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
                    220: **
                    221: ** The return value is a string that may be interpreted as a Tcl
                    222: ** list. For each token in the <input-string>, three elements are
                    223: ** added to the returned list. The first is the token position, the 
                    224: ** second is the token text (folded, stemmed, etc.) and the third is the
                    225: ** substring of <input-string> associated with the token. For example, 
                    226: ** using the built-in "simple" tokenizer:
                    227: **
                    228: **   SELECT fts_tokenizer_test('simple', 'I don't see how');
                    229: **
                    230: ** will return the string:
                    231: **
                    232: **   "{0 i I 1 dont don't 2 see see 3 how how}"
                    233: **   
                    234: */
                    235: static void testFunc(
                    236:   sqlite3_context *context,
                    237:   int argc,
                    238:   sqlite3_value **argv
                    239: ){
                    240:   Fts3Hash *pHash;
                    241:   sqlite3_tokenizer_module *p;
                    242:   sqlite3_tokenizer *pTokenizer = 0;
                    243:   sqlite3_tokenizer_cursor *pCsr = 0;
                    244: 
                    245:   const char *zErr = 0;
                    246: 
                    247:   const char *zName;
                    248:   int nName;
                    249:   const char *zInput;
                    250:   int nInput;
                    251: 
                    252:   const char *zArg = 0;
                    253: 
                    254:   const char *zToken;
                    255:   int nToken;
                    256:   int iStart;
                    257:   int iEnd;
                    258:   int iPos;
                    259: 
                    260:   Tcl_Obj *pRet;
                    261: 
                    262:   assert( argc==2 || argc==3 );
                    263: 
                    264:   nName = sqlite3_value_bytes(argv[0]);
                    265:   zName = (const char *)sqlite3_value_text(argv[0]);
                    266:   nInput = sqlite3_value_bytes(argv[argc-1]);
                    267:   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
                    268: 
                    269:   if( argc==3 ){
                    270:     zArg = (const char *)sqlite3_value_text(argv[1]);
                    271:   }
                    272: 
                    273:   pHash = (Fts3Hash *)sqlite3_user_data(context);
                    274:   p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
                    275: 
                    276:   if( !p ){
                    277:     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
                    278:     sqlite3_result_error(context, zErr, -1);
                    279:     sqlite3_free(zErr);
                    280:     return;
                    281:   }
                    282: 
                    283:   pRet = Tcl_NewObj();
                    284:   Tcl_IncrRefCount(pRet);
                    285: 
                    286:   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
                    287:     zErr = "error in xCreate()";
                    288:     goto finish;
                    289:   }
                    290:   pTokenizer->pModule = p;
                    291:   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
                    292:     zErr = "error in xOpen()";
                    293:     goto finish;
                    294:   }
                    295:   pCsr->pTokenizer = pTokenizer;
                    296: 
                    297:   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
                    298:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
                    299:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
                    300:     zToken = &zInput[iStart];
                    301:     nToken = iEnd-iStart;
                    302:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
                    303:   }
                    304: 
                    305:   if( SQLITE_OK!=p->xClose(pCsr) ){
                    306:     zErr = "error in xClose()";
                    307:     goto finish;
                    308:   }
                    309:   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
                    310:     zErr = "error in xDestroy()";
                    311:     goto finish;
                    312:   }
                    313: 
                    314: finish:
                    315:   if( zErr ){
                    316:     sqlite3_result_error(context, zErr, -1);
                    317:   }else{
                    318:     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
                    319:   }
                    320:   Tcl_DecrRefCount(pRet);
                    321: }
                    322: 
                    323: static
                    324: int registerTokenizer(
                    325:   sqlite3 *db, 
                    326:   char *zName, 
                    327:   const sqlite3_tokenizer_module *p
                    328: ){
                    329:   int rc;
                    330:   sqlite3_stmt *pStmt;
                    331:   const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
                    332: 
                    333:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                    334:   if( rc!=SQLITE_OK ){
                    335:     return rc;
                    336:   }
                    337: 
                    338:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    339:   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
                    340:   sqlite3_step(pStmt);
                    341: 
                    342:   return sqlite3_finalize(pStmt);
                    343: }
                    344: 
                    345: static
                    346: int queryTokenizer(
                    347:   sqlite3 *db, 
                    348:   char *zName,  
                    349:   const sqlite3_tokenizer_module **pp
                    350: ){
                    351:   int rc;
                    352:   sqlite3_stmt *pStmt;
                    353:   const char zSql[] = "SELECT fts3_tokenizer(?)";
                    354: 
                    355:   *pp = 0;
                    356:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                    357:   if( rc!=SQLITE_OK ){
                    358:     return rc;
                    359:   }
                    360: 
                    361:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    362:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
                    363:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
                    364:       memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
                    365:     }
                    366:   }
                    367: 
                    368:   return sqlite3_finalize(pStmt);
                    369: }
                    370: 
                    371: void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
                    372: 
                    373: /*
                    374: ** Implementation of the scalar function fts3_tokenizer_internal_test().
                    375: ** This function is used for testing only, it is not included in the
                    376: ** build unless SQLITE_TEST is defined.
                    377: **
                    378: ** The purpose of this is to test that the fts3_tokenizer() function
                    379: ** can be used as designed by the C-code in the queryTokenizer and
                    380: ** registerTokenizer() functions above. These two functions are repeated
                    381: ** in the README.tokenizer file as an example, so it is important to
                    382: ** test them.
                    383: **
                    384: ** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
                    385: ** function with no arguments. An assert() will fail if a problem is
                    386: ** detected. i.e.:
                    387: **
                    388: **     SELECT fts3_tokenizer_internal_test();
                    389: **
                    390: */
                    391: static void intTestFunc(
                    392:   sqlite3_context *context,
                    393:   int argc,
                    394:   sqlite3_value **argv
                    395: ){
                    396:   int rc;
                    397:   const sqlite3_tokenizer_module *p1;
                    398:   const sqlite3_tokenizer_module *p2;
                    399:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
                    400: 
                    401:   UNUSED_PARAMETER(argc);
                    402:   UNUSED_PARAMETER(argv);
                    403: 
                    404:   /* Test the query function */
                    405:   sqlite3Fts3SimpleTokenizerModule(&p1);
                    406:   rc = queryTokenizer(db, "simple", &p2);
                    407:   assert( rc==SQLITE_OK );
                    408:   assert( p1==p2 );
                    409:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
                    410:   assert( rc==SQLITE_ERROR );
                    411:   assert( p2==0 );
                    412:   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
                    413: 
                    414:   /* Test the storage function */
                    415:   rc = registerTokenizer(db, "nosuchtokenizer", p1);
                    416:   assert( rc==SQLITE_OK );
                    417:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
                    418:   assert( rc==SQLITE_OK );
                    419:   assert( p2==p1 );
                    420: 
                    421:   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
                    422: }
                    423: 
                    424: #endif
                    425: 
                    426: /*
                    427: ** Set up SQL objects in database db used to access the contents of
                    428: ** the hash table pointed to by argument pHash. The hash table must
                    429: ** been initialised to use string keys, and to take a private copy 
                    430: ** of the key when a value is inserted. i.e. by a call similar to:
                    431: **
                    432: **    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
                    433: **
                    434: ** This function adds a scalar function (see header comment above
                    435: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
                    436: ** defined at compilation time, a temporary virtual table (see header 
                    437: ** comment above struct HashTableVtab) to the database schema. Both 
                    438: ** provide read/write access to the contents of *pHash.
                    439: **
                    440: ** The third argument to this function, zName, is used as the name
                    441: ** of both the scalar and, if created, the virtual table.
                    442: */
                    443: int sqlite3Fts3InitHashTable(
                    444:   sqlite3 *db, 
                    445:   Fts3Hash *pHash, 
                    446:   const char *zName
                    447: ){
                    448:   int rc = SQLITE_OK;
                    449:   void *p = (void *)pHash;
                    450:   const int any = SQLITE_ANY;
                    451: 
                    452: #ifdef SQLITE_TEST
                    453:   char *zTest = 0;
                    454:   char *zTest2 = 0;
                    455:   void *pdb = (void *)db;
                    456:   zTest = sqlite3_mprintf("%s_test", zName);
                    457:   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
                    458:   if( !zTest || !zTest2 ){
                    459:     rc = SQLITE_NOMEM;
                    460:   }
                    461: #endif
                    462: 
                    463:   if( SQLITE_OK==rc ){
                    464:     rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0);
                    465:   }
                    466:   if( SQLITE_OK==rc ){
                    467:     rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0);
                    468:   }
                    469: #ifdef SQLITE_TEST
                    470:   if( SQLITE_OK==rc ){
                    471:     rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
                    472:   }
                    473:   if( SQLITE_OK==rc ){
                    474:     rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
                    475:   }
                    476:   if( SQLITE_OK==rc ){
                    477:     rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
                    478:   }
                    479: #endif
                    480: 
                    481: #ifdef SQLITE_TEST
                    482:   sqlite3_free(zTest);
                    483:   sqlite3_free(zTest2);
                    484: #endif
                    485: 
                    486:   return rc;
                    487: }
                    488: 
                    489: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>