Annotation of embedaddon/sqlite3/ext/fts2/fts2_tokenizer.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2007 June 22
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: ******************************************************************************
                     12: **
                     13: ** This is part of an SQLite module implementing full-text search.
                     14: ** This particular file implements the generic tokenizer interface.
                     15: */
                     16: 
                     17: /*
                     18: ** The code in this file is only compiled if:
                     19: **
                     20: **     * The FTS2 module is being built as an extension
                     21: **       (in which case SQLITE_CORE is not defined), or
                     22: **
                     23: **     * The FTS2 module is being built into the core of
                     24: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
                     25: */
                     26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
                     27: 
                     28: 
                     29: #include "sqlite3.h"
                     30: #include "sqlite3ext.h"
                     31: SQLITE_EXTENSION_INIT1
                     32: 
                     33: #include "fts2_hash.h"
                     34: #include "fts2_tokenizer.h"
                     35: #include <assert.h>
                     36: 
                     37: /*
                     38: ** Implementation of the SQL scalar function for accessing the underlying 
                     39: ** hash table. This function may be called as follows:
                     40: **
                     41: **   SELECT <function-name>(<key-name>);
                     42: **   SELECT <function-name>(<key-name>, <pointer>);
                     43: **
                     44: ** where <function-name> is the name passed as the second argument
                     45: ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
                     46: **
                     47: ** If the <pointer> argument is specified, it must be a blob value
                     48: ** containing a pointer to be stored as the hash data corresponding
                     49: ** to the string <key-name>. If <pointer> is not specified, then
                     50: ** the string <key-name> must already exist in the has table. Otherwise,
                     51: ** an error is returned.
                     52: **
                     53: ** Whether or not the <pointer> argument is specified, the value returned
                     54: ** is a blob containing the pointer stored as the hash data corresponding
                     55: ** to string <key-name> (after the hash-table is updated, if applicable).
                     56: */
                     57: static void scalarFunc(
                     58:   sqlite3_context *context,
                     59:   int argc,
                     60:   sqlite3_value **argv
                     61: ){
                     62:   fts2Hash *pHash;
                     63:   void *pPtr = 0;
                     64:   const unsigned char *zName;
                     65:   int nName;
                     66: 
                     67:   assert( argc==1 || argc==2 );
                     68: 
                     69:   pHash = (fts2Hash *)sqlite3_user_data(context);
                     70: 
                     71:   zName = sqlite3_value_text(argv[0]);
                     72:   nName = sqlite3_value_bytes(argv[0])+1;
                     73: 
                     74:   if( argc==2 ){
                     75:     void *pOld;
                     76:     int n = sqlite3_value_bytes(argv[1]);
                     77:     if( n!=sizeof(pPtr) ){
                     78:       sqlite3_result_error(context, "argument type mismatch", -1);
                     79:       return;
                     80:     }
                     81:     pPtr = *(void **)sqlite3_value_blob(argv[1]);
                     82:     pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
                     83:     if( pOld==pPtr ){
                     84:       sqlite3_result_error(context, "out of memory", -1);
                     85:       return;
                     86:     }
                     87:   }else{
                     88:     pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
                     89:     if( !pPtr ){
                     90:       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
                     91:       sqlite3_result_error(context, zErr, -1);
                     92:       sqlite3_free(zErr);
                     93:       return;
                     94:     }
                     95:   }
                     96: 
                     97:   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
                     98: }
                     99: 
                    100: #ifdef SQLITE_TEST
                    101: 
                    102: #include <tcl.h>
                    103: #include <string.h>
                    104: 
                    105: /*
                    106: ** Implementation of a special SQL scalar function for testing tokenizers 
                    107: ** designed to be used in concert with the Tcl testing framework. This
                    108: ** function must be called with two arguments:
                    109: **
                    110: **   SELECT <function-name>(<key-name>, <input-string>);
                    111: **   SELECT <function-name>(<key-name>, <pointer>);
                    112: **
                    113: ** where <function-name> is the name passed as the second argument
                    114: ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
                    115: ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
                    116: **
                    117: ** The return value is a string that may be interpreted as a Tcl
                    118: ** list. For each token in the <input-string>, three elements are
                    119: ** added to the returned list. The first is the token position, the 
                    120: ** second is the token text (folded, stemmed, etc.) and the third is the
                    121: ** substring of <input-string> associated with the token. For example, 
                    122: ** using the built-in "simple" tokenizer:
                    123: **
                    124: **   SELECT fts_tokenizer_test('simple', 'I don't see how');
                    125: **
                    126: ** will return the string:
                    127: **
                    128: **   "{0 i I 1 dont don't 2 see see 3 how how}"
                    129: **   
                    130: */
                    131: static void testFunc(
                    132:   sqlite3_context *context,
                    133:   int argc,
                    134:   sqlite3_value **argv
                    135: ){
                    136:   fts2Hash *pHash;
                    137:   sqlite3_tokenizer_module *p;
                    138:   sqlite3_tokenizer *pTokenizer = 0;
                    139:   sqlite3_tokenizer_cursor *pCsr = 0;
                    140: 
                    141:   const char *zErr = 0;
                    142: 
                    143:   const char *zName;
                    144:   int nName;
                    145:   const char *zInput;
                    146:   int nInput;
                    147: 
                    148:   const char *zArg = 0;
                    149: 
                    150:   const char *zToken;
                    151:   int nToken;
                    152:   int iStart;
                    153:   int iEnd;
                    154:   int iPos;
                    155: 
                    156:   Tcl_Obj *pRet;
                    157: 
                    158:   assert( argc==2 || argc==3 );
                    159: 
                    160:   nName = sqlite3_value_bytes(argv[0]);
                    161:   zName = (const char *)sqlite3_value_text(argv[0]);
                    162:   nInput = sqlite3_value_bytes(argv[argc-1]);
                    163:   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
                    164: 
                    165:   if( argc==3 ){
                    166:     zArg = (const char *)sqlite3_value_text(argv[1]);
                    167:   }
                    168: 
                    169:   pHash = (fts2Hash *)sqlite3_user_data(context);
                    170:   p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
                    171: 
                    172:   if( !p ){
                    173:     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
                    174:     sqlite3_result_error(context, zErr, -1);
                    175:     sqlite3_free(zErr);
                    176:     return;
                    177:   }
                    178: 
                    179:   pRet = Tcl_NewObj();
                    180:   Tcl_IncrRefCount(pRet);
                    181: 
                    182:   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
                    183:     zErr = "error in xCreate()";
                    184:     goto finish;
                    185:   }
                    186:   pTokenizer->pModule = p;
                    187:   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
                    188:     zErr = "error in xOpen()";
                    189:     goto finish;
                    190:   }
                    191:   pCsr->pTokenizer = pTokenizer;
                    192: 
                    193:   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
                    194:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
                    195:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
                    196:     zToken = &zInput[iStart];
                    197:     nToken = iEnd-iStart;
                    198:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
                    199:   }
                    200: 
                    201:   if( SQLITE_OK!=p->xClose(pCsr) ){
                    202:     zErr = "error in xClose()";
                    203:     goto finish;
                    204:   }
                    205:   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
                    206:     zErr = "error in xDestroy()";
                    207:     goto finish;
                    208:   }
                    209: 
                    210: finish:
                    211:   if( zErr ){
                    212:     sqlite3_result_error(context, zErr, -1);
                    213:   }else{
                    214:     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
                    215:   }
                    216:   Tcl_DecrRefCount(pRet);
                    217: }
                    218: 
                    219: static
                    220: int registerTokenizer(
                    221:   sqlite3 *db, 
                    222:   char *zName, 
                    223:   const sqlite3_tokenizer_module *p
                    224: ){
                    225:   int rc;
                    226:   sqlite3_stmt *pStmt;
                    227:   const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
                    228: 
                    229:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                    230:   if( rc!=SQLITE_OK ){
                    231:     return rc;
                    232:   }
                    233: 
                    234:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    235:   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
                    236:   sqlite3_step(pStmt);
                    237: 
                    238:   return sqlite3_finalize(pStmt);
                    239: }
                    240: 
                    241: static
                    242: int queryFts2Tokenizer(
                    243:   sqlite3 *db, 
                    244:   char *zName,  
                    245:   const sqlite3_tokenizer_module **pp
                    246: ){
                    247:   int rc;
                    248:   sqlite3_stmt *pStmt;
                    249:   const char zSql[] = "SELECT fts2_tokenizer(?)";
                    250: 
                    251:   *pp = 0;
                    252:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                    253:   if( rc!=SQLITE_OK ){
                    254:     return rc;
                    255:   }
                    256: 
                    257:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    258:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
                    259:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
                    260:       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
                    261:     }
                    262:   }
                    263: 
                    264:   return sqlite3_finalize(pStmt);
                    265: }
                    266: 
                    267: void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
                    268: 
                    269: /*
                    270: ** Implementation of the scalar function fts2_tokenizer_internal_test().
                    271: ** This function is used for testing only, it is not included in the
                    272: ** build unless SQLITE_TEST is defined.
                    273: **
                    274: ** The purpose of this is to test that the fts2_tokenizer() function
                    275: ** can be used as designed by the C-code in the queryFts2Tokenizer and
                    276: ** registerTokenizer() functions above. These two functions are repeated
                    277: ** in the README.tokenizer file as an example, so it is important to
                    278: ** test them.
                    279: **
                    280: ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
                    281: ** function with no arguments. An assert() will fail if a problem is
                    282: ** detected. i.e.:
                    283: **
                    284: **     SELECT fts2_tokenizer_internal_test();
                    285: **
                    286: */
                    287: static void intTestFunc(
                    288:   sqlite3_context *context,
                    289:   int argc,
                    290:   sqlite3_value **argv
                    291: ){
                    292:   int rc;
                    293:   const sqlite3_tokenizer_module *p1;
                    294:   const sqlite3_tokenizer_module *p2;
                    295:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
                    296: 
                    297:   /* Test the query function */
                    298:   sqlite3Fts2SimpleTokenizerModule(&p1);
                    299:   rc = queryFts2Tokenizer(db, "simple", &p2);
                    300:   assert( rc==SQLITE_OK );
                    301:   assert( p1==p2 );
                    302:   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
                    303:   assert( rc==SQLITE_ERROR );
                    304:   assert( p2==0 );
                    305:   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
                    306: 
                    307:   /* Test the storage function */
                    308:   rc = registerTokenizer(db, "nosuchtokenizer", p1);
                    309:   assert( rc==SQLITE_OK );
                    310:   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
                    311:   assert( rc==SQLITE_OK );
                    312:   assert( p2==p1 );
                    313: 
                    314:   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
                    315: }
                    316: 
                    317: #endif
                    318: 
                    319: /*
                    320: ** Set up SQL objects in database db used to access the contents of
                    321: ** the hash table pointed to by argument pHash. The hash table must
                    322: ** been initialised to use string keys, and to take a private copy 
                    323: ** of the key when a value is inserted. i.e. by a call similar to:
                    324: **
                    325: **    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
                    326: **
                    327: ** This function adds a scalar function (see header comment above
                    328: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
                    329: ** defined at compilation time, a temporary virtual table (see header 
                    330: ** comment above struct HashTableVtab) to the database schema. Both 
                    331: ** provide read/write access to the contents of *pHash.
                    332: **
                    333: ** The third argument to this function, zName, is used as the name
                    334: ** of both the scalar and, if created, the virtual table.
                    335: */
                    336: int sqlite3Fts2InitHashTable(
                    337:   sqlite3 *db, 
                    338:   fts2Hash *pHash, 
                    339:   const char *zName
                    340: ){
                    341:   int rc = SQLITE_OK;
                    342:   void *p = (void *)pHash;
                    343:   const int any = SQLITE_ANY;
                    344:   char *zTest = 0;
                    345:   char *zTest2 = 0;
                    346: 
                    347: #ifdef SQLITE_TEST
                    348:   void *pdb = (void *)db;
                    349:   zTest = sqlite3_mprintf("%s_test", zName);
                    350:   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
                    351:   if( !zTest || !zTest2 ){
                    352:     rc = SQLITE_NOMEM;
                    353:   }
                    354: #endif
                    355: 
                    356:   if( rc!=SQLITE_OK
                    357:    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
                    358:    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
                    359: #ifdef SQLITE_TEST
                    360:    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
                    361:    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
                    362:    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
                    363: #endif
                    364:   );
                    365: 
                    366:   sqlite3_free(zTest);
                    367:   sqlite3_free(zTest2);
                    368:   return rc;
                    369: }
                    370: 
                    371: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>