Annotation of embedaddon/sqlite3/ext/fts2/fts2_tokenizer.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2007 June 22
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: ******************************************************************************
        !            12: **
        !            13: ** This is part of an SQLite module implementing full-text search.
        !            14: ** This particular file implements the generic tokenizer interface.
        !            15: */
        !            16: 
        !            17: /*
        !            18: ** The code in this file is only compiled if:
        !            19: **
        !            20: **     * The FTS2 module is being built as an extension
        !            21: **       (in which case SQLITE_CORE is not defined), or
        !            22: **
        !            23: **     * The FTS2 module is being built into the core of
        !            24: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
        !            25: */
        !            26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
        !            27: 
        !            28: 
        !            29: #include "sqlite3.h"
        !            30: #include "sqlite3ext.h"
        !            31: SQLITE_EXTENSION_INIT1
        !            32: 
        !            33: #include "fts2_hash.h"
        !            34: #include "fts2_tokenizer.h"
        !            35: #include <assert.h>
        !            36: 
        !            37: /*
        !            38: ** Implementation of the SQL scalar function for accessing the underlying 
        !            39: ** hash table. This function may be called as follows:
        !            40: **
        !            41: **   SELECT <function-name>(<key-name>);
        !            42: **   SELECT <function-name>(<key-name>, <pointer>);
        !            43: **
        !            44: ** where <function-name> is the name passed as the second argument
        !            45: ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
        !            46: **
        !            47: ** If the <pointer> argument is specified, it must be a blob value
        !            48: ** containing a pointer to be stored as the hash data corresponding
        !            49: ** to the string <key-name>. If <pointer> is not specified, then
        !            50: ** the string <key-name> must already exist in the has table. Otherwise,
        !            51: ** an error is returned.
        !            52: **
        !            53: ** Whether or not the <pointer> argument is specified, the value returned
        !            54: ** is a blob containing the pointer stored as the hash data corresponding
        !            55: ** to string <key-name> (after the hash-table is updated, if applicable).
        !            56: */
        !            57: static void scalarFunc(
        !            58:   sqlite3_context *context,
        !            59:   int argc,
        !            60:   sqlite3_value **argv
        !            61: ){
        !            62:   fts2Hash *pHash;
        !            63:   void *pPtr = 0;
        !            64:   const unsigned char *zName;
        !            65:   int nName;
        !            66: 
        !            67:   assert( argc==1 || argc==2 );
        !            68: 
        !            69:   pHash = (fts2Hash *)sqlite3_user_data(context);
        !            70: 
        !            71:   zName = sqlite3_value_text(argv[0]);
        !            72:   nName = sqlite3_value_bytes(argv[0])+1;
        !            73: 
        !            74:   if( argc==2 ){
        !            75:     void *pOld;
        !            76:     int n = sqlite3_value_bytes(argv[1]);
        !            77:     if( n!=sizeof(pPtr) ){
        !            78:       sqlite3_result_error(context, "argument type mismatch", -1);
        !            79:       return;
        !            80:     }
        !            81:     pPtr = *(void **)sqlite3_value_blob(argv[1]);
        !            82:     pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
        !            83:     if( pOld==pPtr ){
        !            84:       sqlite3_result_error(context, "out of memory", -1);
        !            85:       return;
        !            86:     }
        !            87:   }else{
        !            88:     pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
        !            89:     if( !pPtr ){
        !            90:       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
        !            91:       sqlite3_result_error(context, zErr, -1);
        !            92:       sqlite3_free(zErr);
        !            93:       return;
        !            94:     }
        !            95:   }
        !            96: 
        !            97:   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
        !            98: }
        !            99: 
        !           100: #ifdef SQLITE_TEST
        !           101: 
        !           102: #include <tcl.h>
        !           103: #include <string.h>
        !           104: 
        !           105: /*
        !           106: ** Implementation of a special SQL scalar function for testing tokenizers 
        !           107: ** designed to be used in concert with the Tcl testing framework. This
        !           108: ** function must be called with two arguments:
        !           109: **
        !           110: **   SELECT <function-name>(<key-name>, <input-string>);
        !           111: **   SELECT <function-name>(<key-name>, <pointer>);
        !           112: **
        !           113: ** where <function-name> is the name passed as the second argument
        !           114: ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
        !           115: ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
        !           116: **
        !           117: ** The return value is a string that may be interpreted as a Tcl
        !           118: ** list. For each token in the <input-string>, three elements are
        !           119: ** added to the returned list. The first is the token position, the 
        !           120: ** second is the token text (folded, stemmed, etc.) and the third is the
        !           121: ** substring of <input-string> associated with the token. For example, 
        !           122: ** using the built-in "simple" tokenizer:
        !           123: **
        !           124: **   SELECT fts_tokenizer_test('simple', 'I don't see how');
        !           125: **
        !           126: ** will return the string:
        !           127: **
        !           128: **   "{0 i I 1 dont don't 2 see see 3 how how}"
        !           129: **   
        !           130: */
        !           131: static void testFunc(
        !           132:   sqlite3_context *context,
        !           133:   int argc,
        !           134:   sqlite3_value **argv
        !           135: ){
        !           136:   fts2Hash *pHash;
        !           137:   sqlite3_tokenizer_module *p;
        !           138:   sqlite3_tokenizer *pTokenizer = 0;
        !           139:   sqlite3_tokenizer_cursor *pCsr = 0;
        !           140: 
        !           141:   const char *zErr = 0;
        !           142: 
        !           143:   const char *zName;
        !           144:   int nName;
        !           145:   const char *zInput;
        !           146:   int nInput;
        !           147: 
        !           148:   const char *zArg = 0;
        !           149: 
        !           150:   const char *zToken;
        !           151:   int nToken;
        !           152:   int iStart;
        !           153:   int iEnd;
        !           154:   int iPos;
        !           155: 
        !           156:   Tcl_Obj *pRet;
        !           157: 
        !           158:   assert( argc==2 || argc==3 );
        !           159: 
        !           160:   nName = sqlite3_value_bytes(argv[0]);
        !           161:   zName = (const char *)sqlite3_value_text(argv[0]);
        !           162:   nInput = sqlite3_value_bytes(argv[argc-1]);
        !           163:   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
        !           164: 
        !           165:   if( argc==3 ){
        !           166:     zArg = (const char *)sqlite3_value_text(argv[1]);
        !           167:   }
        !           168: 
        !           169:   pHash = (fts2Hash *)sqlite3_user_data(context);
        !           170:   p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
        !           171: 
        !           172:   if( !p ){
        !           173:     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
        !           174:     sqlite3_result_error(context, zErr, -1);
        !           175:     sqlite3_free(zErr);
        !           176:     return;
        !           177:   }
        !           178: 
        !           179:   pRet = Tcl_NewObj();
        !           180:   Tcl_IncrRefCount(pRet);
        !           181: 
        !           182:   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
        !           183:     zErr = "error in xCreate()";
        !           184:     goto finish;
        !           185:   }
        !           186:   pTokenizer->pModule = p;
        !           187:   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
        !           188:     zErr = "error in xOpen()";
        !           189:     goto finish;
        !           190:   }
        !           191:   pCsr->pTokenizer = pTokenizer;
        !           192: 
        !           193:   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
        !           194:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
        !           195:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
        !           196:     zToken = &zInput[iStart];
        !           197:     nToken = iEnd-iStart;
        !           198:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
        !           199:   }
        !           200: 
        !           201:   if( SQLITE_OK!=p->xClose(pCsr) ){
        !           202:     zErr = "error in xClose()";
        !           203:     goto finish;
        !           204:   }
        !           205:   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
        !           206:     zErr = "error in xDestroy()";
        !           207:     goto finish;
        !           208:   }
        !           209: 
        !           210: finish:
        !           211:   if( zErr ){
        !           212:     sqlite3_result_error(context, zErr, -1);
        !           213:   }else{
        !           214:     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
        !           215:   }
        !           216:   Tcl_DecrRefCount(pRet);
        !           217: }
        !           218: 
        !           219: static
        !           220: int registerTokenizer(
        !           221:   sqlite3 *db, 
        !           222:   char *zName, 
        !           223:   const sqlite3_tokenizer_module *p
        !           224: ){
        !           225:   int rc;
        !           226:   sqlite3_stmt *pStmt;
        !           227:   const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
        !           228: 
        !           229:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !           230:   if( rc!=SQLITE_OK ){
        !           231:     return rc;
        !           232:   }
        !           233: 
        !           234:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           235:   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
        !           236:   sqlite3_step(pStmt);
        !           237: 
        !           238:   return sqlite3_finalize(pStmt);
        !           239: }
        !           240: 
        !           241: static
        !           242: int queryFts2Tokenizer(
        !           243:   sqlite3 *db, 
        !           244:   char *zName,  
        !           245:   const sqlite3_tokenizer_module **pp
        !           246: ){
        !           247:   int rc;
        !           248:   sqlite3_stmt *pStmt;
        !           249:   const char zSql[] = "SELECT fts2_tokenizer(?)";
        !           250: 
        !           251:   *pp = 0;
        !           252:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !           253:   if( rc!=SQLITE_OK ){
        !           254:     return rc;
        !           255:   }
        !           256: 
        !           257:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           258:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
        !           259:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
        !           260:       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
        !           261:     }
        !           262:   }
        !           263: 
        !           264:   return sqlite3_finalize(pStmt);
        !           265: }
        !           266: 
        !           267: void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
        !           268: 
        !           269: /*
        !           270: ** Implementation of the scalar function fts2_tokenizer_internal_test().
        !           271: ** This function is used for testing only, it is not included in the
        !           272: ** build unless SQLITE_TEST is defined.
        !           273: **
        !           274: ** The purpose of this is to test that the fts2_tokenizer() function
        !           275: ** can be used as designed by the C-code in the queryFts2Tokenizer and
        !           276: ** registerTokenizer() functions above. These two functions are repeated
        !           277: ** in the README.tokenizer file as an example, so it is important to
        !           278: ** test them.
        !           279: **
        !           280: ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
        !           281: ** function with no arguments. An assert() will fail if a problem is
        !           282: ** detected. i.e.:
        !           283: **
        !           284: **     SELECT fts2_tokenizer_internal_test();
        !           285: **
        !           286: */
        !           287: static void intTestFunc(
        !           288:   sqlite3_context *context,
        !           289:   int argc,
        !           290:   sqlite3_value **argv
        !           291: ){
        !           292:   int rc;
        !           293:   const sqlite3_tokenizer_module *p1;
        !           294:   const sqlite3_tokenizer_module *p2;
        !           295:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
        !           296: 
        !           297:   /* Test the query function */
        !           298:   sqlite3Fts2SimpleTokenizerModule(&p1);
        !           299:   rc = queryFts2Tokenizer(db, "simple", &p2);
        !           300:   assert( rc==SQLITE_OK );
        !           301:   assert( p1==p2 );
        !           302:   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
        !           303:   assert( rc==SQLITE_ERROR );
        !           304:   assert( p2==0 );
        !           305:   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
        !           306: 
        !           307:   /* Test the storage function */
        !           308:   rc = registerTokenizer(db, "nosuchtokenizer", p1);
        !           309:   assert( rc==SQLITE_OK );
        !           310:   rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
        !           311:   assert( rc==SQLITE_OK );
        !           312:   assert( p2==p1 );
        !           313: 
        !           314:   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
        !           315: }
        !           316: 
        !           317: #endif
        !           318: 
        !           319: /*
        !           320: ** Set up SQL objects in database db used to access the contents of
        !           321: ** the hash table pointed to by argument pHash. The hash table must
        !           322: ** been initialised to use string keys, and to take a private copy 
        !           323: ** of the key when a value is inserted. i.e. by a call similar to:
        !           324: **
        !           325: **    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
        !           326: **
        !           327: ** This function adds a scalar function (see header comment above
        !           328: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
        !           329: ** defined at compilation time, a temporary virtual table (see header 
        !           330: ** comment above struct HashTableVtab) to the database schema. Both 
        !           331: ** provide read/write access to the contents of *pHash.
        !           332: **
        !           333: ** The third argument to this function, zName, is used as the name
        !           334: ** of both the scalar and, if created, the virtual table.
        !           335: */
        !           336: int sqlite3Fts2InitHashTable(
        !           337:   sqlite3 *db, 
        !           338:   fts2Hash *pHash, 
        !           339:   const char *zName
        !           340: ){
        !           341:   int rc = SQLITE_OK;
        !           342:   void *p = (void *)pHash;
        !           343:   const int any = SQLITE_ANY;
        !           344:   char *zTest = 0;
        !           345:   char *zTest2 = 0;
        !           346: 
        !           347: #ifdef SQLITE_TEST
        !           348:   void *pdb = (void *)db;
        !           349:   zTest = sqlite3_mprintf("%s_test", zName);
        !           350:   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
        !           351:   if( !zTest || !zTest2 ){
        !           352:     rc = SQLITE_NOMEM;
        !           353:   }
        !           354: #endif
        !           355: 
        !           356:   if( rc!=SQLITE_OK
        !           357:    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
        !           358:    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
        !           359: #ifdef SQLITE_TEST
        !           360:    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
        !           361:    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
        !           362:    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
        !           363: #endif
        !           364:   );
        !           365: 
        !           366:   sqlite3_free(zTest);
        !           367:   sqlite3_free(zTest2);
        !           368:   return rc;
        !           369: }
        !           370: 
        !           371: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>