Annotation of embedaddon/sqlite3/ext/fts3/fts3_tokenizer.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2007 June 22
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: ******************************************************************************
        !            12: **
        !            13: ** This is part of an SQLite module implementing full-text search.
        !            14: ** This particular file implements the generic tokenizer interface.
        !            15: */
        !            16: 
        !            17: /*
        !            18: ** The code in this file is only compiled if:
        !            19: **
        !            20: **     * The FTS3 module is being built as an extension
        !            21: **       (in which case SQLITE_CORE is not defined), or
        !            22: **
        !            23: **     * The FTS3 module is being built into the core of
        !            24: **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
        !            25: */
        !            26: #include "fts3Int.h"
        !            27: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
        !            28: 
        !            29: #include <assert.h>
        !            30: #include <string.h>
        !            31: 
        !            32: /*
        !            33: ** Implementation of the SQL scalar function for accessing the underlying 
        !            34: ** hash table. This function may be called as follows:
        !            35: **
        !            36: **   SELECT <function-name>(<key-name>);
        !            37: **   SELECT <function-name>(<key-name>, <pointer>);
        !            38: **
        !            39: ** where <function-name> is the name passed as the second argument
        !            40: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
        !            41: **
        !            42: ** If the <pointer> argument is specified, it must be a blob value
        !            43: ** containing a pointer to be stored as the hash data corresponding
        !            44: ** to the string <key-name>. If <pointer> is not specified, then
        !            45: ** the string <key-name> must already exist in the has table. Otherwise,
        !            46: ** an error is returned.
        !            47: **
        !            48: ** Whether or not the <pointer> argument is specified, the value returned
        !            49: ** is a blob containing the pointer stored as the hash data corresponding
        !            50: ** to string <key-name> (after the hash-table is updated, if applicable).
        !            51: */
        !            52: static void scalarFunc(
        !            53:   sqlite3_context *context,
        !            54:   int argc,
        !            55:   sqlite3_value **argv
        !            56: ){
        !            57:   Fts3Hash *pHash;
        !            58:   void *pPtr = 0;
        !            59:   const unsigned char *zName;
        !            60:   int nName;
        !            61: 
        !            62:   assert( argc==1 || argc==2 );
        !            63: 
        !            64:   pHash = (Fts3Hash *)sqlite3_user_data(context);
        !            65: 
        !            66:   zName = sqlite3_value_text(argv[0]);
        !            67:   nName = sqlite3_value_bytes(argv[0])+1;
        !            68: 
        !            69:   if( argc==2 ){
        !            70:     void *pOld;
        !            71:     int n = sqlite3_value_bytes(argv[1]);
        !            72:     if( n!=sizeof(pPtr) ){
        !            73:       sqlite3_result_error(context, "argument type mismatch", -1);
        !            74:       return;
        !            75:     }
        !            76:     pPtr = *(void **)sqlite3_value_blob(argv[1]);
        !            77:     pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
        !            78:     if( pOld==pPtr ){
        !            79:       sqlite3_result_error(context, "out of memory", -1);
        !            80:       return;
        !            81:     }
        !            82:   }else{
        !            83:     pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
        !            84:     if( !pPtr ){
        !            85:       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
        !            86:       sqlite3_result_error(context, zErr, -1);
        !            87:       sqlite3_free(zErr);
        !            88:       return;
        !            89:     }
        !            90:   }
        !            91: 
        !            92:   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
        !            93: }
        !            94: 
        /*
        ** Return 1 if byte c may appear in an unquoted FTS identifier token,
        ** or 0 otherwise. The accepted bytes are:
        **
        **     * any byte with the high bit set (0x80..0xFF),
        **     * 0x30..0x39  (ASCII digits),
        **     * 0x41..0x5A  (ASCII upper-case letters),
        **     * 0x61..0x7A  (ASCII lower-case letters),
        **     * 0x24 ('$') and 0x5F ('_').
        */
        int sqlite3Fts3IsIdChar(char c){
          const unsigned char u = (unsigned char)c;

          if( u>=0x80 ) return 1;              /* high bit set */
          if( u>=0x30 && u<=0x39 ) return 1;   /* 0-9 */
          if( u>=0x41 && u<=0x5A ) return 1;   /* A-Z */
          if( u>=0x61 && u<=0x7A ) return 1;   /* a-z */
          return ( u==0x24 || u==0x5F );       /* '$' or '_' */
        }
        !           108: 
        /*
        ** Locate the next token in nul-terminated string zStr.
        **
        ** A token is one of:
        **
        **     * a run of characters for which sqlite3Fts3IsIdChar() is true,
        **     * a string quoted with ', " or ` (a doubled quote character inside
        **       the string does not terminate it); the token includes the
        **       quotes, or
        **     * a [bracketed] name, including the brackets.
        **
        ** Any other character is skipped. An unterminated quoted or bracketed
        ** token extends to the end of the string.
        **
        ** On success a pointer to the first byte of the token is returned and
        ** *pn is set to its length in bytes. If the end of the string is reached
        ** without finding a token, 0 is returned and *pn is left unchanged.
        */
        const char *sqlite3Fts3NextToken(const char *zStr, int *pn){
          const char *zTok = zStr;        /* Candidate start of the token */
          const char *zStop = 0;          /* One past the last byte of the token */

          while( zStop==0 ){
            const char c = *zTok;

            if( c=='\0' ){
              return 0;                   /* No more tokens here */
            }

            if( c=='\'' || c=='"' || c=='`' ){
              /* Quoted string: scan to the matching close quote, treating a
              ** doubled quote as an escaped literal quote. */
              zStop = &zTok[1];
              while( *zStop ){
                if( *zStop==c ){
                  zStop++;
                  if( *zStop!=c ) break;  /* That was the closing quote */
                }
                zStop++;
              }
            }else if( c=='[' ){
              /* Bracketed name: scan to ']' and include it if present. */
              zStop = &zTok[1];
              while( *zStop && *zStop!=']' ){
                zStop++;
              }
              if( *zStop ){
                zStop++;
              }
            }else if( sqlite3Fts3IsIdChar(c) ){
              /* Bare identifier: consume the whole run of id characters. */
              zStop = &zTok[1];
              while( sqlite3Fts3IsIdChar(*zStop) ){
                zStop++;
              }
            }else{
              zTok++;                     /* Separator - keep looking */
            }
          }

          *pn = (int)(zStop-zTok);
          return zTok;
        }
        !           145: 
        !           146: int sqlite3Fts3InitTokenizer(
        !           147:   Fts3Hash *pHash,                /* Tokenizer hash table */
        !           148:   const char *zArg,               /* Tokenizer name */
        !           149:   sqlite3_tokenizer **ppTok,      /* OUT: Tokenizer (if applicable) */
        !           150:   char **pzErr                    /* OUT: Set to malloced error message */
        !           151: ){
        !           152:   int rc;
        !           153:   char *z = (char *)zArg;
        !           154:   int n = 0;
        !           155:   char *zCopy;
        !           156:   char *zEnd;                     /* Pointer to nul-term of zCopy */
        !           157:   sqlite3_tokenizer_module *m;
        !           158: 
        !           159:   zCopy = sqlite3_mprintf("%s", zArg);
        !           160:   if( !zCopy ) return SQLITE_NOMEM;
        !           161:   zEnd = &zCopy[strlen(zCopy)];
        !           162: 
        !           163:   z = (char *)sqlite3Fts3NextToken(zCopy, &n);
        !           164:   z[n] = '\0';
        !           165:   sqlite3Fts3Dequote(z);
        !           166: 
        !           167:   m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash,z,(int)strlen(z)+1);
        !           168:   if( !m ){
        !           169:     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", z);
        !           170:     rc = SQLITE_ERROR;
        !           171:   }else{
        !           172:     char const **aArg = 0;
        !           173:     int iArg = 0;
        !           174:     z = &z[n+1];
        !           175:     while( z<zEnd && (NULL!=(z = (char *)sqlite3Fts3NextToken(z, &n))) ){
        !           176:       int nNew = sizeof(char *)*(iArg+1);
        !           177:       char const **aNew = (const char **)sqlite3_realloc((void *)aArg, nNew);
        !           178:       if( !aNew ){
        !           179:         sqlite3_free(zCopy);
        !           180:         sqlite3_free((void *)aArg);
        !           181:         return SQLITE_NOMEM;
        !           182:       }
        !           183:       aArg = aNew;
        !           184:       aArg[iArg++] = z;
        !           185:       z[n] = '\0';
        !           186:       sqlite3Fts3Dequote(z);
        !           187:       z = &z[n+1];
        !           188:     }
        !           189:     rc = m->xCreate(iArg, aArg, ppTok);
        !           190:     assert( rc!=SQLITE_OK || *ppTok );
        !           191:     if( rc!=SQLITE_OK ){
        !           192:       *pzErr = sqlite3_mprintf("unknown tokenizer");
        !           193:     }else{
        !           194:       (*ppTok)->pModule = m; 
        !           195:     }
        !           196:     sqlite3_free((void *)aArg);
        !           197:   }
        !           198: 
        !           199:   sqlite3_free(zCopy);
        !           200:   return rc;
        !           201: }
        !           202: 
        !           203: 
        !           204: #ifdef SQLITE_TEST
        !           205: 
        !           206: #include <tcl.h>
        !           207: #include <string.h>
        !           208: 
        !           209: /*
        !           210: ** Implementation of a special SQL scalar function for testing tokenizers 
        !           211: ** designed to be used in concert with the Tcl testing framework. This
        !           212: ** function must be called with two arguments:
        !           213: **
        !           214: **   SELECT <function-name>(<key-name>, <input-string>);
        !           215: **   SELECT <function-name>(<key-name>, <pointer>);
        !           216: **
        !           217: ** where <function-name> is the name passed as the second argument
        !           218: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
        !           219: ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
        !           220: **
        !           221: ** The return value is a string that may be interpreted as a Tcl
        !           222: ** list. For each token in the <input-string>, three elements are
        !           223: ** added to the returned list. The first is the token position, the 
        !           224: ** second is the token text (folded, stemmed, etc.) and the third is the
        !           225: ** substring of <input-string> associated with the token. For example, 
        !           226: ** using the built-in "simple" tokenizer:
        !           227: **
        !           228: **   SELECT fts_tokenizer_test('simple', 'I don't see how');
        !           229: **
        !           230: ** will return the string:
        !           231: **
        !           232: **   "{0 i I 1 dont don't 2 see see 3 how how}"
        !           233: **   
        !           234: */
        !           235: static void testFunc(
        !           236:   sqlite3_context *context,
        !           237:   int argc,
        !           238:   sqlite3_value **argv
        !           239: ){
        !           240:   Fts3Hash *pHash;
        !           241:   sqlite3_tokenizer_module *p;
        !           242:   sqlite3_tokenizer *pTokenizer = 0;
        !           243:   sqlite3_tokenizer_cursor *pCsr = 0;
        !           244: 
        !           245:   const char *zErr = 0;
        !           246: 
        !           247:   const char *zName;
        !           248:   int nName;
        !           249:   const char *zInput;
        !           250:   int nInput;
        !           251: 
        !           252:   const char *zArg = 0;
        !           253: 
        !           254:   const char *zToken;
        !           255:   int nToken;
        !           256:   int iStart;
        !           257:   int iEnd;
        !           258:   int iPos;
        !           259: 
        !           260:   Tcl_Obj *pRet;
        !           261: 
        !           262:   assert( argc==2 || argc==3 );
        !           263: 
        !           264:   nName = sqlite3_value_bytes(argv[0]);
        !           265:   zName = (const char *)sqlite3_value_text(argv[0]);
        !           266:   nInput = sqlite3_value_bytes(argv[argc-1]);
        !           267:   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
        !           268: 
        !           269:   if( argc==3 ){
        !           270:     zArg = (const char *)sqlite3_value_text(argv[1]);
        !           271:   }
        !           272: 
        !           273:   pHash = (Fts3Hash *)sqlite3_user_data(context);
        !           274:   p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
        !           275: 
        !           276:   if( !p ){
        !           277:     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
        !           278:     sqlite3_result_error(context, zErr, -1);
        !           279:     sqlite3_free(zErr);
        !           280:     return;
        !           281:   }
        !           282: 
        !           283:   pRet = Tcl_NewObj();
        !           284:   Tcl_IncrRefCount(pRet);
        !           285: 
        !           286:   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
        !           287:     zErr = "error in xCreate()";
        !           288:     goto finish;
        !           289:   }
        !           290:   pTokenizer->pModule = p;
        !           291:   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
        !           292:     zErr = "error in xOpen()";
        !           293:     goto finish;
        !           294:   }
        !           295:   pCsr->pTokenizer = pTokenizer;
        !           296: 
        !           297:   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
        !           298:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
        !           299:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
        !           300:     zToken = &zInput[iStart];
        !           301:     nToken = iEnd-iStart;
        !           302:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
        !           303:   }
        !           304: 
        !           305:   if( SQLITE_OK!=p->xClose(pCsr) ){
        !           306:     zErr = "error in xClose()";
        !           307:     goto finish;
        !           308:   }
        !           309:   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
        !           310:     zErr = "error in xDestroy()";
        !           311:     goto finish;
        !           312:   }
        !           313: 
        !           314: finish:
        !           315:   if( zErr ){
        !           316:     sqlite3_result_error(context, zErr, -1);
        !           317:   }else{
        !           318:     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
        !           319:   }
        !           320:   Tcl_DecrRefCount(pRet);
        !           321: }
        !           322: 
        !           323: static
        !           324: int registerTokenizer(
        !           325:   sqlite3 *db, 
        !           326:   char *zName, 
        !           327:   const sqlite3_tokenizer_module *p
        !           328: ){
        !           329:   int rc;
        !           330:   sqlite3_stmt *pStmt;
        !           331:   const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
        !           332: 
        !           333:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !           334:   if( rc!=SQLITE_OK ){
        !           335:     return rc;
        !           336:   }
        !           337: 
        !           338:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           339:   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
        !           340:   sqlite3_step(pStmt);
        !           341: 
        !           342:   return sqlite3_finalize(pStmt);
        !           343: }
        !           344: 
        !           345: static
        !           346: int queryTokenizer(
        !           347:   sqlite3 *db, 
        !           348:   char *zName,  
        !           349:   const sqlite3_tokenizer_module **pp
        !           350: ){
        !           351:   int rc;
        !           352:   sqlite3_stmt *pStmt;
        !           353:   const char zSql[] = "SELECT fts3_tokenizer(?)";
        !           354: 
        !           355:   *pp = 0;
        !           356:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !           357:   if( rc!=SQLITE_OK ){
        !           358:     return rc;
        !           359:   }
        !           360: 
        !           361:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           362:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
        !           363:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
        !           364:       memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
        !           365:     }
        !           366:   }
        !           367: 
        !           368:   return sqlite3_finalize(pStmt);
        !           369: }
        !           370: 
        !           371: void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
        !           372: 
        !           373: /*
        !           374: ** Implementation of the scalar function fts3_tokenizer_internal_test().
        !           375: ** This function is used for testing only, it is not included in the
        !           376: ** build unless SQLITE_TEST is defined.
        !           377: **
        !           378: ** The purpose of this is to test that the fts3_tokenizer() function
        !           379: ** can be used as designed by the C-code in the queryTokenizer and
        !           380: ** registerTokenizer() functions above. These two functions are repeated
        !           381: ** in the README.tokenizer file as an example, so it is important to
        !           382: ** test them.
        !           383: **
        !           384: ** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
        !           385: ** function with no arguments. An assert() will fail if a problem is
        !           386: ** detected. i.e.:
        !           387: **
        !           388: **     SELECT fts3_tokenizer_internal_test();
        !           389: **
        !           390: */
        !           391: static void intTestFunc(
        !           392:   sqlite3_context *context,
        !           393:   int argc,
        !           394:   sqlite3_value **argv
        !           395: ){
        !           396:   int rc;
        !           397:   const sqlite3_tokenizer_module *p1;
        !           398:   const sqlite3_tokenizer_module *p2;
        !           399:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
        !           400: 
        !           401:   UNUSED_PARAMETER(argc);
        !           402:   UNUSED_PARAMETER(argv);
        !           403: 
        !           404:   /* Test the query function */
        !           405:   sqlite3Fts3SimpleTokenizerModule(&p1);
        !           406:   rc = queryTokenizer(db, "simple", &p2);
        !           407:   assert( rc==SQLITE_OK );
        !           408:   assert( p1==p2 );
        !           409:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
        !           410:   assert( rc==SQLITE_ERROR );
        !           411:   assert( p2==0 );
        !           412:   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
        !           413: 
        !           414:   /* Test the storage function */
        !           415:   rc = registerTokenizer(db, "nosuchtokenizer", p1);
        !           416:   assert( rc==SQLITE_OK );
        !           417:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
        !           418:   assert( rc==SQLITE_OK );
        !           419:   assert( p2==p1 );
        !           420: 
        !           421:   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
        !           422: }
        !           423: 
        !           424: #endif
        !           425: 
        !           426: /*
        !           427: ** Set up SQL objects in database db used to access the contents of
        !           428: ** the hash table pointed to by argument pHash. The hash table must
        !           429: ** been initialised to use string keys, and to take a private copy 
        !           430: ** of the key when a value is inserted. i.e. by a call similar to:
        !           431: **
        !           432: **    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
        !           433: **
        !           434: ** This function adds a scalar function (see header comment above
        !           435: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
        !           436: ** defined at compilation time, a temporary virtual table (see header 
        !           437: ** comment above struct HashTableVtab) to the database schema. Both 
        !           438: ** provide read/write access to the contents of *pHash.
        !           439: **
        !           440: ** The third argument to this function, zName, is used as the name
        !           441: ** of both the scalar and, if created, the virtual table.
        !           442: */
        !           443: int sqlite3Fts3InitHashTable(
        !           444:   sqlite3 *db, 
        !           445:   Fts3Hash *pHash, 
        !           446:   const char *zName
        !           447: ){
        !           448:   int rc = SQLITE_OK;
        !           449:   void *p = (void *)pHash;
        !           450:   const int any = SQLITE_ANY;
        !           451: 
        !           452: #ifdef SQLITE_TEST
        !           453:   char *zTest = 0;
        !           454:   char *zTest2 = 0;
        !           455:   void *pdb = (void *)db;
        !           456:   zTest = sqlite3_mprintf("%s_test", zName);
        !           457:   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
        !           458:   if( !zTest || !zTest2 ){
        !           459:     rc = SQLITE_NOMEM;
        !           460:   }
        !           461: #endif
        !           462: 
        !           463:   if( SQLITE_OK==rc ){
        !           464:     rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0);
        !           465:   }
        !           466:   if( SQLITE_OK==rc ){
        !           467:     rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0);
        !           468:   }
        !           469: #ifdef SQLITE_TEST
        !           470:   if( SQLITE_OK==rc ){
        !           471:     rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
        !           472:   }
        !           473:   if( SQLITE_OK==rc ){
        !           474:     rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
        !           475:   }
        !           476:   if( SQLITE_OK==rc ){
        !           477:     rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
        !           478:   }
        !           479: #endif
        !           480: 
        !           481: #ifdef SQLITE_TEST
        !           482:   sqlite3_free(zTest);
        !           483:   sqlite3_free(zTest2);
        !           484: #endif
        !           485: 
        !           486:   return rc;
        !           487: }
        !           488: 
        !           489: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>