File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts3 / fts3_tokenizer.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (12 years, 4 months ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: /*
    2: ** 2007 June 22
    3: **
    4: ** The author disclaims copyright to this source code.  In place of
    5: ** a legal notice, here is a blessing:
    6: **
    7: **    May you do good and not evil.
    8: **    May you find forgiveness for yourself and forgive others.
    9: **    May you share freely, never taking more than you give.
   10: **
   11: ******************************************************************************
   12: **
   13: ** This is part of an SQLite module implementing full-text search.
   14: ** This particular file implements the generic tokenizer interface.
   15: */
   16: 
   17: /*
   18: ** The code in this file is only compiled if:
   19: **
   20: **     * The FTS3 module is being built as an extension
   21: **       (in which case SQLITE_CORE is not defined), or
   22: **
   23: **     * The FTS3 module is being built into the core of
   24: **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
   25: */
   26: #include "fts3Int.h"
   27: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
   28: 
   29: #include <assert.h>
   30: #include <string.h>
   31: 
   32: /*
   33: ** Implementation of the SQL scalar function for accessing the underlying 
   34: ** hash table. This function may be called as follows:
   35: **
   36: **   SELECT <function-name>(<key-name>);
   37: **   SELECT <function-name>(<key-name>, <pointer>);
   38: **
   39: ** where <function-name> is the name passed as the second argument
   40: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
   41: **
   42: ** If the <pointer> argument is specified, it must be a blob value
   43: ** containing a pointer to be stored as the hash data corresponding
   44: ** to the string <key-name>. If <pointer> is not specified, then
   45: ** the string <key-name> must already exist in the has table. Otherwise,
   46: ** an error is returned.
   47: **
   48: ** Whether or not the <pointer> argument is specified, the value returned
   49: ** is a blob containing the pointer stored as the hash data corresponding
   50: ** to string <key-name> (after the hash-table is updated, if applicable).
   51: */
   52: static void scalarFunc(
   53:   sqlite3_context *context,
   54:   int argc,
   55:   sqlite3_value **argv
   56: ){
   57:   Fts3Hash *pHash;
   58:   void *pPtr = 0;
   59:   const unsigned char *zName;
   60:   int nName;
   61: 
   62:   assert( argc==1 || argc==2 );
   63: 
   64:   pHash = (Fts3Hash *)sqlite3_user_data(context);
   65: 
   66:   zName = sqlite3_value_text(argv[0]);
   67:   nName = sqlite3_value_bytes(argv[0])+1;
   68: 
   69:   if( argc==2 ){
   70:     void *pOld;
   71:     int n = sqlite3_value_bytes(argv[1]);
   72:     if( n!=sizeof(pPtr) ){
   73:       sqlite3_result_error(context, "argument type mismatch", -1);
   74:       return;
   75:     }
   76:     pPtr = *(void **)sqlite3_value_blob(argv[1]);
   77:     pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
   78:     if( pOld==pPtr ){
   79:       sqlite3_result_error(context, "out of memory", -1);
   80:       return;
   81:     }
   82:   }else{
   83:     pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
   84:     if( !pPtr ){
   85:       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
   86:       sqlite3_result_error(context, zErr, -1);
   87:       sqlite3_free(zErr);
   88:       return;
   89:     }
   90:   }
   91: 
   92:   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
   93: }
   94: 
   /*
   ** Return 1 if byte c is treated as part of an unquoted identifier token
   ** by sqlite3Fts3NextToken(), or 0 otherwise.
   **
   ** The accepted bytes are: any byte with the high bit set (0x80..0xFF),
   ** the digits 0x30..0x39, the upper-case letters 0x41..0x5A, the
   ** lower-case letters 0x61..0x7A, 0x24 ('$') and 0x5F ('_').
   */
   int sqlite3Fts3IsIdChar(char c){
     const unsigned int b = (unsigned char)c;

     if( b>=0x80 ) return 1;                  /* high bit set */
     if( b>=0x30 && b<=0x39 ) return 1;       /* 0-9 */
     if( b>=0x41 && b<=0x5A ) return 1;       /* A-Z */
     if( b>=0x61 && b<=0x7A ) return 1;       /* a-z */
     return ( b==0x24 || b==0x5F );           /* '$' and '_' */
   }
  108: 
  /*
  ** Locate the next token in nul-terminated string zStr.
  **
  ** On success the return value points at the first byte of the token
  ** (inside zStr) and *pn is set to the token length in bytes. If the end
  ** of the string is reached before any token starts, 0 is returned and
  ** *pn is left untouched.
  **
  ** A token is one of:
  **   * a string opened by ', " or ` and running to the matching close
  **     quote (a doubled quote character does not terminate it) or to the
  **     end of the input; the quotes are part of the token,
  **   * a '[' up to and including the next ']' (or to the end of input),
  **   * a maximal run of bytes accepted by sqlite3Fts3IsIdChar().
  ** Any other byte is skipped.
  */
  const char *sqlite3Fts3NextToken(const char *zStr, int *pn){
    const char *zTok = zStr;      /* Candidate start of the token */
    const char *zPast = 0;        /* One past the token end, once found */

    while( zPast==0 ){
      const char ch = *zTok;

      if( ch=='\0' ){
        return 0;                 /* No more tokens here */
      }

      if( ch=='\'' || ch=='"' || ch=='`' ){
        /* Quoted token: stop just after the first quote character that is
        ** not immediately followed by a second one. */
        zPast = zTok + 1;
        while( *zPast ){
          if( *zPast==ch ){
            zPast++;
            if( *zPast!=ch ) break;
          }
          zPast++;
        }
      }else if( ch=='[' ){
        /* Bracketed token: include the closing ']' when there is one. */
        zPast = zTok + 1;
        while( *zPast && *zPast!=']' ){
          zPast++;
        }
        if( *zPast ){
          zPast++;
        }
      }else if( sqlite3Fts3IsIdChar(ch) ){
        /* Bare identifier: extend over every following identifier byte. */
        zPast = zTok + 1;
        while( sqlite3Fts3IsIdChar(*zPast) ){
          zPast++;
        }
      }else{
        zTok++;                   /* Separator byte: keep looking */
      }
    }

    *pn = (int)(zPast - zTok);
    return zTok;
  }
  145: 
  146: int sqlite3Fts3InitTokenizer(
  147:   Fts3Hash *pHash,                /* Tokenizer hash table */
  148:   const char *zArg,               /* Tokenizer name */
  149:   sqlite3_tokenizer **ppTok,      /* OUT: Tokenizer (if applicable) */
  150:   char **pzErr                    /* OUT: Set to malloced error message */
  151: ){
  152:   int rc;
  153:   char *z = (char *)zArg;
  154:   int n = 0;
  155:   char *zCopy;
  156:   char *zEnd;                     /* Pointer to nul-term of zCopy */
  157:   sqlite3_tokenizer_module *m;
  158: 
  159:   zCopy = sqlite3_mprintf("%s", zArg);
  160:   if( !zCopy ) return SQLITE_NOMEM;
  161:   zEnd = &zCopy[strlen(zCopy)];
  162: 
  163:   z = (char *)sqlite3Fts3NextToken(zCopy, &n);
  164:   z[n] = '\0';
  165:   sqlite3Fts3Dequote(z);
  166: 
  167:   m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash,z,(int)strlen(z)+1);
  168:   if( !m ){
  169:     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", z);
  170:     rc = SQLITE_ERROR;
  171:   }else{
  172:     char const **aArg = 0;
  173:     int iArg = 0;
  174:     z = &z[n+1];
  175:     while( z<zEnd && (NULL!=(z = (char *)sqlite3Fts3NextToken(z, &n))) ){
  176:       int nNew = sizeof(char *)*(iArg+1);
  177:       char const **aNew = (const char **)sqlite3_realloc((void *)aArg, nNew);
  178:       if( !aNew ){
  179:         sqlite3_free(zCopy);
  180:         sqlite3_free((void *)aArg);
  181:         return SQLITE_NOMEM;
  182:       }
  183:       aArg = aNew;
  184:       aArg[iArg++] = z;
  185:       z[n] = '\0';
  186:       sqlite3Fts3Dequote(z);
  187:       z = &z[n+1];
  188:     }
  189:     rc = m->xCreate(iArg, aArg, ppTok);
  190:     assert( rc!=SQLITE_OK || *ppTok );
  191:     if( rc!=SQLITE_OK ){
  192:       *pzErr = sqlite3_mprintf("unknown tokenizer");
  193:     }else{
  194:       (*ppTok)->pModule = m; 
  195:     }
  196:     sqlite3_free((void *)aArg);
  197:   }
  198: 
  199:   sqlite3_free(zCopy);
  200:   return rc;
  201: }
  202: 
  203: 
  204: #ifdef SQLITE_TEST
  205: 
  206: #include <tcl.h>
  207: #include <string.h>
  208: 
  209: /*
  210: ** Implementation of a special SQL scalar function for testing tokenizers 
  211: ** designed to be used in concert with the Tcl testing framework. This
  212: ** function must be called with two arguments:
  213: **
  214: **   SELECT <function-name>(<key-name>, <input-string>);
  215: **   SELECT <function-name>(<key-name>, <pointer>);
  216: **
  217: ** where <function-name> is the name passed as the second argument
  218: ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
  219: ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
  220: **
  221: ** The return value is a string that may be interpreted as a Tcl
  222: ** list. For each token in the <input-string>, three elements are
  223: ** added to the returned list. The first is the token position, the 
  224: ** second is the token text (folded, stemmed, etc.) and the third is the
  225: ** substring of <input-string> associated with the token. For example, 
  226: ** using the built-in "simple" tokenizer:
  227: **
  228: **   SELECT fts_tokenizer_test('simple', 'I don't see how');
  229: **
  230: ** will return the string:
  231: **
  232: **   "{0 i I 1 dont don't 2 see see 3 how how}"
  233: **   
  234: */
  235: static void testFunc(
  236:   sqlite3_context *context,
  237:   int argc,
  238:   sqlite3_value **argv
  239: ){
  240:   Fts3Hash *pHash;
  241:   sqlite3_tokenizer_module *p;
  242:   sqlite3_tokenizer *pTokenizer = 0;
  243:   sqlite3_tokenizer_cursor *pCsr = 0;
  244: 
  245:   const char *zErr = 0;
  246: 
  247:   const char *zName;
  248:   int nName;
  249:   const char *zInput;
  250:   int nInput;
  251: 
  252:   const char *zArg = 0;
  253: 
  254:   const char *zToken;
  255:   int nToken;
  256:   int iStart;
  257:   int iEnd;
  258:   int iPos;
  259: 
  260:   Tcl_Obj *pRet;
  261: 
  262:   assert( argc==2 || argc==3 );
  263: 
  264:   nName = sqlite3_value_bytes(argv[0]);
  265:   zName = (const char *)sqlite3_value_text(argv[0]);
  266:   nInput = sqlite3_value_bytes(argv[argc-1]);
  267:   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
  268: 
  269:   if( argc==3 ){
  270:     zArg = (const char *)sqlite3_value_text(argv[1]);
  271:   }
  272: 
  273:   pHash = (Fts3Hash *)sqlite3_user_data(context);
  274:   p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
  275: 
  276:   if( !p ){
  277:     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
  278:     sqlite3_result_error(context, zErr, -1);
  279:     sqlite3_free(zErr);
  280:     return;
  281:   }
  282: 
  283:   pRet = Tcl_NewObj();
  284:   Tcl_IncrRefCount(pRet);
  285: 
  286:   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
  287:     zErr = "error in xCreate()";
  288:     goto finish;
  289:   }
  290:   pTokenizer->pModule = p;
  291:   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
  292:     zErr = "error in xOpen()";
  293:     goto finish;
  294:   }
  295:   pCsr->pTokenizer = pTokenizer;
  296: 
  297:   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
  298:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
  299:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  300:     zToken = &zInput[iStart];
  301:     nToken = iEnd-iStart;
  302:     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  303:   }
  304: 
  305:   if( SQLITE_OK!=p->xClose(pCsr) ){
  306:     zErr = "error in xClose()";
  307:     goto finish;
  308:   }
  309:   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
  310:     zErr = "error in xDestroy()";
  311:     goto finish;
  312:   }
  313: 
  314: finish:
  315:   if( zErr ){
  316:     sqlite3_result_error(context, zErr, -1);
  317:   }else{
  318:     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  319:   }
  320:   Tcl_DecrRefCount(pRet);
  321: }
  322: 
  323: static
  324: int registerTokenizer(
  325:   sqlite3 *db, 
  326:   char *zName, 
  327:   const sqlite3_tokenizer_module *p
  328: ){
  329:   int rc;
  330:   sqlite3_stmt *pStmt;
  331:   const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
  332: 
  333:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  334:   if( rc!=SQLITE_OK ){
  335:     return rc;
  336:   }
  337: 
  338:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  339:   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  340:   sqlite3_step(pStmt);
  341: 
  342:   return sqlite3_finalize(pStmt);
  343: }
  344: 
  345: static
  346: int queryTokenizer(
  347:   sqlite3 *db, 
  348:   char *zName,  
  349:   const sqlite3_tokenizer_module **pp
  350: ){
  351:   int rc;
  352:   sqlite3_stmt *pStmt;
  353:   const char zSql[] = "SELECT fts3_tokenizer(?)";
  354: 
  355:   *pp = 0;
  356:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  357:   if( rc!=SQLITE_OK ){
  358:     return rc;
  359:   }
  360: 
  361:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  362:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
  363:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
  364:       memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
  365:     }
  366:   }
  367: 
  368:   return sqlite3_finalize(pStmt);
  369: }
  370: 
  371: void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
  372: 
  373: /*
  374: ** Implementation of the scalar function fts3_tokenizer_internal_test().
  375: ** This function is used for testing only, it is not included in the
  376: ** build unless SQLITE_TEST is defined.
  377: **
  378: ** The purpose of this is to test that the fts3_tokenizer() function
  379: ** can be used as designed by the C-code in the queryTokenizer and
  380: ** registerTokenizer() functions above. These two functions are repeated
  381: ** in the README.tokenizer file as an example, so it is important to
  382: ** test them.
  383: **
  384: ** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
  385: ** function with no arguments. An assert() will fail if a problem is
  386: ** detected. i.e.:
  387: **
  388: **     SELECT fts3_tokenizer_internal_test();
  389: **
  390: */
  391: static void intTestFunc(
  392:   sqlite3_context *context,
  393:   int argc,
  394:   sqlite3_value **argv
  395: ){
  396:   int rc;
  397:   const sqlite3_tokenizer_module *p1;
  398:   const sqlite3_tokenizer_module *p2;
  399:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
  400: 
  401:   UNUSED_PARAMETER(argc);
  402:   UNUSED_PARAMETER(argv);
  403: 
  404:   /* Test the query function */
  405:   sqlite3Fts3SimpleTokenizerModule(&p1);
  406:   rc = queryTokenizer(db, "simple", &p2);
  407:   assert( rc==SQLITE_OK );
  408:   assert( p1==p2 );
  409:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  410:   assert( rc==SQLITE_ERROR );
  411:   assert( p2==0 );
  412:   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
  413: 
  414:   /* Test the storage function */
  415:   rc = registerTokenizer(db, "nosuchtokenizer", p1);
  416:   assert( rc==SQLITE_OK );
  417:   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
  418:   assert( rc==SQLITE_OK );
  419:   assert( p2==p1 );
  420: 
  421:   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
  422: }
  423: 
  424: #endif
  425: 
  426: /*
  427: ** Set up SQL objects in database db used to access the contents of
  428: ** the hash table pointed to by argument pHash. The hash table must
  429: ** been initialised to use string keys, and to take a private copy 
  430: ** of the key when a value is inserted. i.e. by a call similar to:
  431: **
  432: **    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
  433: **
  434: ** This function adds a scalar function (see header comment above
  435: ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
  436: ** defined at compilation time, a temporary virtual table (see header 
  437: ** comment above struct HashTableVtab) to the database schema. Both 
  438: ** provide read/write access to the contents of *pHash.
  439: **
  440: ** The third argument to this function, zName, is used as the name
  441: ** of both the scalar and, if created, the virtual table.
  442: */
  443: int sqlite3Fts3InitHashTable(
  444:   sqlite3 *db, 
  445:   Fts3Hash *pHash, 
  446:   const char *zName
  447: ){
  448:   int rc = SQLITE_OK;
  449:   void *p = (void *)pHash;
  450:   const int any = SQLITE_ANY;
  451: 
  452: #ifdef SQLITE_TEST
  453:   char *zTest = 0;
  454:   char *zTest2 = 0;
  455:   void *pdb = (void *)db;
  456:   zTest = sqlite3_mprintf("%s_test", zName);
  457:   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
  458:   if( !zTest || !zTest2 ){
  459:     rc = SQLITE_NOMEM;
  460:   }
  461: #endif
  462: 
  463:   if( SQLITE_OK==rc ){
  464:     rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0);
  465:   }
  466:   if( SQLITE_OK==rc ){
  467:     rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0);
  468:   }
  469: #ifdef SQLITE_TEST
  470:   if( SQLITE_OK==rc ){
  471:     rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0);
  472:   }
  473:   if( SQLITE_OK==rc ){
  474:     rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0);
  475:   }
  476:   if( SQLITE_OK==rc ){
  477:     rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0);
  478:   }
  479: #endif
  480: 
  481: #ifdef SQLITE_TEST
  482:   sqlite3_free(zTest);
  483:   sqlite3_free(zTest2);
  484: #endif
  485: 
  486:   return rc;
  487: }
  488: 
  489: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>