embedaddon/sqlite3/ext/fts1/fts1_tokenizer1.c - annotate

Return to fts1_tokenizer1.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1
Annotation of embedaddon/sqlite3/ext/fts1/fts1_tokenizer1.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** The author disclaims copyright to this source code.
                      3: **
                      4: *************************************************************************
                      5: ** Implementation of the "simple" full-text-search tokenizer.
                      6: */
                      7: 
                      8: /*
                      9: ** The code in this file is only compiled if:
                     10: **
                     11: **     * The FTS1 module is being built as an extension
                     12: **       (in which case SQLITE_CORE is not defined), or
                     13: **
                     14: **     * The FTS1 module is being built into the core of
                     15: **       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
                     16: */
                     17: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
                     18: 
                     19: 
                     20: #include <assert.h>
                     21: #include <stdlib.h>
                     22: #include <stdio.h>
                     23: #include <string.h>
                     24: #include <ctype.h>
                     25: 
                     26: #include "fts1_tokenizer.h"
                     27: 
                     /*
                     ** A "simple" tokenizer instance: the generic tokenizer base object
                     ** followed by a lookup table of delimiter flags for the 128 ASCII bytes.
                     ** simpleCreate() hands out &base and simpleDestroy() frees that same
                     ** pointer, so base has to stay the first member.
                     */
                     typedef struct simple_tokenizer {
                       sqlite3_tokenizer base;
                       char delim[128];             /* delim[b]!=0 => ASCII byte b is a delimiter */
                     } simple_tokenizer;
                     32: 
                     /*
                     ** State of one tokenization pass over a single input string.  Created
                     ** by simpleOpen(), advanced by simpleNext(), released by simpleClose().
                     */
                     typedef struct simple_tokenizer_cursor {
                       sqlite3_tokenizer_cursor base;
                       const char *pInput;          /* input we are tokenizing (not copied) */
                       int nBytes;                  /* size of the input, in bytes */
                       int iOffset;                 /* current byte position in pInput */
                       int iToken;                  /* index of next token to be returned */
                       char *pToken;                /* storage for current token (heap, may be NULL) */
                       int nTokenAllocated;         /* bytes allocated to the pToken buffer */
                     } simple_tokenizer_cursor;
                     42: 
                     43: 
                     /* Forward declaration of the method table that is defined later in
                     ** this file, after the functions it points to. */
                     static const sqlite3_tokenizer_module simpleTokenizerModule;
                     46: 
                     47: static int isDelim(simple_tokenizer *t, unsigned char c){
                     48:   return c<0x80 && t->delim[c];
                     49: }
                     50: 
                     51: /*
                     52: ** Create a new tokenizer instance.
                     53: */
                     54: static int simpleCreate(
                     55:   int argc, const char * const *argv,
                     56:   sqlite3_tokenizer **ppTokenizer
                     57: ){
                     58:   simple_tokenizer *t;
                     59: 
                     60:   t = (simple_tokenizer *) calloc(sizeof(*t), 1);
                     61:   if( t==NULL ) return SQLITE_NOMEM;
                     62: 
                     63:   /* TODO(shess) Delimiters need to remain the same from run to run,
                     64:   ** else we need to reindex.  One solution would be a meta-table to
                     65:   ** track such information in the database, then we'd only want this
                     66:   ** information on the initial create.
                     67:   */
                     68:   if( argc>1 ){
                     69:     int i, n = strlen(argv[1]);
                     70:     for(i=0; i<n; i++){
                     71:       unsigned char ch = argv[1][i];
                     72:       /* We explicitly don't support UTF-8 delimiters for now. */
                     73:       if( ch>=0x80 ){
                     74:         free(t);
                     75:         return SQLITE_ERROR;
                     76:       }
                     77:       t->delim[ch] = 1;
                     78:     }
                     79:   } else {
                     80:     /* Mark non-alphanumeric ASCII characters as delimiters */
                     81:     int i;
                     82:     for(i=1; i<0x80; i++){
                     83:       t->delim[i] = !isalnum(i);
                     84:     }
                     85:   }
                     86: 
                     87:   *ppTokenizer = &t->base;
                     88:   return SQLITE_OK;
                     89: }
                     90: 
                     91: /*
                     92: ** Destroy a tokenizer
                     93: */
                     94: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
                     95:   free(pTokenizer);
                     96:   return SQLITE_OK;
                     97: }
                     98: 
                     99: /*
                    100: ** Prepare to begin tokenizing a particular string.  The input
                    101: ** string to be tokenized is pInput[0..nBytes-1].  A cursor
                    102: ** used to incrementally tokenize this string is returned in 
                    103: ** *ppCursor.
                    104: */
                    105: static int simpleOpen(
                    106:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
                    107:   const char *pInput, int nBytes,        /* String to be tokenized */
                    108:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
                    109: ){
                    110:   simple_tokenizer_cursor *c;
                    111: 
                    112:   c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
                    113:   if( c==NULL ) return SQLITE_NOMEM;
                    114: 
                    115:   c->pInput = pInput;
                    116:   if( pInput==0 ){
                    117:     c->nBytes = 0;
                    118:   }else if( nBytes<0 ){
                    119:     c->nBytes = (int)strlen(pInput);
                    120:   }else{
                    121:     c->nBytes = nBytes;
                    122:   }
                    123:   c->iOffset = 0;                 /* start tokenizing at the beginning */
                    124:   c->iToken = 0;
                    125:   c->pToken = NULL;               /* no space allocated, yet. */
                    126:   c->nTokenAllocated = 0;
                    127: 
                    128:   *ppCursor = &c->base;
                    129:   return SQLITE_OK;
                    130: }
                    131: 
                    132: /*
                    133: ** Close a tokenization cursor previously opened by a call to
                    134: ** simpleOpen() above.
                    135: */
                    136: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
                    137:   simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
                    138:   free(c->pToken);
                    139:   free(c);
                    140:   return SQLITE_OK;
                    141: }
                    142: 
                    143: /*
                    144: ** Extract the next token from a tokenization cursor.  The cursor must
                    145: ** have been opened by a prior call to simpleOpen().
                    146: */
                    147: static int simpleNext(
                    148:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
                    149:   const char **ppToken,               /* OUT: *ppToken is the token text */
                    150:   int *pnBytes,                       /* OUT: Number of bytes in token */
                    151:   int *piStartOffset,                 /* OUT: Starting offset of token */
                    152:   int *piEndOffset,                   /* OUT: Ending offset of token */
                    153:   int *piPosition                     /* OUT: Position integer of token */
                    154: ){
                    155:   simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
                    156:   simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
                    157:   unsigned char *p = (unsigned char *)c->pInput;
                    158: 
                    159:   while( c->iOffset<c->nBytes ){
                    160:     int iStartOffset;
                    161: 
                    162:     /* Scan past delimiter characters */
                    163:     while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
                    164:       c->iOffset++;
                    165:     }
                    166: 
                    167:     /* Count non-delimiter characters. */
                    168:     iStartOffset = c->iOffset;
                    169:     while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
                    170:       c->iOffset++;
                    171:     }
                    172: 
                    173:     if( c->iOffset>iStartOffset ){
                    174:       int i, n = c->iOffset-iStartOffset;
                    175:       if( n>c->nTokenAllocated ){
                    176:         c->nTokenAllocated = n+20;
                    177:         c->pToken = realloc(c->pToken, c->nTokenAllocated);
                    178:         if( c->pToken==NULL ) return SQLITE_NOMEM;
                    179:       }
                    180:       for(i=0; i<n; i++){
                    181:         /* TODO(shess) This needs expansion to handle UTF-8
                    182:         ** case-insensitivity.
                    183:         */
                    184:         unsigned char ch = p[iStartOffset+i];
                    185:         c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
                    186:       }
                    187:       *ppToken = c->pToken;
                    188:       *pnBytes = n;
                    189:       *piStartOffset = iStartOffset;
                    190:       *piEndOffset = c->iOffset;
                    191:       *piPosition = c->iToken++;
                    192: 
                    193:       return SQLITE_OK;
                    194:     }
                    195:   }
                    196:   return SQLITE_DONE;
                    197: }
                    198: 
                    /*
                    ** The set of routines that implement the simple tokenizer.  The
                    ** initializer is positional, so the order of the entries must match
                    ** the member order of sqlite3_tokenizer_module in fts1_tokenizer.h.
                    */
                    static const sqlite3_tokenizer_module simpleTokenizerModule = {
                      0,                 /* presumably the interface version -- confirm in header */
                      simpleCreate,      /* create a tokenizer instance */
                      simpleDestroy,     /* destroy a tokenizer instance */
                      simpleOpen,        /* open a cursor on an input string */
                      simpleClose,       /* close a cursor */
                      simpleNext,        /* fetch the next token from a cursor */
                    };
                    210: 
                    211: /*
                    212: ** Allocate a new simple tokenizer.  Return a pointer to the new
                    213: ** tokenizer in *ppModule
                    214: */
                    215: void sqlite3Fts1SimpleTokenizerModule(
                    216:   sqlite3_tokenizer_module const**ppModule
                    217: ){
                    218:   *ppModule = &simpleTokenizerModule;
                    219: }
                    220: 
                    221: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>