Annotation of embedaddon/sqlite3/ext/fts2/fts2_tokenizer1.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2006 Oct 10
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: ******************************************************************************
! 12: **
! 13: ** Implementation of the "simple" full-text-search tokenizer.
! 14: */
! 15:
! 16: /*
! 17: ** The code in this file is only compiled if:
! 18: **
! 19: ** * The FTS2 module is being built as an extension
! 20: ** (in which case SQLITE_CORE is not defined), or
! 21: **
! 22: ** * The FTS2 module is being built into the core of
! 23: ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
! 24: */
! 25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
! 26:
! 27:
! 28: #include <assert.h>
! 29: #include <stdlib.h>
! 30: #include <stdio.h>
! 31: #include <string.h>
! 32:
! 33: #include "fts2_tokenizer.h"
! 34:
! 35: typedef struct simple_tokenizer {
! 36: sqlite3_tokenizer base;
! 37: char delim[128]; /* flag ASCII delimiters */
! 38: } simple_tokenizer;
! 39:
! 40: typedef struct simple_tokenizer_cursor {
! 41: sqlite3_tokenizer_cursor base;
! 42: const char *pInput; /* input we are tokenizing */
! 43: int nBytes; /* size of the input */
! 44: int iOffset; /* current position in pInput */
! 45: int iToken; /* index of next token to be returned */
! 46: char *pToken; /* storage for current token */
! 47: int nTokenAllocated; /* space allocated to zToken buffer */
! 48: } simple_tokenizer_cursor;
! 49:
! 50:
! 51: /* Forward declaration */
! 52: static const sqlite3_tokenizer_module simpleTokenizerModule;
! 53:
! 54: static int simpleDelim(simple_tokenizer *t, unsigned char c){
! 55: return c<0x80 && t->delim[c];
! 56: }
! 57:
! 58: /*
! 59: ** Create a new tokenizer instance.
! 60: */
! 61: static int simpleCreate(
! 62: int argc, const char * const *argv,
! 63: sqlite3_tokenizer **ppTokenizer
! 64: ){
! 65: simple_tokenizer *t;
! 66:
! 67: t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
! 68: if( t==NULL ) return SQLITE_NOMEM;
! 69: memset(t, 0, sizeof(*t));
! 70:
! 71: /* TODO(shess) Delimiters need to remain the same from run to run,
! 72: ** else we need to reindex. One solution would be a meta-table to
! 73: ** track such information in the database, then we'd only want this
! 74: ** information on the initial create.
! 75: */
! 76: if( argc>1 ){
! 77: int i, n = strlen(argv[1]);
! 78: for(i=0; i<n; i++){
! 79: unsigned char ch = argv[1][i];
! 80: /* We explicitly don't support UTF-8 delimiters for now. */
! 81: if( ch>=0x80 ){
! 82: sqlite3_free(t);
! 83: return SQLITE_ERROR;
! 84: }
! 85: t->delim[ch] = 1;
! 86: }
! 87: } else {
! 88: /* Mark non-alphanumeric ASCII characters as delimiters */
! 89: int i;
! 90: for(i=1; i<0x80; i++){
! 91: t->delim[i] = !((i>='0' && i<='9') || (i>='A' && i<='Z') ||
! 92: (i>='a' && i<='z'));
! 93: }
! 94: }
! 95:
! 96: *ppTokenizer = &t->base;
! 97: return SQLITE_OK;
! 98: }
! 99:
! 100: /*
! 101: ** Destroy a tokenizer
! 102: */
! 103: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
! 104: sqlite3_free(pTokenizer);
! 105: return SQLITE_OK;
! 106: }
! 107:
! 108: /*
! 109: ** Prepare to begin tokenizing a particular string. The input
! 110: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
! 111: ** used to incrementally tokenize this string is returned in
! 112: ** *ppCursor.
! 113: */
! 114: static int simpleOpen(
! 115: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 116: const char *pInput, int nBytes, /* String to be tokenized */
! 117: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 118: ){
! 119: simple_tokenizer_cursor *c;
! 120:
! 121: c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
! 122: if( c==NULL ) return SQLITE_NOMEM;
! 123:
! 124: c->pInput = pInput;
! 125: if( pInput==0 ){
! 126: c->nBytes = 0;
! 127: }else if( nBytes<0 ){
! 128: c->nBytes = (int)strlen(pInput);
! 129: }else{
! 130: c->nBytes = nBytes;
! 131: }
! 132: c->iOffset = 0; /* start tokenizing at the beginning */
! 133: c->iToken = 0;
! 134: c->pToken = NULL; /* no space allocated, yet. */
! 135: c->nTokenAllocated = 0;
! 136:
! 137: *ppCursor = &c->base;
! 138: return SQLITE_OK;
! 139: }
! 140:
! 141: /*
! 142: ** Close a tokenization cursor previously opened by a call to
! 143: ** simpleOpen() above.
! 144: */
! 145: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
! 146: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 147: sqlite3_free(c->pToken);
! 148: sqlite3_free(c);
! 149: return SQLITE_OK;
! 150: }
! 151:
! 152: /*
! 153: ** Extract the next token from a tokenization cursor. The cursor must
! 154: ** have been opened by a prior call to simpleOpen().
! 155: */
! 156: static int simpleNext(
! 157: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
! 158: const char **ppToken, /* OUT: *ppToken is the token text */
! 159: int *pnBytes, /* OUT: Number of bytes in token */
! 160: int *piStartOffset, /* OUT: Starting offset of token */
! 161: int *piEndOffset, /* OUT: Ending offset of token */
! 162: int *piPosition /* OUT: Position integer of token */
! 163: ){
! 164: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 165: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
! 166: unsigned char *p = (unsigned char *)c->pInput;
! 167:
! 168: while( c->iOffset<c->nBytes ){
! 169: int iStartOffset;
! 170:
! 171: /* Scan past delimiter characters */
! 172: while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
! 173: c->iOffset++;
! 174: }
! 175:
! 176: /* Count non-delimiter characters. */
! 177: iStartOffset = c->iOffset;
! 178: while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
! 179: c->iOffset++;
! 180: }
! 181:
! 182: if( c->iOffset>iStartOffset ){
! 183: int i, n = c->iOffset-iStartOffset;
! 184: if( n>c->nTokenAllocated ){
! 185: c->nTokenAllocated = n+20;
! 186: c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
! 187: if( c->pToken==NULL ) return SQLITE_NOMEM;
! 188: }
! 189: for(i=0; i<n; i++){
! 190: /* TODO(shess) This needs expansion to handle UTF-8
! 191: ** case-insensitivity.
! 192: */
! 193: unsigned char ch = p[iStartOffset+i];
! 194: c->pToken[i] = (ch>='A' && ch<='Z') ? (ch - 'A' + 'a') : ch;
! 195: }
! 196: *ppToken = c->pToken;
! 197: *pnBytes = n;
! 198: *piStartOffset = iStartOffset;
! 199: *piEndOffset = c->iOffset;
! 200: *piPosition = c->iToken++;
! 201:
! 202: return SQLITE_OK;
! 203: }
! 204: }
! 205: return SQLITE_DONE;
! 206: }
! 207:
! 208: /*
! 209: ** The set of routines that implement the simple tokenizer
! 210: */
! 211: static const sqlite3_tokenizer_module simpleTokenizerModule = {
! 212: 0,
! 213: simpleCreate,
! 214: simpleDestroy,
! 215: simpleOpen,
! 216: simpleClose,
! 217: simpleNext,
! 218: };
! 219:
! 220: /*
! 221: ** Allocate a new simple tokenizer. Return a pointer to the new
! 222: ** tokenizer in *ppModule
! 223: */
! 224: void sqlite3Fts2SimpleTokenizerModule(
! 225: sqlite3_tokenizer_module const**ppModule
! 226: ){
! 227: *ppModule = &simpleTokenizerModule;
! 228: }
! 229:
! 230: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>