Annotation of embedaddon/sqlite3/ext/fts3/fts3_tokenizer1.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2006 Oct 10
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: ******************************************************************************
! 12: **
! 13: ** Implementation of the "simple" full-text-search tokenizer.
! 14: */
! 15:
! 16: /*
! 17: ** The code in this file is only compiled if:
! 18: **
! 19: ** * The FTS3 module is being built as an extension
! 20: ** (in which case SQLITE_CORE is not defined), or
! 21: **
! 22: ** * The FTS3 module is being built into the core of
! 23: ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
! 24: */
! 25: #include "fts3Int.h"
! 26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
! 27:
! 28: #include <assert.h>
! 29: #include <stdlib.h>
! 30: #include <stdio.h>
! 31: #include <string.h>
! 32:
! 33: #include "fts3_tokenizer.h"
! 34:
! 35: typedef struct simple_tokenizer {
! 36: sqlite3_tokenizer base;
! 37: char delim[128]; /* flag ASCII delimiters */
! 38: } simple_tokenizer;
! 39:
! 40: typedef struct simple_tokenizer_cursor {
! 41: sqlite3_tokenizer_cursor base;
! 42: const char *pInput; /* input we are tokenizing */
! 43: int nBytes; /* size of the input */
! 44: int iOffset; /* current position in pInput */
! 45: int iToken; /* index of next token to be returned */
! 46: char *pToken; /* storage for current token */
! 47: int nTokenAllocated; /* space allocated to zToken buffer */
! 48: } simple_tokenizer_cursor;
! 49:
! 50:
! 51: static int simpleDelim(simple_tokenizer *t, unsigned char c){
! 52: return c<0x80 && t->delim[c];
! 53: }
/*
** Return 1 if x is one of the characters '0'-'9', 'A'-'Z' or 'a'-'z',
** and 0 for every other value.  The result does not depend on the
** current locale.
*/
static int fts3_isalnum(int x){
  if( x>='0' && x<='9' ) return 1;
  if( x>='A' && x<='Z' ) return 1;
  return x>='a' && x<='z';
}
! 57:
! 58: /*
! 59: ** Create a new tokenizer instance.
! 60: */
! 61: static int simpleCreate(
! 62: int argc, const char * const *argv,
! 63: sqlite3_tokenizer **ppTokenizer
! 64: ){
! 65: simple_tokenizer *t;
! 66:
! 67: t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
! 68: if( t==NULL ) return SQLITE_NOMEM;
! 69: memset(t, 0, sizeof(*t));
! 70:
! 71: /* TODO(shess) Delimiters need to remain the same from run to run,
! 72: ** else we need to reindex. One solution would be a meta-table to
! 73: ** track such information in the database, then we'd only want this
! 74: ** information on the initial create.
! 75: */
! 76: if( argc>1 ){
! 77: int i, n = (int)strlen(argv[1]);
! 78: for(i=0; i<n; i++){
! 79: unsigned char ch = argv[1][i];
! 80: /* We explicitly don't support UTF-8 delimiters for now. */
! 81: if( ch>=0x80 ){
! 82: sqlite3_free(t);
! 83: return SQLITE_ERROR;
! 84: }
! 85: t->delim[ch] = 1;
! 86: }
! 87: } else {
! 88: /* Mark non-alphanumeric ASCII characters as delimiters */
! 89: int i;
! 90: for(i=1; i<0x80; i++){
! 91: t->delim[i] = !fts3_isalnum(i) ? -1 : 0;
! 92: }
! 93: }
! 94:
! 95: *ppTokenizer = &t->base;
! 96: return SQLITE_OK;
! 97: }
! 98:
! 99: /*
! 100: ** Destroy a tokenizer
! 101: */
! 102: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
! 103: sqlite3_free(pTokenizer);
! 104: return SQLITE_OK;
! 105: }
! 106:
! 107: /*
! 108: ** Prepare to begin tokenizing a particular string. The input
! 109: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
! 110: ** used to incrementally tokenize this string is returned in
! 111: ** *ppCursor.
! 112: */
! 113: static int simpleOpen(
! 114: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 115: const char *pInput, int nBytes, /* String to be tokenized */
! 116: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 117: ){
! 118: simple_tokenizer_cursor *c;
! 119:
! 120: UNUSED_PARAMETER(pTokenizer);
! 121:
! 122: c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
! 123: if( c==NULL ) return SQLITE_NOMEM;
! 124:
! 125: c->pInput = pInput;
! 126: if( pInput==0 ){
! 127: c->nBytes = 0;
! 128: }else if( nBytes<0 ){
! 129: c->nBytes = (int)strlen(pInput);
! 130: }else{
! 131: c->nBytes = nBytes;
! 132: }
! 133: c->iOffset = 0; /* start tokenizing at the beginning */
! 134: c->iToken = 0;
! 135: c->pToken = NULL; /* no space allocated, yet. */
! 136: c->nTokenAllocated = 0;
! 137:
! 138: *ppCursor = &c->base;
! 139: return SQLITE_OK;
! 140: }
! 141:
! 142: /*
! 143: ** Close a tokenization cursor previously opened by a call to
! 144: ** simpleOpen() above.
! 145: */
! 146: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
! 147: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 148: sqlite3_free(c->pToken);
! 149: sqlite3_free(c);
! 150: return SQLITE_OK;
! 151: }
! 152:
! 153: /*
! 154: ** Extract the next token from a tokenization cursor. The cursor must
! 155: ** have been opened by a prior call to simpleOpen().
! 156: */
! 157: static int simpleNext(
! 158: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
! 159: const char **ppToken, /* OUT: *ppToken is the token text */
! 160: int *pnBytes, /* OUT: Number of bytes in token */
! 161: int *piStartOffset, /* OUT: Starting offset of token */
! 162: int *piEndOffset, /* OUT: Ending offset of token */
! 163: int *piPosition /* OUT: Position integer of token */
! 164: ){
! 165: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 166: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
! 167: unsigned char *p = (unsigned char *)c->pInput;
! 168:
! 169: while( c->iOffset<c->nBytes ){
! 170: int iStartOffset;
! 171:
! 172: /* Scan past delimiter characters */
! 173: while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
! 174: c->iOffset++;
! 175: }
! 176:
! 177: /* Count non-delimiter characters. */
! 178: iStartOffset = c->iOffset;
! 179: while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
! 180: c->iOffset++;
! 181: }
! 182:
! 183: if( c->iOffset>iStartOffset ){
! 184: int i, n = c->iOffset-iStartOffset;
! 185: if( n>c->nTokenAllocated ){
! 186: char *pNew;
! 187: c->nTokenAllocated = n+20;
! 188: pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
! 189: if( !pNew ) return SQLITE_NOMEM;
! 190: c->pToken = pNew;
! 191: }
! 192: for(i=0; i<n; i++){
! 193: /* TODO(shess) This needs expansion to handle UTF-8
! 194: ** case-insensitivity.
! 195: */
! 196: unsigned char ch = p[iStartOffset+i];
! 197: c->pToken[i] = (char)((ch>='A' && ch<='Z') ? ch-'A'+'a' : ch);
! 198: }
! 199: *ppToken = c->pToken;
! 200: *pnBytes = n;
! 201: *piStartOffset = iStartOffset;
! 202: *piEndOffset = c->iOffset;
! 203: *piPosition = c->iToken++;
! 204:
! 205: return SQLITE_OK;
! 206: }
! 207: }
! 208: return SQLITE_DONE;
! 209: }
! 210:
! 211: /*
! 212: ** The set of routines that implement the simple tokenizer
! 213: */
! 214: static const sqlite3_tokenizer_module simpleTokenizerModule = {
! 215: 0,
! 216: simpleCreate,
! 217: simpleDestroy,
! 218: simpleOpen,
! 219: simpleClose,
! 220: simpleNext,
! 221: };
! 222:
! 223: /*
! 224: ** Allocate a new simple tokenizer. Return a pointer to the new
! 225: ** tokenizer in *ppModule
! 226: */
! 227: void sqlite3Fts3SimpleTokenizerModule(
! 228: sqlite3_tokenizer_module const**ppModule
! 229: ){
! 230: *ppModule = &simpleTokenizerModule;
! 231: }
! 232:
! 233: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>