Annotation of embedaddon/sqlite3/ext/fts1/fts1_tokenizer1.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** The author disclaims copyright to this source code.
! 3: **
! 4: *************************************************************************
! 5: ** Implementation of the "simple" full-text-search tokenizer.
! 6: */
! 7:
! 8: /*
! 9: ** The code in this file is only compiled if:
! 10: **
! 11: ** * The FTS1 module is being built as an extension
! 12: ** (in which case SQLITE_CORE is not defined), or
! 13: **
! 14: ** * The FTS1 module is being built into the core of
! 15: ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
! 16: */
! 17: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
! 18:
! 19:
! 20: #include <assert.h>
! 21: #include <stdlib.h>
! 22: #include <stdio.h>
! 23: #include <string.h>
! 24: #include <ctype.h>
! 25:
! 26: #include "fts1_tokenizer.h"
! 27:
! 28: typedef struct simple_tokenizer {
! 29: sqlite3_tokenizer base;
! 30: char delim[128]; /* flag ASCII delimiters */
! 31: } simple_tokenizer;
! 32:
! 33: typedef struct simple_tokenizer_cursor {
! 34: sqlite3_tokenizer_cursor base;
! 35: const char *pInput; /* input we are tokenizing */
! 36: int nBytes; /* size of the input */
! 37: int iOffset; /* current position in pInput */
! 38: int iToken; /* index of next token to be returned */
! 39: char *pToken; /* storage for current token */
! 40: int nTokenAllocated; /* space allocated to zToken buffer */
! 41: } simple_tokenizer_cursor;
! 42:
! 43:
! 44: /* Forward declaration */
! 45: static const sqlite3_tokenizer_module simpleTokenizerModule;
! 46:
! 47: static int isDelim(simple_tokenizer *t, unsigned char c){
! 48: return c<0x80 && t->delim[c];
! 49: }
! 50:
! 51: /*
! 52: ** Create a new tokenizer instance.
! 53: */
! 54: static int simpleCreate(
! 55: int argc, const char * const *argv,
! 56: sqlite3_tokenizer **ppTokenizer
! 57: ){
! 58: simple_tokenizer *t;
! 59:
! 60: t = (simple_tokenizer *) calloc(sizeof(*t), 1);
! 61: if( t==NULL ) return SQLITE_NOMEM;
! 62:
! 63: /* TODO(shess) Delimiters need to remain the same from run to run,
! 64: ** else we need to reindex. One solution would be a meta-table to
! 65: ** track such information in the database, then we'd only want this
! 66: ** information on the initial create.
! 67: */
! 68: if( argc>1 ){
! 69: int i, n = strlen(argv[1]);
! 70: for(i=0; i<n; i++){
! 71: unsigned char ch = argv[1][i];
! 72: /* We explicitly don't support UTF-8 delimiters for now. */
! 73: if( ch>=0x80 ){
! 74: free(t);
! 75: return SQLITE_ERROR;
! 76: }
! 77: t->delim[ch] = 1;
! 78: }
! 79: } else {
! 80: /* Mark non-alphanumeric ASCII characters as delimiters */
! 81: int i;
! 82: for(i=1; i<0x80; i++){
! 83: t->delim[i] = !isalnum(i);
! 84: }
! 85: }
! 86:
! 87: *ppTokenizer = &t->base;
! 88: return SQLITE_OK;
! 89: }
! 90:
! 91: /*
! 92: ** Destroy a tokenizer
! 93: */
! 94: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
! 95: free(pTokenizer);
! 96: return SQLITE_OK;
! 97: }
! 98:
! 99: /*
! 100: ** Prepare to begin tokenizing a particular string. The input
! 101: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
! 102: ** used to incrementally tokenize this string is returned in
! 103: ** *ppCursor.
! 104: */
! 105: static int simpleOpen(
! 106: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 107: const char *pInput, int nBytes, /* String to be tokenized */
! 108: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 109: ){
! 110: simple_tokenizer_cursor *c;
! 111:
! 112: c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
! 113: if( c==NULL ) return SQLITE_NOMEM;
! 114:
! 115: c->pInput = pInput;
! 116: if( pInput==0 ){
! 117: c->nBytes = 0;
! 118: }else if( nBytes<0 ){
! 119: c->nBytes = (int)strlen(pInput);
! 120: }else{
! 121: c->nBytes = nBytes;
! 122: }
! 123: c->iOffset = 0; /* start tokenizing at the beginning */
! 124: c->iToken = 0;
! 125: c->pToken = NULL; /* no space allocated, yet. */
! 126: c->nTokenAllocated = 0;
! 127:
! 128: *ppCursor = &c->base;
! 129: return SQLITE_OK;
! 130: }
! 131:
! 132: /*
! 133: ** Close a tokenization cursor previously opened by a call to
! 134: ** simpleOpen() above.
! 135: */
! 136: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
! 137: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 138: free(c->pToken);
! 139: free(c);
! 140: return SQLITE_OK;
! 141: }
! 142:
! 143: /*
! 144: ** Extract the next token from a tokenization cursor. The cursor must
! 145: ** have been opened by a prior call to simpleOpen().
! 146: */
! 147: static int simpleNext(
! 148: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
! 149: const char **ppToken, /* OUT: *ppToken is the token text */
! 150: int *pnBytes, /* OUT: Number of bytes in token */
! 151: int *piStartOffset, /* OUT: Starting offset of token */
! 152: int *piEndOffset, /* OUT: Ending offset of token */
! 153: int *piPosition /* OUT: Position integer of token */
! 154: ){
! 155: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 156: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
! 157: unsigned char *p = (unsigned char *)c->pInput;
! 158:
! 159: while( c->iOffset<c->nBytes ){
! 160: int iStartOffset;
! 161:
! 162: /* Scan past delimiter characters */
! 163: while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
! 164: c->iOffset++;
! 165: }
! 166:
! 167: /* Count non-delimiter characters. */
! 168: iStartOffset = c->iOffset;
! 169: while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
! 170: c->iOffset++;
! 171: }
! 172:
! 173: if( c->iOffset>iStartOffset ){
! 174: int i, n = c->iOffset-iStartOffset;
! 175: if( n>c->nTokenAllocated ){
! 176: c->nTokenAllocated = n+20;
! 177: c->pToken = realloc(c->pToken, c->nTokenAllocated);
! 178: if( c->pToken==NULL ) return SQLITE_NOMEM;
! 179: }
! 180: for(i=0; i<n; i++){
! 181: /* TODO(shess) This needs expansion to handle UTF-8
! 182: ** case-insensitivity.
! 183: */
! 184: unsigned char ch = p[iStartOffset+i];
! 185: c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
! 186: }
! 187: *ppToken = c->pToken;
! 188: *pnBytes = n;
! 189: *piStartOffset = iStartOffset;
! 190: *piEndOffset = c->iOffset;
! 191: *piPosition = c->iToken++;
! 192:
! 193: return SQLITE_OK;
! 194: }
! 195: }
! 196: return SQLITE_DONE;
! 197: }
! 198:
! 199: /*
! 200: ** The set of routines that implement the simple tokenizer
! 201: */
! 202: static const sqlite3_tokenizer_module simpleTokenizerModule = {
! 203: 0,
! 204: simpleCreate,
! 205: simpleDestroy,
! 206: simpleOpen,
! 207: simpleClose,
! 208: simpleNext,
! 209: };
! 210:
! 211: /*
! 212: ** Allocate a new simple tokenizer. Return a pointer to the new
! 213: ** tokenizer in *ppModule
! 214: */
! 215: void sqlite3Fts1SimpleTokenizerModule(
! 216: sqlite3_tokenizer_module const**ppModule
! 217: ){
! 218: *ppModule = &simpleTokenizerModule;
! 219: }
! 220:
! 221: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>