Return to simple_tokenizer.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1 |
1.1 ! misho 1: /* ! 2: ** The author disclaims copyright to this source code. ! 3: ** ! 4: ************************************************************************* ! 5: ** Implementation of the "simple" full-text-search tokenizer. ! 6: */ ! 7: ! 8: #include <assert.h> ! 9: #if !defined(__APPLE__) ! 10: #include <malloc.h> ! 11: #else ! 12: #include <stdlib.h> ! 13: #endif ! 14: #include <stdio.h> ! 15: #include <string.h> ! 16: #include <ctype.h> ! 17: ! 18: #include "tokenizer.h" ! 19: ! 20: /* Duplicate a string; the caller must free() the returned string. ! 21: * (We don't use strdup() since it's not part of the standard C library and ! 22: * may not be available everywhere.) */ ! 23: /* TODO(shess) Copied from fulltext.c, consider util.c for such ! 24: ** things. */ ! 25: static char *string_dup(const char *s){ ! 26: char *str = malloc(strlen(s) + 1); ! 27: strcpy(str, s); ! 28: return str; ! 29: } ! 30: ! 31: typedef struct simple_tokenizer { ! 32: sqlite3_tokenizer base; ! 33: const char *zDelim; /* token delimiters */ ! 34: } simple_tokenizer; ! 35: ! 36: typedef struct simple_tokenizer_cursor { ! 37: sqlite3_tokenizer_cursor base; ! 38: const char *pInput; /* input we are tokenizing */ ! 39: int nBytes; /* size of the input */ ! 40: const char *pCurrent; /* current position in pInput */ ! 41: int iToken; /* index of next token to be returned */ ! 42: char *zToken; /* storage for current token */ ! 43: int nTokenBytes; /* actual size of current token */ ! 44: int nTokenAllocated; /* space allocated to zToken buffer */ ! 45: } simple_tokenizer_cursor; ! 46: ! 47: static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ ! 48: ! 49: static int simpleCreate( ! 50: int argc, const char **argv, ! 51: sqlite3_tokenizer **ppTokenizer ! 52: ){ ! 53: simple_tokenizer *t; ! 54: ! 55: t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); ! 56: /* TODO(shess) Delimiters need to remain the same from run to run, ! 57: ** else we need to reindex. One solution would be a meta-table to ! 58: ** track such information in the database, then we'd only want this ! 59: ** information on the initial create. ! 60: */ ! 61: if( argc>1 ){ ! 62: t->zDelim = string_dup(argv[1]); ! 63: } else { ! 64: /* Build a string excluding alphanumeric ASCII characters */ ! 65: char zDelim[0x80]; /* nul-terminated, so nul not a member */ ! 66: int i, j; ! 67: for(i=1, j=0; i<0x80; i++){ ! 68: if( !isalnum(i) ){ ! 69: zDelim[j++] = i; ! 70: } ! 71: } ! 72: zDelim[j++] = '\0'; ! 73: assert( j<=sizeof(zDelim) ); ! 74: t->zDelim = string_dup(zDelim); ! 75: } ! 76: ! 77: *ppTokenizer = &t->base; ! 78: return SQLITE_OK; ! 79: } ! 80: ! 81: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ ! 82: simple_tokenizer *t = (simple_tokenizer *) pTokenizer; ! 83: ! 84: free((void *) t->zDelim); ! 85: free(t); ! 86: ! 87: return SQLITE_OK; ! 88: } ! 89: ! 90: static int simpleOpen( ! 91: sqlite3_tokenizer *pTokenizer, ! 92: const char *pInput, int nBytes, ! 93: sqlite3_tokenizer_cursor **ppCursor ! 94: ){ ! 95: simple_tokenizer_cursor *c; ! 96: ! 97: c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); ! 98: c->pInput = pInput; ! 99: c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; ! 100: c->pCurrent = c->pInput; /* start tokenizing at the beginning */ ! 101: c->iToken = 0; ! 102: c->zToken = NULL; /* no space allocated, yet. */ ! 103: c->nTokenBytes = 0; ! 104: c->nTokenAllocated = 0; ! 105: ! 106: *ppCursor = &c->base; ! 107: return SQLITE_OK; ! 108: } ! 109: ! 110: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ ! 111: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; ! 112: ! 113: if( NULL!=c->zToken ){ ! 114: free(c->zToken); ! 115: } ! 116: free(c); ! 117: ! 118: return SQLITE_OK; ! 119: } ! 120: ! 121: static int simpleNext( ! 122: sqlite3_tokenizer_cursor *pCursor, ! 123: const char **ppToken, int *pnBytes, ! 124: int *piStartOffset, int *piEndOffset, int *piPosition ! 125: ){ ! 126: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; ! 127: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; ! 128: int ii; ! 129: ! 130: while( c->pCurrent-c->pInput<c->nBytes ){ ! 131: int n = (int) strcspn(c->pCurrent, t->zDelim); ! 132: if( n>0 ){ ! 133: if( n+1>c->nTokenAllocated ){ ! 134: c->zToken = realloc(c->zToken, n+1); ! 135: } ! 136: for(ii=0; ii<n; ii++){ ! 137: /* TODO(shess) This needs expansion to handle UTF-8 ! 138: ** case-insensitivity. ! 139: */ ! 140: char ch = c->pCurrent[ii]; ! 141: c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch; ! 142: } ! 143: c->zToken[n] = '\0'; ! 144: *ppToken = c->zToken; ! 145: *pnBytes = n; ! 146: *piStartOffset = (int) (c->pCurrent-c->pInput); ! 147: *piEndOffset = *piStartOffset+n; ! 148: *piPosition = c->iToken++; ! 149: c->pCurrent += n + 1; ! 150: ! 151: return SQLITE_OK; ! 152: } ! 153: c->pCurrent += n + 1; ! 154: /* TODO(shess) could strspn() to skip delimiters en masse. Needs ! 155: ** to happen in two places, though, which is annoying. ! 156: */ ! 157: } ! 158: return SQLITE_DONE; ! 159: } ! 160: ! 161: static sqlite3_tokenizer_module simpleTokenizerModule = { ! 162: 0, ! 163: simpleCreate, ! 164: simpleDestroy, ! 165: simpleOpen, ! 166: simpleClose, ! 167: simpleNext, ! 168: }; ! 169: ! 170: void get_simple_tokenizer_module( ! 171: sqlite3_tokenizer_module **ppModule ! 172: ){ ! 173: *ppModule = &simpleTokenizerModule; ! 174: }