Annotation of embedaddon/sqlite3/ext/fts1/simple_tokenizer.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** The author disclaims copyright to this source code.
! 3: **
! 4: *************************************************************************
! 5: ** Implementation of the "simple" full-text-search tokenizer.
! 6: */
! 7:
! 8: #include <assert.h>
! 9: #if !defined(__APPLE__)
! 10: #include <malloc.h>
! 11: #else
! 12: #include <stdlib.h>
! 13: #endif
! 14: #include <stdio.h>
! 15: #include <string.h>
! 16: #include <ctype.h>
! 17:
! 18: #include "tokenizer.h"
! 19:
/*
** Duplicate a NUL-terminated string into freshly allocated memory.
**
** The caller owns the result and must release it with free().  Returns
** NULL if s is NULL or if the allocation fails.
**
** (We don't use strdup() since it's not part of the standard C library
** and may not be available everywhere.)
*/
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nByte;
  char *str;

  if( s==NULL ) return NULL;
  nByte = strlen(s) + 1;          /* include the NUL terminator */
  str = malloc(nByte);
  if( str!=NULL ){
    memcpy(str, s, nByte);
  }
  return str;
}
! 30:
! 31: typedef struct simple_tokenizer {
! 32: sqlite3_tokenizer base;
! 33: const char *zDelim; /* token delimiters */
! 34: } simple_tokenizer;
! 35:
! 36: typedef struct simple_tokenizer_cursor {
! 37: sqlite3_tokenizer_cursor base;
! 38: const char *pInput; /* input we are tokenizing */
! 39: int nBytes; /* size of the input */
! 40: const char *pCurrent; /* current position in pInput */
! 41: int iToken; /* index of next token to be returned */
! 42: char *zToken; /* storage for current token */
! 43: int nTokenBytes; /* actual size of current token */
! 44: int nTokenAllocated; /* space allocated to zToken buffer */
! 45: } simple_tokenizer_cursor;
! 46:
! 47: static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
! 48:
! 49: static int simpleCreate(
! 50: int argc, const char **argv,
! 51: sqlite3_tokenizer **ppTokenizer
! 52: ){
! 53: simple_tokenizer *t;
! 54:
! 55: t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
! 56: /* TODO(shess) Delimiters need to remain the same from run to run,
! 57: ** else we need to reindex. One solution would be a meta-table to
! 58: ** track such information in the database, then we'd only want this
! 59: ** information on the initial create.
! 60: */
! 61: if( argc>1 ){
! 62: t->zDelim = string_dup(argv[1]);
! 63: } else {
! 64: /* Build a string excluding alphanumeric ASCII characters */
! 65: char zDelim[0x80]; /* nul-terminated, so nul not a member */
! 66: int i, j;
! 67: for(i=1, j=0; i<0x80; i++){
! 68: if( !isalnum(i) ){
! 69: zDelim[j++] = i;
! 70: }
! 71: }
! 72: zDelim[j++] = '\0';
! 73: assert( j<=sizeof(zDelim) );
! 74: t->zDelim = string_dup(zDelim);
! 75: }
! 76:
! 77: *ppTokenizer = &t->base;
! 78: return SQLITE_OK;
! 79: }
! 80:
! 81: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
! 82: simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
! 83:
! 84: free((void *) t->zDelim);
! 85: free(t);
! 86:
! 87: return SQLITE_OK;
! 88: }
! 89:
! 90: static int simpleOpen(
! 91: sqlite3_tokenizer *pTokenizer,
! 92: const char *pInput, int nBytes,
! 93: sqlite3_tokenizer_cursor **ppCursor
! 94: ){
! 95: simple_tokenizer_cursor *c;
! 96:
! 97: c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
! 98: c->pInput = pInput;
! 99: c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
! 100: c->pCurrent = c->pInput; /* start tokenizing at the beginning */
! 101: c->iToken = 0;
! 102: c->zToken = NULL; /* no space allocated, yet. */
! 103: c->nTokenBytes = 0;
! 104: c->nTokenAllocated = 0;
! 105:
! 106: *ppCursor = &c->base;
! 107: return SQLITE_OK;
! 108: }
! 109:
! 110: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
! 111: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 112:
! 113: if( NULL!=c->zToken ){
! 114: free(c->zToken);
! 115: }
! 116: free(c);
! 117:
! 118: return SQLITE_OK;
! 119: }
! 120:
! 121: static int simpleNext(
! 122: sqlite3_tokenizer_cursor *pCursor,
! 123: const char **ppToken, int *pnBytes,
! 124: int *piStartOffset, int *piEndOffset, int *piPosition
! 125: ){
! 126: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
! 127: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
! 128: int ii;
! 129:
! 130: while( c->pCurrent-c->pInput<c->nBytes ){
! 131: int n = (int) strcspn(c->pCurrent, t->zDelim);
! 132: if( n>0 ){
! 133: if( n+1>c->nTokenAllocated ){
! 134: c->zToken = realloc(c->zToken, n+1);
! 135: }
! 136: for(ii=0; ii<n; ii++){
! 137: /* TODO(shess) This needs expansion to handle UTF-8
! 138: ** case-insensitivity.
! 139: */
! 140: char ch = c->pCurrent[ii];
! 141: c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
! 142: }
! 143: c->zToken[n] = '\0';
! 144: *ppToken = c->zToken;
! 145: *pnBytes = n;
! 146: *piStartOffset = (int) (c->pCurrent-c->pInput);
! 147: *piEndOffset = *piStartOffset+n;
! 148: *piPosition = c->iToken++;
! 149: c->pCurrent += n + 1;
! 150:
! 151: return SQLITE_OK;
! 152: }
! 153: c->pCurrent += n + 1;
! 154: /* TODO(shess) could strspn() to skip delimiters en masse. Needs
! 155: ** to happen in two places, though, which is annoying.
! 156: */
! 157: }
! 158: return SQLITE_DONE;
! 159: }
! 160:
! 161: static sqlite3_tokenizer_module simpleTokenizerModule = {
! 162: 0,
! 163: simpleCreate,
! 164: simpleDestroy,
! 165: simpleOpen,
! 166: simpleClose,
! 167: simpleNext,
! 168: };
! 169:
! 170: void get_simple_tokenizer_module(
! 171: sqlite3_tokenizer_module **ppModule
! 172: ){
! 173: *ppModule = &simpleTokenizerModule;
! 174: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>