Annotation of embedaddon/sqlite3/ext/fts1/simple_tokenizer.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** The author disclaims copyright to this source code.
3: **
4: *************************************************************************
5: ** Implementation of the "simple" full-text-search tokenizer.
6: */
7:
8: #include <assert.h>
9: #if !defined(__APPLE__)
10: #include <malloc.h>
11: #else
12: #include <stdlib.h>
13: #endif
14: #include <stdio.h>
15: #include <string.h>
16: #include <ctype.h>
17:
18: #include "tokenizer.h"
19:
/*
** Return a freshly malloc()ed copy of the nul-terminated string s.  The
** caller owns the result and must free() it.
**
** Returns NULL if s is NULL or if the allocation fails.
**
** (We don't use strdup() since it's not part of the standard C library and
** may not be available everywhere.)
*/
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nByte;
  char *str;

  if( s==NULL ) return NULL;
  nByte = strlen(s) + 1;          /* include the nul terminator */
  str = malloc(nByte);
  if( str!=NULL ){
    memcpy(str, s, nByte);
  }
  return str;
}
30:
31: typedef struct simple_tokenizer {
32: sqlite3_tokenizer base;
33: const char *zDelim; /* token delimiters */
34: } simple_tokenizer;
35:
36: typedef struct simple_tokenizer_cursor {
37: sqlite3_tokenizer_cursor base;
38: const char *pInput; /* input we are tokenizing */
39: int nBytes; /* size of the input */
40: const char *pCurrent; /* current position in pInput */
41: int iToken; /* index of next token to be returned */
42: char *zToken; /* storage for current token */
43: int nTokenBytes; /* actual size of current token */
44: int nTokenAllocated; /* space allocated to zToken buffer */
45: } simple_tokenizer_cursor;
46:
47: static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
48:
49: static int simpleCreate(
50: int argc, const char **argv,
51: sqlite3_tokenizer **ppTokenizer
52: ){
53: simple_tokenizer *t;
54:
55: t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
56: /* TODO(shess) Delimiters need to remain the same from run to run,
57: ** else we need to reindex. One solution would be a meta-table to
58: ** track such information in the database, then we'd only want this
59: ** information on the initial create.
60: */
61: if( argc>1 ){
62: t->zDelim = string_dup(argv[1]);
63: } else {
64: /* Build a string excluding alphanumeric ASCII characters */
65: char zDelim[0x80]; /* nul-terminated, so nul not a member */
66: int i, j;
67: for(i=1, j=0; i<0x80; i++){
68: if( !isalnum(i) ){
69: zDelim[j++] = i;
70: }
71: }
72: zDelim[j++] = '\0';
73: assert( j<=sizeof(zDelim) );
74: t->zDelim = string_dup(zDelim);
75: }
76:
77: *ppTokenizer = &t->base;
78: return SQLITE_OK;
79: }
80:
81: static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
82: simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
83:
84: free((void *) t->zDelim);
85: free(t);
86:
87: return SQLITE_OK;
88: }
89:
90: static int simpleOpen(
91: sqlite3_tokenizer *pTokenizer,
92: const char *pInput, int nBytes,
93: sqlite3_tokenizer_cursor **ppCursor
94: ){
95: simple_tokenizer_cursor *c;
96:
97: c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
98: c->pInput = pInput;
99: c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
100: c->pCurrent = c->pInput; /* start tokenizing at the beginning */
101: c->iToken = 0;
102: c->zToken = NULL; /* no space allocated, yet. */
103: c->nTokenBytes = 0;
104: c->nTokenAllocated = 0;
105:
106: *ppCursor = &c->base;
107: return SQLITE_OK;
108: }
109:
110: static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
111: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
112:
113: if( NULL!=c->zToken ){
114: free(c->zToken);
115: }
116: free(c);
117:
118: return SQLITE_OK;
119: }
120:
121: static int simpleNext(
122: sqlite3_tokenizer_cursor *pCursor,
123: const char **ppToken, int *pnBytes,
124: int *piStartOffset, int *piEndOffset, int *piPosition
125: ){
126: simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
127: simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
128: int ii;
129:
130: while( c->pCurrent-c->pInput<c->nBytes ){
131: int n = (int) strcspn(c->pCurrent, t->zDelim);
132: if( n>0 ){
133: if( n+1>c->nTokenAllocated ){
134: c->zToken = realloc(c->zToken, n+1);
135: }
136: for(ii=0; ii<n; ii++){
137: /* TODO(shess) This needs expansion to handle UTF-8
138: ** case-insensitivity.
139: */
140: char ch = c->pCurrent[ii];
141: c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
142: }
143: c->zToken[n] = '\0';
144: *ppToken = c->zToken;
145: *pnBytes = n;
146: *piStartOffset = (int) (c->pCurrent-c->pInput);
147: *piEndOffset = *piStartOffset+n;
148: *piPosition = c->iToken++;
149: c->pCurrent += n + 1;
150:
151: return SQLITE_OK;
152: }
153: c->pCurrent += n + 1;
154: /* TODO(shess) could strspn() to skip delimiters en masse. Needs
155: ** to happen in two places, though, which is annoying.
156: */
157: }
158: return SQLITE_DONE;
159: }
160:
161: static sqlite3_tokenizer_module simpleTokenizerModule = {
162: 0,
163: simpleCreate,
164: simpleDestroy,
165: simpleOpen,
166: simpleClose,
167: simpleNext,
168: };
169:
170: void get_simple_tokenizer_module(
171: sqlite3_tokenizer_module **ppModule
172: ){
173: *ppModule = &simpleTokenizerModule;
174: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>