Annotation of embedaddon/sqlite3/ext/fts2/fts2_icu.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2007 June 22
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: *************************************************************************
12: ** This file implements a tokenizer for fts2 based on the ICU library.
13: **
14: ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
15: */
16:
17: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18: #ifdef SQLITE_ENABLE_ICU
19:
20: #include <assert.h>
21: #include <string.h>
22: #include "fts2_tokenizer.h"
23:
24: #include <unicode/ubrk.h>
25: #include <unicode/ucol.h>
26: #include <unicode/ustring.h>
27: #include <unicode/utf16.h>
28:
29: typedef struct IcuTokenizer IcuTokenizer;
30: typedef struct IcuCursor IcuCursor;
31:
32: struct IcuTokenizer {
33: sqlite3_tokenizer base;
34: char *zLocale;
35: };
36:
37: struct IcuCursor {
38: sqlite3_tokenizer_cursor base;
39:
40: UBreakIterator *pIter; /* ICU break-iterator object */
41: int nChar; /* Number of UChar elements in pInput */
42: UChar *aChar; /* Copy of input using utf-16 encoding */
43: int *aOffset; /* Offsets of each character in utf-8 input */
44:
45: int nBuffer;
46: char *zBuffer;
47:
48: int iToken;
49: };
50:
51: /*
52: ** Create a new tokenizer instance.
53: */
54: static int icuCreate(
55: int argc, /* Number of entries in argv[] */
56: const char * const *argv, /* Tokenizer creation arguments */
57: sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
58: ){
59: IcuTokenizer *p;
60: int n = 0;
61:
62: if( argc>0 ){
63: n = strlen(argv[0])+1;
64: }
65: p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
66: if( !p ){
67: return SQLITE_NOMEM;
68: }
69: memset(p, 0, sizeof(IcuTokenizer));
70:
71: if( n ){
72: p->zLocale = (char *)&p[1];
73: memcpy(p->zLocale, argv[0], n);
74: }
75:
76: *ppTokenizer = (sqlite3_tokenizer *)p;
77:
78: return SQLITE_OK;
79: }
80:
81: /*
82: ** Destroy a tokenizer
83: */
84: static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
86: sqlite3_free(p);
87: return SQLITE_OK;
88: }
89:
90: /*
91: ** Prepare to begin tokenizing a particular string. The input
92: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
93: ** used to incrementally tokenize this string is returned in
94: ** *ppCursor.
95: */
96: static int icuOpen(
97: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
98: const char *zInput, /* Input string */
99: int nInput, /* Length of zInput in bytes */
100: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
101: ){
102: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
103: IcuCursor *pCsr;
104:
105: const int32_t opt = U_FOLD_CASE_DEFAULT;
106: UErrorCode status = U_ZERO_ERROR;
107: int nChar;
108:
109: UChar32 c;
110: int iInput = 0;
111: int iOut = 0;
112:
113: *ppCursor = 0;
114:
115: if( nInput<0 ){
116: nInput = strlen(zInput);
117: }
118: nChar = nInput+1;
119: pCsr = (IcuCursor *)sqlite3_malloc(
120: sizeof(IcuCursor) + /* IcuCursor */
121: nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
122: (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
123: );
124: if( !pCsr ){
125: return SQLITE_NOMEM;
126: }
127: memset(pCsr, 0, sizeof(IcuCursor));
128: pCsr->aChar = (UChar *)&pCsr[1];
129: pCsr->aOffset = (int *)&pCsr->aChar[nChar];
130:
131: pCsr->aOffset[iOut] = iInput;
132: U8_NEXT(zInput, iInput, nInput, c);
133: while( c>0 ){
134: int isError = 0;
135: c = u_foldCase(c, opt);
136: U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
137: if( isError ){
138: sqlite3_free(pCsr);
139: return SQLITE_ERROR;
140: }
141: pCsr->aOffset[iOut] = iInput;
142:
143: if( iInput<nInput ){
144: U8_NEXT(zInput, iInput, nInput, c);
145: }else{
146: c = 0;
147: }
148: }
149:
150: pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
151: if( !U_SUCCESS(status) ){
152: sqlite3_free(pCsr);
153: return SQLITE_ERROR;
154: }
155: pCsr->nChar = iOut;
156:
157: ubrk_first(pCsr->pIter);
158: *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
159: return SQLITE_OK;
160: }
161:
162: /*
163: ** Close a tokenization cursor previously opened by a call to icuOpen().
164: */
165: static int icuClose(sqlite3_tokenizer_cursor *pCursor){
166: IcuCursor *pCsr = (IcuCursor *)pCursor;
167: ubrk_close(pCsr->pIter);
168: sqlite3_free(pCsr->zBuffer);
169: sqlite3_free(pCsr);
170: return SQLITE_OK;
171: }
172:
173: /*
174: ** Extract the next token from a tokenization cursor.
175: */
176: static int icuNext(
177: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
178: const char **ppToken, /* OUT: *ppToken is the token text */
179: int *pnBytes, /* OUT: Number of bytes in token */
180: int *piStartOffset, /* OUT: Starting offset of token */
181: int *piEndOffset, /* OUT: Ending offset of token */
182: int *piPosition /* OUT: Position integer of token */
183: ){
184: IcuCursor *pCsr = (IcuCursor *)pCursor;
185:
186: int iStart = 0;
187: int iEnd = 0;
188: int nByte = 0;
189:
190: while( iStart==iEnd ){
191: UChar32 c;
192:
193: iStart = ubrk_current(pCsr->pIter);
194: iEnd = ubrk_next(pCsr->pIter);
195: if( iEnd==UBRK_DONE ){
196: return SQLITE_DONE;
197: }
198:
199: while( iStart<iEnd ){
200: int iWhite = iStart;
201: U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
202: if( u_isspace(c) ){
203: iStart = iWhite;
204: }else{
205: break;
206: }
207: }
208: assert(iStart<=iEnd);
209: }
210:
211: do {
212: UErrorCode status = U_ZERO_ERROR;
213: if( nByte ){
214: char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
215: if( !zNew ){
216: return SQLITE_NOMEM;
217: }
218: pCsr->zBuffer = zNew;
219: pCsr->nBuffer = nByte;
220: }
221:
222: u_strToUTF8(
223: pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
224: &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
225: &status /* Output success/failure */
226: );
227: } while( nByte>pCsr->nBuffer );
228:
229: *ppToken = pCsr->zBuffer;
230: *pnBytes = nByte;
231: *piStartOffset = pCsr->aOffset[iStart];
232: *piEndOffset = pCsr->aOffset[iEnd];
233: *piPosition = pCsr->iToken++;
234:
235: return SQLITE_OK;
236: }
237:
238: /*
239: ** The set of routines that implement the ICU tokenizer
240: */
241: static const sqlite3_tokenizer_module icuTokenizerModule = {
242: 0, /* iVersion */
243: icuCreate, /* xCreate */
244: icuDestroy, /* xDestroy */
245: icuOpen, /* xOpen */
246: icuClose, /* xClose */
247: icuNext, /* xNext */
248: };
249:
250: /*
251: ** Set *ppModule to point at the implementation of the ICU tokenizer.
252: */
253: void sqlite3Fts2IcuTokenizerModule(
254: sqlite3_tokenizer_module const**ppModule
255: ){
256: *ppModule = &icuTokenizerModule;
257: }
258:
259: #endif /* defined(SQLITE_ENABLE_ICU) */
260: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>