Annotation of embedaddon/sqlite3/ext/fts3/fts3_icu.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2007 June 22
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: *************************************************************************
12: ** This file implements a tokenizer for fts3 based on the ICU library.
13: */
14: #include "fts3Int.h"
15: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
16: #ifdef SQLITE_ENABLE_ICU
17:
18: #include <assert.h>
19: #include <string.h>
20: #include "fts3_tokenizer.h"
21:
22: #include <unicode/ubrk.h>
23: #include <unicode/ucol.h>
24: #include <unicode/ustring.h>
25: #include <unicode/utf16.h>
26:
27: typedef struct IcuTokenizer IcuTokenizer;
28: typedef struct IcuCursor IcuCursor;
29:
30: struct IcuTokenizer {
31: sqlite3_tokenizer base;
32: char *zLocale;
33: };
34:
35: struct IcuCursor {
36: sqlite3_tokenizer_cursor base;
37:
38: UBreakIterator *pIter; /* ICU break-iterator object */
39: int nChar; /* Number of UChar elements in pInput */
40: UChar *aChar; /* Copy of input using utf-16 encoding */
41: int *aOffset; /* Offsets of each character in utf-8 input */
42:
43: int nBuffer;
44: char *zBuffer;
45:
46: int iToken;
47: };
48:
49: /*
50: ** Create a new tokenizer instance.
51: */
52: static int icuCreate(
53: int argc, /* Number of entries in argv[] */
54: const char * const *argv, /* Tokenizer creation arguments */
55: sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
56: ){
57: IcuTokenizer *p;
58: int n = 0;
59:
60: if( argc>0 ){
61: n = strlen(argv[0])+1;
62: }
63: p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
64: if( !p ){
65: return SQLITE_NOMEM;
66: }
67: memset(p, 0, sizeof(IcuTokenizer));
68:
69: if( n ){
70: p->zLocale = (char *)&p[1];
71: memcpy(p->zLocale, argv[0], n);
72: }
73:
74: *ppTokenizer = (sqlite3_tokenizer *)p;
75:
76: return SQLITE_OK;
77: }
78:
79: /*
80: ** Destroy a tokenizer
81: */
82: static int icuDestroy(sqlite3_tokenizer *pTokenizer){
83: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
84: sqlite3_free(p);
85: return SQLITE_OK;
86: }
87:
88: /*
89: ** Prepare to begin tokenizing a particular string. The input
90: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
91: ** used to incrementally tokenize this string is returned in
92: ** *ppCursor.
93: */
94: static int icuOpen(
95: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
96: const char *zInput, /* Input string */
97: int nInput, /* Length of zInput in bytes */
98: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
99: ){
100: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
101: IcuCursor *pCsr;
102:
103: const int32_t opt = U_FOLD_CASE_DEFAULT;
104: UErrorCode status = U_ZERO_ERROR;
105: int nChar;
106:
107: UChar32 c;
108: int iInput = 0;
109: int iOut = 0;
110:
111: *ppCursor = 0;
112:
113: if( nInput<0 ){
114: nInput = strlen(zInput);
115: }
116: nChar = nInput+1;
117: pCsr = (IcuCursor *)sqlite3_malloc(
118: sizeof(IcuCursor) + /* IcuCursor */
119: nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
120: (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
121: );
122: if( !pCsr ){
123: return SQLITE_NOMEM;
124: }
125: memset(pCsr, 0, sizeof(IcuCursor));
126: pCsr->aChar = (UChar *)&pCsr[1];
127: pCsr->aOffset = (int *)&pCsr->aChar[nChar];
128:
129: pCsr->aOffset[iOut] = iInput;
130: U8_NEXT(zInput, iInput, nInput, c);
131: while( c>0 ){
132: int isError = 0;
133: c = u_foldCase(c, opt);
134: U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
135: if( isError ){
136: sqlite3_free(pCsr);
137: return SQLITE_ERROR;
138: }
139: pCsr->aOffset[iOut] = iInput;
140:
141: if( iInput<nInput ){
142: U8_NEXT(zInput, iInput, nInput, c);
143: }else{
144: c = 0;
145: }
146: }
147:
148: pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
149: if( !U_SUCCESS(status) ){
150: sqlite3_free(pCsr);
151: return SQLITE_ERROR;
152: }
153: pCsr->nChar = iOut;
154:
155: ubrk_first(pCsr->pIter);
156: *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
157: return SQLITE_OK;
158: }
159:
160: /*
161: ** Close a tokenization cursor previously opened by a call to icuOpen().
162: */
163: static int icuClose(sqlite3_tokenizer_cursor *pCursor){
164: IcuCursor *pCsr = (IcuCursor *)pCursor;
165: ubrk_close(pCsr->pIter);
166: sqlite3_free(pCsr->zBuffer);
167: sqlite3_free(pCsr);
168: return SQLITE_OK;
169: }
170:
171: /*
172: ** Extract the next token from a tokenization cursor.
173: */
174: static int icuNext(
175: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
176: const char **ppToken, /* OUT: *ppToken is the token text */
177: int *pnBytes, /* OUT: Number of bytes in token */
178: int *piStartOffset, /* OUT: Starting offset of token */
179: int *piEndOffset, /* OUT: Ending offset of token */
180: int *piPosition /* OUT: Position integer of token */
181: ){
182: IcuCursor *pCsr = (IcuCursor *)pCursor;
183:
184: int iStart = 0;
185: int iEnd = 0;
186: int nByte = 0;
187:
188: while( iStart==iEnd ){
189: UChar32 c;
190:
191: iStart = ubrk_current(pCsr->pIter);
192: iEnd = ubrk_next(pCsr->pIter);
193: if( iEnd==UBRK_DONE ){
194: return SQLITE_DONE;
195: }
196:
197: while( iStart<iEnd ){
198: int iWhite = iStart;
199: U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
200: if( u_isspace(c) ){
201: iStart = iWhite;
202: }else{
203: break;
204: }
205: }
206: assert(iStart<=iEnd);
207: }
208:
209: do {
210: UErrorCode status = U_ZERO_ERROR;
211: if( nByte ){
212: char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
213: if( !zNew ){
214: return SQLITE_NOMEM;
215: }
216: pCsr->zBuffer = zNew;
217: pCsr->nBuffer = nByte;
218: }
219:
220: u_strToUTF8(
221: pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
222: &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
223: &status /* Output success/failure */
224: );
225: } while( nByte>pCsr->nBuffer );
226:
227: *ppToken = pCsr->zBuffer;
228: *pnBytes = nByte;
229: *piStartOffset = pCsr->aOffset[iStart];
230: *piEndOffset = pCsr->aOffset[iEnd];
231: *piPosition = pCsr->iToken++;
232:
233: return SQLITE_OK;
234: }
235:
236: /*
237: ** The set of routines that implement the ICU tokenizer
238: */
239: static const sqlite3_tokenizer_module icuTokenizerModule = {
240: 0, /* iVersion */
241: icuCreate, /* xCreate */
242: icuDestroy, /* xDestroy */
243: icuOpen, /* xOpen */
244: icuClose, /* xClose */
245: icuNext, /* xNext */
246: };
247:
248: /*
249: ** Set *ppModule to point at the implementation of the ICU tokenizer.
250: */
251: void sqlite3Fts3IcuTokenizerModule(
252: sqlite3_tokenizer_module const**ppModule
253: ){
254: *ppModule = &icuTokenizerModule;
255: }
256:
257: #endif /* defined(SQLITE_ENABLE_ICU) */
258: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>