Annotation of embedaddon/sqlite3/ext/fts2/fts2_icu.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2007 June 22
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: *************************************************************************
! 12: ** This file implements a tokenizer for fts2 based on the ICU library.
! 13: **
! 14: ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
! 15: */
! 16:
! 17: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
! 18: #ifdef SQLITE_ENABLE_ICU
! 19:
! 20: #include <assert.h>
! 21: #include <string.h>
! 22: #include "fts2_tokenizer.h"
! 23:
! 24: #include <unicode/ubrk.h>
! 25: #include <unicode/ucol.h>
! 26: #include <unicode/ustring.h>
! 27: #include <unicode/utf16.h>
! 28:
! 29: typedef struct IcuTokenizer IcuTokenizer;
! 30: typedef struct IcuCursor IcuCursor;
! 31:
! 32: struct IcuTokenizer {
! 33: sqlite3_tokenizer base;
! 34: char *zLocale;
! 35: };
! 36:
! 37: struct IcuCursor {
! 38: sqlite3_tokenizer_cursor base;
! 39:
! 40: UBreakIterator *pIter; /* ICU break-iterator object */
! 41: int nChar; /* Number of UChar elements in pInput */
! 42: UChar *aChar; /* Copy of input using utf-16 encoding */
! 43: int *aOffset; /* Offsets of each character in utf-8 input */
! 44:
! 45: int nBuffer;
! 46: char *zBuffer;
! 47:
! 48: int iToken;
! 49: };
! 50:
! 51: /*
! 52: ** Create a new tokenizer instance.
! 53: */
! 54: static int icuCreate(
! 55: int argc, /* Number of entries in argv[] */
! 56: const char * const *argv, /* Tokenizer creation arguments */
! 57: sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
! 58: ){
! 59: IcuTokenizer *p;
! 60: int n = 0;
! 61:
! 62: if( argc>0 ){
! 63: n = strlen(argv[0])+1;
! 64: }
! 65: p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
! 66: if( !p ){
! 67: return SQLITE_NOMEM;
! 68: }
! 69: memset(p, 0, sizeof(IcuTokenizer));
! 70:
! 71: if( n ){
! 72: p->zLocale = (char *)&p[1];
! 73: memcpy(p->zLocale, argv[0], n);
! 74: }
! 75:
! 76: *ppTokenizer = (sqlite3_tokenizer *)p;
! 77:
! 78: return SQLITE_OK;
! 79: }
! 80:
! 81: /*
! 82: ** Destroy a tokenizer
! 83: */
! 84: static int icuDestroy(sqlite3_tokenizer *pTokenizer){
! 85: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
! 86: sqlite3_free(p);
! 87: return SQLITE_OK;
! 88: }
! 89:
! 90: /*
! 91: ** Prepare to begin tokenizing a particular string. The input
! 92: ** string to be tokenized is pInput[0..nBytes-1]. A cursor
! 93: ** used to incrementally tokenize this string is returned in
! 94: ** *ppCursor.
! 95: */
! 96: static int icuOpen(
! 97: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 98: const char *zInput, /* Input string */
! 99: int nInput, /* Length of zInput in bytes */
! 100: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 101: ){
! 102: IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
! 103: IcuCursor *pCsr;
! 104:
! 105: const int32_t opt = U_FOLD_CASE_DEFAULT;
! 106: UErrorCode status = U_ZERO_ERROR;
! 107: int nChar;
! 108:
! 109: UChar32 c;
! 110: int iInput = 0;
! 111: int iOut = 0;
! 112:
! 113: *ppCursor = 0;
! 114:
! 115: if( nInput<0 ){
! 116: nInput = strlen(zInput);
! 117: }
! 118: nChar = nInput+1;
! 119: pCsr = (IcuCursor *)sqlite3_malloc(
! 120: sizeof(IcuCursor) + /* IcuCursor */
! 121: nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
! 122: (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
! 123: );
! 124: if( !pCsr ){
! 125: return SQLITE_NOMEM;
! 126: }
! 127: memset(pCsr, 0, sizeof(IcuCursor));
! 128: pCsr->aChar = (UChar *)&pCsr[1];
! 129: pCsr->aOffset = (int *)&pCsr->aChar[nChar];
! 130:
! 131: pCsr->aOffset[iOut] = iInput;
! 132: U8_NEXT(zInput, iInput, nInput, c);
! 133: while( c>0 ){
! 134: int isError = 0;
! 135: c = u_foldCase(c, opt);
! 136: U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
! 137: if( isError ){
! 138: sqlite3_free(pCsr);
! 139: return SQLITE_ERROR;
! 140: }
! 141: pCsr->aOffset[iOut] = iInput;
! 142:
! 143: if( iInput<nInput ){
! 144: U8_NEXT(zInput, iInput, nInput, c);
! 145: }else{
! 146: c = 0;
! 147: }
! 148: }
! 149:
! 150: pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
! 151: if( !U_SUCCESS(status) ){
! 152: sqlite3_free(pCsr);
! 153: return SQLITE_ERROR;
! 154: }
! 155: pCsr->nChar = iOut;
! 156:
! 157: ubrk_first(pCsr->pIter);
! 158: *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
! 159: return SQLITE_OK;
! 160: }
! 161:
! 162: /*
! 163: ** Close a tokenization cursor previously opened by a call to icuOpen().
! 164: */
! 165: static int icuClose(sqlite3_tokenizer_cursor *pCursor){
! 166: IcuCursor *pCsr = (IcuCursor *)pCursor;
! 167: ubrk_close(pCsr->pIter);
! 168: sqlite3_free(pCsr->zBuffer);
! 169: sqlite3_free(pCsr);
! 170: return SQLITE_OK;
! 171: }
! 172:
! 173: /*
! 174: ** Extract the next token from a tokenization cursor.
! 175: */
! 176: static int icuNext(
! 177: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
! 178: const char **ppToken, /* OUT: *ppToken is the token text */
! 179: int *pnBytes, /* OUT: Number of bytes in token */
! 180: int *piStartOffset, /* OUT: Starting offset of token */
! 181: int *piEndOffset, /* OUT: Ending offset of token */
! 182: int *piPosition /* OUT: Position integer of token */
! 183: ){
! 184: IcuCursor *pCsr = (IcuCursor *)pCursor;
! 185:
! 186: int iStart = 0;
! 187: int iEnd = 0;
! 188: int nByte = 0;
! 189:
! 190: while( iStart==iEnd ){
! 191: UChar32 c;
! 192:
! 193: iStart = ubrk_current(pCsr->pIter);
! 194: iEnd = ubrk_next(pCsr->pIter);
! 195: if( iEnd==UBRK_DONE ){
! 196: return SQLITE_DONE;
! 197: }
! 198:
! 199: while( iStart<iEnd ){
! 200: int iWhite = iStart;
! 201: U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
! 202: if( u_isspace(c) ){
! 203: iStart = iWhite;
! 204: }else{
! 205: break;
! 206: }
! 207: }
! 208: assert(iStart<=iEnd);
! 209: }
! 210:
! 211: do {
! 212: UErrorCode status = U_ZERO_ERROR;
! 213: if( nByte ){
! 214: char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
! 215: if( !zNew ){
! 216: return SQLITE_NOMEM;
! 217: }
! 218: pCsr->zBuffer = zNew;
! 219: pCsr->nBuffer = nByte;
! 220: }
! 221:
! 222: u_strToUTF8(
! 223: pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
! 224: &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
! 225: &status /* Output success/failure */
! 226: );
! 227: } while( nByte>pCsr->nBuffer );
! 228:
! 229: *ppToken = pCsr->zBuffer;
! 230: *pnBytes = nByte;
! 231: *piStartOffset = pCsr->aOffset[iStart];
! 232: *piEndOffset = pCsr->aOffset[iEnd];
! 233: *piPosition = pCsr->iToken++;
! 234:
! 235: return SQLITE_OK;
! 236: }
! 237:
! 238: /*
! 239: ** The set of routines that implement the simple tokenizer
! 240: */
! 241: static const sqlite3_tokenizer_module icuTokenizerModule = {
! 242: 0, /* iVersion */
! 243: icuCreate, /* xCreate */
! 244: icuDestroy, /* xCreate */
! 245: icuOpen, /* xOpen */
! 246: icuClose, /* xClose */
! 247: icuNext, /* xNext */
! 248: };
! 249:
! 250: /*
! 251: ** Set *ppModule to point at the implementation of the ICU tokenizer.
! 252: */
! 253: void sqlite3Fts2IcuTokenizerModule(
! 254: sqlite3_tokenizer_module const**ppModule
! 255: ){
! 256: *ppModule = &icuTokenizerModule;
! 257: }
! 258:
! 259: #endif /* defined(SQLITE_ENABLE_ICU) */
! 260: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>