Return to fts2_icu.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts2 |
/*
** 2007 June 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
**
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#ifdef SQLITE_ENABLE_ICU

#include <assert.h>
#include <string.h>
#include "fts2_tokenizer.h"

#include <unicode/ubrk.h>
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>

typedef struct IcuTokenizer IcuTokenizer;
typedef struct IcuCursor IcuCursor;

/*
** An ICU tokenizer instance. The locale string, if one was supplied to
** icuCreate(), is stored in the same allocation immediately after this
** structure, so freeing the structure frees the locale as well.
*/
struct IcuTokenizer {
  sqlite3_tokenizer base;
  char *zLocale;              /* Locale passed to ubrk_open(), or NULL */
};

/*
** A tokenization cursor. The aChar[] and aOffset[] arrays live in the
** same allocation as the cursor itself (see icuOpen()); zBuffer is a
** separate allocation that is grown on demand by icuNext().
*/
struct IcuCursor {
  sqlite3_tokenizer_cursor base;

  UBreakIterator *pIter;      /* ICU break-iterator object */
  int nChar;                  /* Number of UChar elements in aChar[] */
  UChar *aChar;               /* Case-folded copy of input in utf-16 */
  int *aOffset;               /* aOffset[i] is the utf-8 byte offset in the
                              ** input that corresponds to aChar[i] */

  int nBuffer;                /* Size of zBuffer in bytes */
  char *zBuffer;              /* Buffer holding the current token (utf-8) */

  int iToken;                 /* Position to report for the next token */
};

/*
** Create a new tokenizer instance.
**
** If argc>0, argv[0] is copied and later used as the ICU locale for the
** word break iterator. Otherwise the locale is left NULL.
**
** Returns SQLITE_OK and sets *ppTokenizer on success, or SQLITE_NOMEM if
** the allocation fails.
*/
static int icuCreate(
  int argc,                            /* Number of entries in argv[] */
  const char * const *argv,            /* Tokenizer creation arguments */
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
){
  IcuTokenizer *p;
  int n = 0;

  if( argc>0 ){
    n = (int)strlen(argv[0])+1;
  }
  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, sizeof(IcuTokenizer));

  if( n ){
    /* The locale string occupies the bytes following the structure. */
    p->zLocale = (char *)&p[1];
    memcpy(p->zLocale, argv[0], n);
  }

  *ppTokenizer = (sqlite3_tokenizer *)p;

  return SQLITE_OK;
}

/*
** Destroy a tokenizer. The locale string shares the allocation with the
** IcuTokenizer structure, so a single free releases everything.
*/
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is zInput[0..nInput-1].  If nInput is negative,
** zInput must be nul-terminated and its length is measured with strlen().
** A NULL zInput is treated as an empty string.  A cursor used to
** incrementally tokenize this string is returned in *ppCursor.
**
** The input is decoded from utf-8, case-folded and stored as utf-16 in
** the cursor. Decoding stops at the first nul character or malformed
** utf-8 sequence; anything after that point is not tokenized.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the cursor cannot be
** allocated (including when the input is too large for the allocation
** size to fit in an int), or SQLITE_ERROR if ICU reports a failure.
** *ppCursor is set to 0 on any failure.
*/
static int icuOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *zInput,                    /* Input string */
  int nInput,                            /* Length of zInput in bytes */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  IcuCursor *pCsr;

  const int32_t opt = U_FOLD_CASE_DEFAULT;
  UErrorCode status = U_ZERO_ERROR;
  int nChar;                   /* Capacity of aChar[] used for decoding */
  int nAlign;                  /* nChar rounded up to a multiple of 4 */

  /* Worst-case allocation is nFixed + nInput*nPerChar bytes. mxInput is
  ** the largest nInput for which that total still fits in the int
  ** argument of sqlite3_malloc(). */
  const int nFixed = (int)(sizeof(IcuCursor)+4*sizeof(UChar)+2*sizeof(int));
  const int nPerChar = (int)(sizeof(UChar)+sizeof(int));
  const int mxInput = (0x7fffffff - nFixed)/nPerChar;

  UChar32 c;
  int iInput = 0;
  int iOut = 0;

  *ppCursor = 0;

  if( zInput==0 ){
    nInput = 0;
    zInput = "";
  }else if( nInput<0 ){
    size_t nLen = strlen(zInput);
    if( nLen>(size_t)mxInput ){
      return SQLITE_NOMEM;
    }
    nInput = (int)nLen;
  }
  if( nInput>mxInput ){
    return SQLITE_NOMEM;
  }

  nChar = nInput+1;
  /* Pad the UChar region to a multiple of 4 elements so that the int
  ** array that follows it is suitably aligned. */
  nAlign = (nChar+3)&~3;
  pCsr = (IcuCursor *)sqlite3_malloc((int)(
      sizeof(IcuCursor) +                /* IcuCursor */
      nAlign * sizeof(UChar) +           /* IcuCursor.aChar[] (padded) */
      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
  ));
  if( !pCsr ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(IcuCursor));
  pCsr->aChar = (UChar *)&pCsr[1];
  pCsr->aOffset = (int *)&pCsr->aChar[nAlign];

  /* Decode, fold and re-encode one code point at a time. After each
  ** code point is appended, aOffset[iOut] records the utf-8 offset at
  ** which the next utf-16 element (or the end of input) begins. */
  pCsr->aOffset[iOut] = iInput;
  while( iInput<nInput ){
    int isError = 0;
    U8_NEXT(zInput, iInput, nInput, c);
    if( c<=0 ){
      /* Embedded nul or malformed utf-8: stop here. */
      break;
    }
    c = u_foldCase(c, opt);
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
    if( isError ){
      sqlite3_free(pCsr);
      return SQLITE_ERROR;
    }
    pCsr->aOffset[iOut] = iInput;
  }

  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
  if( !U_SUCCESS(status) ){
    sqlite3_free(pCsr);
    return SQLITE_ERROR;
  }
  pCsr->nChar = iOut;

  ubrk_first(pCsr->pIter);
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to icuOpen().
** Releases the break iterator, the token buffer and the cursor itself.
*/
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
  IcuCursor *pCsr = (IcuCursor *)pCursor;
  ubrk_close(pCsr->pIter);
  sqlite3_free(pCsr->zBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.
**
** Segments reported by the break iterator have leading whitespace
** removed; segments that are empty after trimming are skipped. The
** token text returned in *ppToken is utf-8 held in a buffer owned by the
** cursor, which is overwritten by the next call.
**
** Returns SQLITE_OK if a token was produced, SQLITE_DONE when the input
** is exhausted, SQLITE_NOMEM if the token buffer cannot be grown, or
** SQLITE_ERROR if the utf-16 to utf-8 conversion fails.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    /* Skip leading whitespace. aChar[] is utf-16, so it must be read
    ** with U16_NEXT; a utf-8 decoder would truncate each element to its
    ** low byte and misclassify characters such as U+0109 as whitespace. */
    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  /* Convert the token to utf-8. The first pass may only discover the
  ** required size (nByte); if the buffer is too small it is enlarged and
  ** the conversion is repeated. */
  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
    /* A buffer overflow is expected and handled by the retry; any other
    ** failure means the token could not be converted. */
    if( U_FAILURE(status) && status!=U_BUFFER_OVERFLOW_ERROR ){
      return SQLITE_ERROR;
    }
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}

/*
** The set of routines that implement the ICU tokenizer
*/
static const sqlite3_tokenizer_module icuTokenizerModule = {
  0,                           /* iVersion */
  icuCreate,                   /* xCreate  */
  icuDestroy,                  /* xDestroy */
  icuOpen,                     /* xOpen    */
  icuClose,                    /* xClose   */
  icuNext,                     /* xNext    */
};

/*
** Set *ppModule to point at the implementation of the ICU tokenizer.
*/
void sqlite3Fts2IcuTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &icuTokenizerModule;
}

#endif /* defined(SQLITE_ENABLE_ICU) */
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */