Annotation of embedaddon/sqlite3/ext/fts3/fts3_icu.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2007 June 22
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: *************************************************************************
                     12: ** This file implements a tokenizer for fts3 based on the ICU library.
                     13: */
                     14: #include "fts3Int.h"
                     15: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
                     16: #ifdef SQLITE_ENABLE_ICU
                     17: 
                     18: #include <assert.h>
                     19: #include <string.h>
                     20: #include "fts3_tokenizer.h"
                     21: 
                     22: #include <unicode/ubrk.h>
                     23: #include <unicode/ucol.h>
                     24: #include <unicode/ustring.h>
                     25: #include <unicode/utf16.h>
                     26: 
                     27: typedef struct IcuTokenizer IcuTokenizer;
                     28: typedef struct IcuCursor IcuCursor;
                     29: 
                     30: struct IcuTokenizer {
                     31:   sqlite3_tokenizer base;
                     32:   char *zLocale;
                     33: };
                     34: 
                     35: struct IcuCursor {
                     36:   sqlite3_tokenizer_cursor base;
                     37: 
                     38:   UBreakIterator *pIter;      /* ICU break-iterator object */
                     39:   int nChar;                  /* Number of UChar elements in pInput */
                     40:   UChar *aChar;               /* Copy of input using utf-16 encoding */
                     41:   int *aOffset;               /* Offsets of each character in utf-8 input */
                     42: 
                     43:   int nBuffer;
                     44:   char *zBuffer;
                     45: 
                     46:   int iToken;
                     47: };
                     48: 
                     49: /*
                     50: ** Create a new tokenizer instance.
                     51: */
                     52: static int icuCreate(
                     53:   int argc,                            /* Number of entries in argv[] */
                     54:   const char * const *argv,            /* Tokenizer creation arguments */
                     55:   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
                     56: ){
                     57:   IcuTokenizer *p;
                     58:   int n = 0;
                     59: 
                     60:   if( argc>0 ){
                     61:     n = strlen(argv[0])+1;
                     62:   }
                     63:   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
                     64:   if( !p ){
                     65:     return SQLITE_NOMEM;
                     66:   }
                     67:   memset(p, 0, sizeof(IcuTokenizer));
                     68: 
                     69:   if( n ){
                     70:     p->zLocale = (char *)&p[1];
                     71:     memcpy(p->zLocale, argv[0], n);
                     72:   }
                     73: 
                     74:   *ppTokenizer = (sqlite3_tokenizer *)p;
                     75: 
                     76:   return SQLITE_OK;
                     77: }
                     78: 
                     79: /*
                     80: ** Destroy a tokenizer
                     81: */
                     82: static int icuDestroy(sqlite3_tokenizer *pTokenizer){
                     83:   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
                     84:   sqlite3_free(p);
                     85:   return SQLITE_OK;
                     86: }
                     87: 
                     88: /*
                     89: ** Prepare to begin tokenizing a particular string.  The input
                     90: ** string to be tokenized is pInput[0..nBytes-1].  A cursor
                     91: ** used to incrementally tokenize this string is returned in 
                     92: ** *ppCursor.
                     93: */
                     94: static int icuOpen(
                     95:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
                     96:   const char *zInput,                    /* Input string */
                     97:   int nInput,                            /* Length of zInput in bytes */
                     98:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
                     99: ){
                    100:   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
                    101:   IcuCursor *pCsr;
                    102: 
                    103:   const int32_t opt = U_FOLD_CASE_DEFAULT;
                    104:   UErrorCode status = U_ZERO_ERROR;
                    105:   int nChar;
                    106: 
                    107:   UChar32 c;
                    108:   int iInput = 0;
                    109:   int iOut = 0;
                    110: 
                    111:   *ppCursor = 0;
                    112: 
                    113:   if( nInput<0 ){
                    114:     nInput = strlen(zInput);
                    115:   }
                    116:   nChar = nInput+1;
                    117:   pCsr = (IcuCursor *)sqlite3_malloc(
                    118:       sizeof(IcuCursor) +                /* IcuCursor */
                    119:       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
                    120:       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
                    121:   );
                    122:   if( !pCsr ){
                    123:     return SQLITE_NOMEM;
                    124:   }
                    125:   memset(pCsr, 0, sizeof(IcuCursor));
                    126:   pCsr->aChar = (UChar *)&pCsr[1];
                    127:   pCsr->aOffset = (int *)&pCsr->aChar[nChar];
                    128: 
                    129:   pCsr->aOffset[iOut] = iInput;
                    130:   U8_NEXT(zInput, iInput, nInput, c); 
                    131:   while( c>0 ){
                    132:     int isError = 0;
                    133:     c = u_foldCase(c, opt);
                    134:     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
                    135:     if( isError ){
                    136:       sqlite3_free(pCsr);
                    137:       return SQLITE_ERROR;
                    138:     }
                    139:     pCsr->aOffset[iOut] = iInput;
                    140: 
                    141:     if( iInput<nInput ){
                    142:       U8_NEXT(zInput, iInput, nInput, c);
                    143:     }else{
                    144:       c = 0;
                    145:     }
                    146:   }
                    147: 
                    148:   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
                    149:   if( !U_SUCCESS(status) ){
                    150:     sqlite3_free(pCsr);
                    151:     return SQLITE_ERROR;
                    152:   }
                    153:   pCsr->nChar = iOut;
                    154: 
                    155:   ubrk_first(pCsr->pIter);
                    156:   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
                    157:   return SQLITE_OK;
                    158: }
                    159: 
                    160: /*
                    161: ** Close a tokenization cursor previously opened by a call to icuOpen().
                    162: */
                    163: static int icuClose(sqlite3_tokenizer_cursor *pCursor){
                    164:   IcuCursor *pCsr = (IcuCursor *)pCursor;
                    165:   ubrk_close(pCsr->pIter);
                    166:   sqlite3_free(pCsr->zBuffer);
                    167:   sqlite3_free(pCsr);
                    168:   return SQLITE_OK;
                    169: }
                    170: 
                    171: /*
                    172: ** Extract the next token from a tokenization cursor.
                    173: */
                    174: static int icuNext(
                    175:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
                    176:   const char **ppToken,               /* OUT: *ppToken is the token text */
                    177:   int *pnBytes,                       /* OUT: Number of bytes in token */
                    178:   int *piStartOffset,                 /* OUT: Starting offset of token */
                    179:   int *piEndOffset,                   /* OUT: Ending offset of token */
                    180:   int *piPosition                     /* OUT: Position integer of token */
                    181: ){
                    182:   IcuCursor *pCsr = (IcuCursor *)pCursor;
                    183: 
                    184:   int iStart = 0;
                    185:   int iEnd = 0;
                    186:   int nByte = 0;
                    187: 
                    188:   while( iStart==iEnd ){
                    189:     UChar32 c;
                    190: 
                    191:     iStart = ubrk_current(pCsr->pIter);
                    192:     iEnd = ubrk_next(pCsr->pIter);
                    193:     if( iEnd==UBRK_DONE ){
                    194:       return SQLITE_DONE;
                    195:     }
                    196: 
                    197:     while( iStart<iEnd ){
                    198:       int iWhite = iStart;
                    199:       U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
                    200:       if( u_isspace(c) ){
                    201:         iStart = iWhite;
                    202:       }else{
                    203:         break;
                    204:       }
                    205:     }
                    206:     assert(iStart<=iEnd);
                    207:   }
                    208: 
                    209:   do {
                    210:     UErrorCode status = U_ZERO_ERROR;
                    211:     if( nByte ){
                    212:       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
                    213:       if( !zNew ){
                    214:         return SQLITE_NOMEM;
                    215:       }
                    216:       pCsr->zBuffer = zNew;
                    217:       pCsr->nBuffer = nByte;
                    218:     }
                    219: 
                    220:     u_strToUTF8(
                    221:         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
                    222:         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
                    223:         &status                                  /* Output success/failure */
                    224:     );
                    225:   } while( nByte>pCsr->nBuffer );
                    226: 
                    227:   *ppToken = pCsr->zBuffer;
                    228:   *pnBytes = nByte;
                    229:   *piStartOffset = pCsr->aOffset[iStart];
                    230:   *piEndOffset = pCsr->aOffset[iEnd];
                    231:   *piPosition = pCsr->iToken++;
                    232: 
                    233:   return SQLITE_OK;
                    234: }
                    235: 
                    236: /*
                    237: ** The set of routines that implement the simple tokenizer
                    238: */
                    239: static const sqlite3_tokenizer_module icuTokenizerModule = {
                    240:   0,                           /* iVersion */
                    241:   icuCreate,                   /* xCreate  */
                    242:   icuDestroy,                  /* xCreate  */
                    243:   icuOpen,                     /* xOpen    */
                    244:   icuClose,                    /* xClose   */
                    245:   icuNext,                     /* xNext    */
                    246: };
                    247: 
                    248: /*
                    249: ** Set *ppModule to point at the implementation of the ICU tokenizer.
                    250: */
                    251: void sqlite3Fts3IcuTokenizerModule(
                    252:   sqlite3_tokenizer_module const**ppModule
                    253: ){
                    254:   *ppModule = &icuTokenizerModule;
                    255: }
                    256: 
                    257: #endif /* defined(SQLITE_ENABLE_ICU) */
                    258: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>