Annotation of embedaddon/sqlite3/ext/fts2/fts2_porter.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2006 September 30
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: *************************************************************************
! 12: ** Implementation of the full-text-search tokenizer that implements
! 13: ** a Porter stemmer.
! 14: */
! 15:
! 16: /*
! 17: ** The code in this file is only compiled if:
! 18: **
! 19: ** * The FTS2 module is being built as an extension
! 20: ** (in which case SQLITE_CORE is not defined), or
! 21: **
! 22: ** * The FTS2 module is being built into the core of
! 23: ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
! 24: */
! 25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
! 26:
! 27:
! 28: #include <assert.h>
! 29: #include <stdlib.h>
! 30: #include <stdio.h>
! 31: #include <string.h>
! 32:
! 33: #include "fts2_tokenizer.h"
! 34:
! 35: /*
! 36: ** Class derived from sqlite3_tokenizer
! 37: */
! 38: typedef struct porter_tokenizer {
! 39: sqlite3_tokenizer base; /* Base class */
! 40: } porter_tokenizer;
! 41:
! 42: /*
! 43: ** Class derived from sqlit3_tokenizer_cursor
! 44: */
! 45: typedef struct porter_tokenizer_cursor {
! 46: sqlite3_tokenizer_cursor base;
! 47: const char *zInput; /* input we are tokenizing */
! 48: int nInput; /* size of the input */
! 49: int iOffset; /* current position in zInput */
! 50: int iToken; /* index of next token to be returned */
! 51: char *zToken; /* storage for current token */
! 52: int nAllocated; /* space allocated to zToken buffer */
! 53: } porter_tokenizer_cursor;
! 54:
! 55:
! 56: /* Forward declaration */
! 57: static const sqlite3_tokenizer_module porterTokenizerModule;
! 58:
! 59:
! 60: /*
! 61: ** Create a new tokenizer instance.
! 62: */
! 63: static int porterCreate(
! 64: int argc, const char * const *argv,
! 65: sqlite3_tokenizer **ppTokenizer
! 66: ){
! 67: porter_tokenizer *t;
! 68: t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
! 69: if( t==NULL ) return SQLITE_NOMEM;
! 70: memset(t, 0, sizeof(*t));
! 71: *ppTokenizer = &t->base;
! 72: return SQLITE_OK;
! 73: }
! 74:
! 75: /*
! 76: ** Destroy a tokenizer
! 77: */
! 78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
! 79: sqlite3_free(pTokenizer);
! 80: return SQLITE_OK;
! 81: }
! 82:
! 83: /*
! 84: ** Prepare to begin tokenizing a particular string. The input
! 85: ** string to be tokenized is zInput[0..nInput-1]. A cursor
! 86: ** used to incrementally tokenize this string is returned in
! 87: ** *ppCursor.
! 88: */
! 89: static int porterOpen(
! 90: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 91: const char *zInput, int nInput, /* String to be tokenized */
! 92: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 93: ){
! 94: porter_tokenizer_cursor *c;
! 95:
! 96: c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
! 97: if( c==NULL ) return SQLITE_NOMEM;
! 98:
! 99: c->zInput = zInput;
! 100: if( zInput==0 ){
! 101: c->nInput = 0;
! 102: }else if( nInput<0 ){
! 103: c->nInput = (int)strlen(zInput);
! 104: }else{
! 105: c->nInput = nInput;
! 106: }
! 107: c->iOffset = 0; /* start tokenizing at the beginning */
! 108: c->iToken = 0;
! 109: c->zToken = NULL; /* no space allocated, yet. */
! 110: c->nAllocated = 0;
! 111:
! 112: *ppCursor = &c->base;
! 113: return SQLITE_OK;
! 114: }
! 115:
! 116: /*
! 117: ** Close a tokenization cursor previously opened by a call to
! 118: ** porterOpen() above.
! 119: */
! 120: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
! 121: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 122: sqlite3_free(c->zToken);
! 123: sqlite3_free(c);
! 124: return SQLITE_OK;
! 125: }
/*
** Letter classification table, indexed by (letter - 'a'):
**
**     0   always a vowel        (a, e, i, o, u)
**     1   always a consonant
**     2   context dependent     (y)
*/
static const char cType[] = {
  /* a  b  c  d  e  f  g  h  i  j  k  l  m */
     0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
  /* n  o  p  q  r  s  t  u  v  w  x  y  z */
     1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() report whether the first character of
** the string z is a consonant or a vowel according to the Porter
** rules.  Both return 0 when z is the empty string, and both require
** z[0] to be a lower-case ASCII letter otherwise (checked by assert).
**
** A consonant is any letter other than 'a', 'e', 'i', 'o' or 'u',
** with a special case for 'y': in a word written in normal order,
** 'y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** The words handled here are stored in REVERSE order, so the letter
** that precedes 'y' in the word is z[1].  Hence 'y' is a consonant
** when it is the last stored character (z[1]==0) or when z[1] is a
** vowel, and it is a vowel when z[1] is a consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  char ch = z[0];
  int kind;
  if( ch=='\0' ) return 0;
  assert( ch>='a' && ch<='z' );
  kind = cType[ch-'a'];
  if( kind==2 ){
    /* 'y': decide by looking at the neighbouring letter */
    if( z[1]=='\0' ) return 1;
    return isVowel(&z[1]);
  }
  return kind;
}
static int isVowel(const char *z){
  char ch = z[0];
  int kind;
  if( ch=='\0' ) return 0;
  assert( ch>='a' && ch<='z' );
  kind = cType[ch-'a'];
  if( kind==2 ){
    /* 'y': a vowel exactly when the neighbouring letter is a consonant */
    return isConsonant(&z[1]);
  }
  return 1-kind;
}
! 166:
/*
** Write V for a run of one or more vowels and C for a run of one or
** more consonants.  Every word then has the shape
**
**           [C] (VC){m} [V]
**
** that is: an optional leading consonant run, then zero or more
** vowel-run/consonant-run pairs, then an optional trailing vowel run.
** "m" is the number of VC pairs and is called the measure of the word.
**
** Return true if the measure of z is 1 or more, i.e. if the word
** contains at least one vowel that is followed by a consonant.
**
** z[] is stored in reverse order, so the scan below is really looking
** for a consonant that is followed (in z[]) by a vowel.
*/
static int m_gt_0(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}          /* skip the optional trailing [V] */
  if( p[0]=='\0' ) return 0;
  for(; isConsonant(p); p++){}      /* the C of a VC pair */
  return p[0]!='\0';                /* anything left must start with its V */
}
! 192:
/*
** Like m_gt_0() above, except that the test is for a measure of
** exactly 1: the (reversed) word must contain one VC pair and no more.
*/
static int m_eq_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}          /* optional trailing [V] */
  if( p[0]=='\0' ) return 0;
  for(; isConsonant(p); p++){}      /* C of the only VC pair */
  if( p[0]=='\0' ) return 0;        /* no vowel in front of it: m==0 */
  for(; isVowel(p); p++){}          /* V of the only VC pair */
  if( p[0]=='\0' ) return 1;        /* word has no leading [C] */
  for(; isConsonant(p); p++){}      /* optional leading [C] */
  return p[0]=='\0';                /* more text would mean m>1 */
}
! 206:
/*
** Like m_gt_0() above, except that the test is for a measure greater
** than 1 instead of greater than 0: the (reversed) word must contain
** at least two VC pairs.
*/
static int m_gt_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}          /* optional trailing [V] */
  if( p[0]=='\0' ) return 0;
  for(; isConsonant(p); p++){}      /* C of the last VC pair */
  if( p[0]=='\0' ) return 0;
  for(; isVowel(p); p++){}          /* V of the last VC pair */
  if( p[0]=='\0' ) return 0;
  for(; isConsonant(p); p++){}      /* C of the pair before it */
  return p[0]!='\0';                /* ...which needs a V to count */
}
! 220:
/*
** Return TRUE if there is a vowel anywhere within the nul-terminated
** (reversed) word z.  Consonants are skipped from the front; if the
** scan stops before the terminator it stopped on a vowel.
*/
static int hasVowel(const char *z){
  const char *p;
  for(p=z; isConsonant(p); p++){}
  return p[0]!='\0';
}
! 228:
/*
** Return TRUE if the word ends in a double consonant, meaning the same
** consonant letter twice in a row.
**
** The text is stored reversed, so the two letters examined are in fact
** z[0] and z[1].
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[1]!=z[0] ) return 0;
  return isConsonant(&z[1]);
}
! 238:
/*
** Return TRUE if the word ends with three letters that are
** consonant-vowel-consonant and the final consonant is not 'w', 'x'
** or 'y'.
**
** The word is stored reversed, so the three letters tested are
** z[0], z[1] and z[2], and it is z[0] that must not be in [wxy].
** isConsonant() and isVowel() both return 0 at the nul terminator,
** so a word shorter than three letters never matches.
*/
static int star_oh(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( !isVowel(z+1) ) return 0;
  return isConsonant(z+2);
}
! 254:
/*
** Conditionally replace a word ending.
**
** If the word *pz ends with zFrom, and xCond (when not NULL) returns
** true for the part of the word that precedes that ending, the ending
** is replaced by zTo and *pz is updated to point at the new start of
** the reversed word.
**
** *pz and zFrom are both in reverse order; zTo is in normal order.
** The replacement is written into the bytes immediately in front of
** the remaining stem, so the caller's buffer must have room there
** whenever zTo is longer than zFrom.
**
** Return TRUE if zFrom matches and FALSE if it does not.  Note that
** TRUE is returned even when xCond() fails and nothing is replaced.
*/
static int stem(
  char **pz,                   /* The word being stemmed (Reversed) */
  const char *zFrom,           /* If the ending matches this... (Reversed) */
  const char *zTo,             /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)    /* Condition that must be true */
){
  char *zWord = *pz;
  const char *zSuffix;
  const char *zRepl;

  /* Every byte of zFrom must match the front of the reversed word */
  for(zSuffix=zFrom; *zSuffix!=0; zSuffix++, zWord++){
    if( *zSuffix!=*zWord ) return 0;
  }

  /* zWord now points at the stem that precedes the ending */
  if( xCond!=0 && xCond(zWord)==0 ) return 1;

  /* Lay the replacement down backwards in front of the stem */
  for(zRepl=zTo; *zRepl!=0; zRepl++){
    zWord--;
    *zWord = *zRepl;
  }
  *pz = zWord;
  return 1;
}
! 283:
/*
** Fallback stemmer used when the Porter stemmer is inappropriate.
**
** The input word zIn[0..nIn-1] is copied into zOut with US-ASCII case
** folding ([A-Z] becomes [a-z]; all other bytes are copied unchanged).
** If the word is too long it is shortened by keeping only its head
** and its tail:
**
**    * a word containing a digit keeps 3 bytes from each end when it
**      is longer than 6 bytes;
**    * any other word keeps 10 bytes from each end when it is longer
**      than 20 bytes.
**
** zOut must have room for nIn bytes plus a nul terminator.  The result
** is nul-terminated and its length (excluding the terminator) is
** written to *pnOut.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int k;
  int nOut;                    /* Length of the result */
  int nKeep;                   /* Bytes kept from each end of a long word */
  int sawDigit = 0;

  /* Case-fold the whole word into zOut, noting any digit on the way */
  for(k=0; k<nIn; k++){
    int ch = zIn[k];
    if( ch>='0' && ch<='9' ){
      sawDigit = 1;
    }else if( ch>='A' && ch<='Z' ){
      ch += 'a' - 'A';
    }
    zOut[k] = ch;
  }
  nOut = k;

  /* Squeeze out the middle of an over-long word */
  nKeep = sawDigit ? 3 : 10;
  if( nIn>2*nKeep ){
    memmove(&zOut[nKeep], &zOut[nIn-nKeep], (size_t)nKeep);
    nOut = 2*nKeep;
  }

  zOut[nOut] = 0;
  *pnOut = nOut;
}
! 314:
! 315:
! 316: /*
! 317: ** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
! 318: ** zOut is at least big enough to hold nIn bytes. Write the actual
! 319: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
! 320: **
! 321: ** Any upper-case characters in the US-ASCII character set ([A-Z])
! 322: ** are converted to lower case. Upper-case UTF characters are
! 323: ** unchanged.
! 324: **
! 325: ** Words that are longer than about 20 bytes are stemmed by retaining
! 326: ** a few bytes from the beginning and the end of the word. If the
! 327: ** word contains digits, 3 bytes are taken from the beginning and
! 328: ** 3 bytes from the end. For long words without digits, 10 bytes
! 329: ** are taken from each end. US-ASCII case folding still applies.
! 330: **
! 331: ** If the input word contains not digits but does characters not
! 332: ** in [a-zA-Z] then no stemming is attempted and this routine just
! 333: ** copies the input into the input into the output with US-ASCII
! 334: ** case folding.
! 335: **
! 336: ** Stemming never increases the length of the word. So there is
! 337: ** no chance of overflowing the zOut buffer.
! 338: */
! 339: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
! 340: int i, j, c;
! 341: char zReverse[28];
! 342: char *z, *z2;
! 343: if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
! 344: /* The word is too big or too small for the porter stemmer.
! 345: ** Fallback to the copy stemmer */
! 346: copy_stemmer(zIn, nIn, zOut, pnOut);
! 347: return;
! 348: }
! 349: for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
! 350: c = zIn[i];
! 351: if( c>='A' && c<='Z' ){
! 352: zReverse[j] = c + 'a' - 'A';
! 353: }else if( c>='a' && c<='z' ){
! 354: zReverse[j] = c;
! 355: }else{
! 356: /* The use of a character not in [a-zA-Z] means that we fallback
! 357: ** to the copy stemmer */
! 358: copy_stemmer(zIn, nIn, zOut, pnOut);
! 359: return;
! 360: }
! 361: }
! 362: memset(&zReverse[sizeof(zReverse)-5], 0, 5);
! 363: z = &zReverse[j+1];
! 364:
! 365:
! 366: /* Step 1a */
! 367: if( z[0]=='s' ){
! 368: if(
! 369: !stem(&z, "sess", "ss", 0) &&
! 370: !stem(&z, "sei", "i", 0) &&
! 371: !stem(&z, "ss", "ss", 0)
! 372: ){
! 373: z++;
! 374: }
! 375: }
! 376:
! 377: /* Step 1b */
! 378: z2 = z;
! 379: if( stem(&z, "dee", "ee", m_gt_0) ){
! 380: /* Do nothing. The work was all in the test */
! 381: }else if(
! 382: (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
! 383: && z!=z2
! 384: ){
! 385: if( stem(&z, "ta", "ate", 0) ||
! 386: stem(&z, "lb", "ble", 0) ||
! 387: stem(&z, "zi", "ize", 0) ){
! 388: /* Do nothing. The work was all in the test */
! 389: }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
! 390: z++;
! 391: }else if( m_eq_1(z) && star_oh(z) ){
! 392: *(--z) = 'e';
! 393: }
! 394: }
! 395:
! 396: /* Step 1c */
! 397: if( z[0]=='y' && hasVowel(z+1) ){
! 398: z[0] = 'i';
! 399: }
! 400:
! 401: /* Step 2 */
! 402: switch( z[1] ){
! 403: case 'a':
! 404: stem(&z, "lanoita", "ate", m_gt_0) ||
! 405: stem(&z, "lanoit", "tion", m_gt_0);
! 406: break;
! 407: case 'c':
! 408: stem(&z, "icne", "ence", m_gt_0) ||
! 409: stem(&z, "icna", "ance", m_gt_0);
! 410: break;
! 411: case 'e':
! 412: stem(&z, "rezi", "ize", m_gt_0);
! 413: break;
! 414: case 'g':
! 415: stem(&z, "igol", "log", m_gt_0);
! 416: break;
! 417: case 'l':
! 418: stem(&z, "ilb", "ble", m_gt_0) ||
! 419: stem(&z, "illa", "al", m_gt_0) ||
! 420: stem(&z, "iltne", "ent", m_gt_0) ||
! 421: stem(&z, "ile", "e", m_gt_0) ||
! 422: stem(&z, "ilsuo", "ous", m_gt_0);
! 423: break;
! 424: case 'o':
! 425: stem(&z, "noitazi", "ize", m_gt_0) ||
! 426: stem(&z, "noita", "ate", m_gt_0) ||
! 427: stem(&z, "rota", "ate", m_gt_0);
! 428: break;
! 429: case 's':
! 430: stem(&z, "msila", "al", m_gt_0) ||
! 431: stem(&z, "ssenevi", "ive", m_gt_0) ||
! 432: stem(&z, "ssenluf", "ful", m_gt_0) ||
! 433: stem(&z, "ssensuo", "ous", m_gt_0);
! 434: break;
! 435: case 't':
! 436: stem(&z, "itila", "al", m_gt_0) ||
! 437: stem(&z, "itivi", "ive", m_gt_0) ||
! 438: stem(&z, "itilib", "ble", m_gt_0);
! 439: break;
! 440: }
! 441:
! 442: /* Step 3 */
! 443: switch( z[0] ){
! 444: case 'e':
! 445: stem(&z, "etaci", "ic", m_gt_0) ||
! 446: stem(&z, "evita", "", m_gt_0) ||
! 447: stem(&z, "ezila", "al", m_gt_0);
! 448: break;
! 449: case 'i':
! 450: stem(&z, "itici", "ic", m_gt_0);
! 451: break;
! 452: case 'l':
! 453: stem(&z, "laci", "ic", m_gt_0) ||
! 454: stem(&z, "luf", "", m_gt_0);
! 455: break;
! 456: case 's':
! 457: stem(&z, "ssen", "", m_gt_0);
! 458: break;
! 459: }
! 460:
! 461: /* Step 4 */
! 462: switch( z[1] ){
! 463: case 'a':
! 464: if( z[0]=='l' && m_gt_1(z+2) ){
! 465: z += 2;
! 466: }
! 467: break;
! 468: case 'c':
! 469: if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
! 470: z += 4;
! 471: }
! 472: break;
! 473: case 'e':
! 474: if( z[0]=='r' && m_gt_1(z+2) ){
! 475: z += 2;
! 476: }
! 477: break;
! 478: case 'i':
! 479: if( z[0]=='c' && m_gt_1(z+2) ){
! 480: z += 2;
! 481: }
! 482: break;
! 483: case 'l':
! 484: if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
! 485: z += 4;
! 486: }
! 487: break;
! 488: case 'n':
! 489: if( z[0]=='t' ){
! 490: if( z[2]=='a' ){
! 491: if( m_gt_1(z+3) ){
! 492: z += 3;
! 493: }
! 494: }else if( z[2]=='e' ){
! 495: stem(&z, "tneme", "", m_gt_1) ||
! 496: stem(&z, "tnem", "", m_gt_1) ||
! 497: stem(&z, "tne", "", m_gt_1);
! 498: }
! 499: }
! 500: break;
! 501: case 'o':
! 502: if( z[0]=='u' ){
! 503: if( m_gt_1(z+2) ){
! 504: z += 2;
! 505: }
! 506: }else if( z[3]=='s' || z[3]=='t' ){
! 507: stem(&z, "noi", "", m_gt_1);
! 508: }
! 509: break;
! 510: case 's':
! 511: if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
! 512: z += 3;
! 513: }
! 514: break;
! 515: case 't':
! 516: stem(&z, "eta", "", m_gt_1) ||
! 517: stem(&z, "iti", "", m_gt_1);
! 518: break;
! 519: case 'u':
! 520: if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
! 521: z += 3;
! 522: }
! 523: break;
! 524: case 'v':
! 525: case 'z':
! 526: if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
! 527: z += 3;
! 528: }
! 529: break;
! 530: }
! 531:
! 532: /* Step 5a */
! 533: if( z[0]=='e' ){
! 534: if( m_gt_1(z+1) ){
! 535: z++;
! 536: }else if( m_eq_1(z+1) && !star_oh(z+1) ){
! 537: z++;
! 538: }
! 539: }
! 540:
! 541: /* Step 5b */
! 542: if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
! 543: z++;
! 544: }
! 545:
! 546: /* z[] is now the stemmed word in reverse order. Flip it back
! 547: ** around into forward order and return.
! 548: */
! 549: *pnOut = i = strlen(z);
! 550: zOut[i] = 0;
! 551: while( *z ){
! 552: zOut[--i] = *(z++);
! 553: }
! 554: }
! 555:
/*
** Characters that can be part of a token.  Any byte with the high bit
** set (part of a multi-byte UTF character) is always a token
** character, so every delimiter has a value of 0x7f or lower.
**
** porterIdChar[] covers the byte values 0x30 through 0x7f and is
** indexed by (byte - 0x30).  A 1 marks a token character: the digits,
** the upper and lower case letters, and the underscore.  Bytes below
** 0x30 are not in the table and are always delimiters.
**
** isDelim(C) evaluates to true if C is a delimiter.  It stores C in a
** variable named "ch", which the caller must have declared as an int.
*/
static const char porterIdChar[] = {
  /* 0x30 - 0x3f: digits 0-9, then six punctuation characters */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  0, 0, 0, 0, 0, 0,
  /* 0x40 - 0x4f: '@', then A-O */
  0,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5f: P-Z, four punctuation characters, then '_' */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  0, 0, 0, 0,  1,
  /* 0x60 - 0x6f: backquote, then a-o */
  0,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7f: p-z, four punctuation characters, then DEL */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  0, 0, 0, 0, 0,
};
#define isDelim(C) (((ch=(C))&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
! 571:
! 572: /*
! 573: ** Extract the next token from a tokenization cursor. The cursor must
! 574: ** have been opened by a prior call to porterOpen().
! 575: */
! 576: static int porterNext(
! 577: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
! 578: const char **pzToken, /* OUT: *pzToken is the token text */
! 579: int *pnBytes, /* OUT: Number of bytes in token */
! 580: int *piStartOffset, /* OUT: Starting offset of token */
! 581: int *piEndOffset, /* OUT: Ending offset of token */
! 582: int *piPosition /* OUT: Position integer of token */
! 583: ){
! 584: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 585: const char *z = c->zInput;
! 586:
! 587: while( c->iOffset<c->nInput ){
! 588: int iStartOffset, ch;
! 589:
! 590: /* Scan past delimiter characters */
! 591: while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
! 592: c->iOffset++;
! 593: }
! 594:
! 595: /* Count non-delimiter characters. */
! 596: iStartOffset = c->iOffset;
! 597: while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
! 598: c->iOffset++;
! 599: }
! 600:
! 601: if( c->iOffset>iStartOffset ){
! 602: int n = c->iOffset-iStartOffset;
! 603: if( n>c->nAllocated ){
! 604: c->nAllocated = n+20;
! 605: c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
! 606: if( c->zToken==NULL ) return SQLITE_NOMEM;
! 607: }
! 608: porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
! 609: *pzToken = c->zToken;
! 610: *piStartOffset = iStartOffset;
! 611: *piEndOffset = c->iOffset;
! 612: *piPosition = c->iToken++;
! 613: return SQLITE_OK;
! 614: }
! 615: }
! 616: return SQLITE_DONE;
! 617: }
! 618:
! 619: /*
! 620: ** The set of routines that implement the porter-stemmer tokenizer
! 621: */
! 622: static const sqlite3_tokenizer_module porterTokenizerModule = {
! 623: 0,
! 624: porterCreate,
! 625: porterDestroy,
! 626: porterOpen,
! 627: porterClose,
! 628: porterNext,
! 629: };
! 630:
! 631: /*
! 632: ** Allocate a new porter tokenizer. Return a pointer to the new
! 633: ** tokenizer in *ppModule
! 634: */
! 635: void sqlite3Fts2PorterTokenizerModule(
! 636: sqlite3_tokenizer_module const**ppModule
! 637: ){
! 638: *ppModule = &porterTokenizerModule;
! 639: }
! 640:
! 641: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>