Return to fts2_porter.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts2 |
1.1 ! misho 1: /* ! 2: ** 2006 September 30 ! 3: ** ! 4: ** The author disclaims copyright to this source code. In place of ! 5: ** a legal notice, here is a blessing: ! 6: ** ! 7: ** May you do good and not evil. ! 8: ** May you find forgiveness for yourself and forgive others. ! 9: ** May you share freely, never taking more than you give. ! 10: ** ! 11: ************************************************************************* ! 12: ** Implementation of the full-text-search tokenizer that implements ! 13: ** a Porter stemmer. ! 14: */ ! 15: ! 16: /* ! 17: ** The code in this file is only compiled if: ! 18: ** ! 19: ** * The FTS2 module is being built as an extension ! 20: ** (in which case SQLITE_CORE is not defined), or ! 21: ** ! 22: ** * The FTS2 module is being built into the core of ! 23: ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). ! 24: */ ! 25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) ! 26: ! 27: ! 28: #include <assert.h> ! 29: #include <stdlib.h> ! 30: #include <stdio.h> ! 31: #include <string.h> ! 32: ! 33: #include "fts2_tokenizer.h" ! 34: ! 35: /* ! 36: ** Class derived from sqlite3_tokenizer ! 37: */ ! 38: typedef struct porter_tokenizer { ! 39: sqlite3_tokenizer base; /* Base class */ ! 40: } porter_tokenizer; ! 41: ! 42: /* ! 43: ** Class derived from sqlit3_tokenizer_cursor ! 44: */ ! 45: typedef struct porter_tokenizer_cursor { ! 46: sqlite3_tokenizer_cursor base; ! 47: const char *zInput; /* input we are tokenizing */ ! 48: int nInput; /* size of the input */ ! 49: int iOffset; /* current position in zInput */ ! 50: int iToken; /* index of next token to be returned */ ! 51: char *zToken; /* storage for current token */ ! 52: int nAllocated; /* space allocated to zToken buffer */ ! 53: } porter_tokenizer_cursor; ! 54: ! 55: ! 56: /* Forward declaration */ ! 57: static const sqlite3_tokenizer_module porterTokenizerModule; ! 58: ! 59: ! 60: /* ! 61: ** Create a new tokenizer instance. ! 62: */ ! 63: static int porterCreate( ! 64: int argc, const char * const *argv, ! 65: sqlite3_tokenizer **ppTokenizer ! 66: ){ ! 67: porter_tokenizer *t; ! 68: t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); ! 69: if( t==NULL ) return SQLITE_NOMEM; ! 70: memset(t, 0, sizeof(*t)); ! 71: *ppTokenizer = &t->base; ! 72: return SQLITE_OK; ! 73: } ! 74: ! 75: /* ! 76: ** Destroy a tokenizer ! 77: */ ! 78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){ ! 79: sqlite3_free(pTokenizer); ! 80: return SQLITE_OK; ! 81: } ! 82: ! 83: /* ! 84: ** Prepare to begin tokenizing a particular string. The input ! 85: ** string to be tokenized is zInput[0..nInput-1]. A cursor ! 86: ** used to incrementally tokenize this string is returned in ! 87: ** *ppCursor. ! 88: */ ! 89: static int porterOpen( ! 90: sqlite3_tokenizer *pTokenizer, /* The tokenizer */ ! 91: const char *zInput, int nInput, /* String to be tokenized */ ! 92: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ ! 93: ){ ! 94: porter_tokenizer_cursor *c; ! 95: ! 96: c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); ! 97: if( c==NULL ) return SQLITE_NOMEM; ! 98: ! 99: c->zInput = zInput; ! 100: if( zInput==0 ){ ! 101: c->nInput = 0; ! 102: }else if( nInput<0 ){ ! 103: c->nInput = (int)strlen(zInput); ! 104: }else{ ! 105: c->nInput = nInput; ! 106: } ! 107: c->iOffset = 0; /* start tokenizing at the beginning */ ! 108: c->iToken = 0; ! 109: c->zToken = NULL; /* no space allocated, yet. */ ! 110: c->nAllocated = 0; ! 111: ! 112: *ppCursor = &c->base; ! 113: return SQLITE_OK; ! 114: } ! 115: ! 116: /* ! 117: ** Close a tokenization cursor previously opened by a call to ! 118: ** porterOpen() above. ! 119: */ ! 120: static int porterClose(sqlite3_tokenizer_cursor *pCursor){ ! 121: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; ! 122: sqlite3_free(c->zToken); ! 123: sqlite3_free(c); ! 124: return SQLITE_OK; ! 125: } ! 126: /* ! 127: ** Vowel or consonant ! 128: */ ! 129: static const char cType[] = { ! 130: 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, ! 131: 1, 1, 1, 2, 1 ! 132: }; ! 133: ! 134: /* ! 135: ** isConsonant() and isVowel() determine if their first character in ! 136: ** the string they point to is a consonant or a vowel, according ! 137: ** to Porter ruls. ! 138: ** ! 139: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. ! 140: ** 'Y' is a consonant unless it follows another consonant, ! 141: ** in which case it is a vowel. ! 142: ** ! 143: ** In these routine, the letters are in reverse order. So the 'y' rule ! 144: ** is that 'y' is a consonant unless it is followed by another ! 145: ** consonent. ! 146: */ ! 147: static int isVowel(const char*); ! 148: static int isConsonant(const char *z){ ! 149: int j; ! 150: char x = *z; ! 151: if( x==0 ) return 0; ! 152: assert( x>='a' && x<='z' ); ! 153: j = cType[x-'a']; ! 154: if( j<2 ) return j; ! 155: return z[1]==0 || isVowel(z + 1); ! 156: } ! 157: static int isVowel(const char *z){ ! 158: int j; ! 159: char x = *z; ! 160: if( x==0 ) return 0; ! 161: assert( x>='a' && x<='z' ); ! 162: j = cType[x-'a']; ! 163: if( j<2 ) return 1-j; ! 164: return isConsonant(z + 1); ! 165: } ! 166: ! 167: /* ! 168: ** Let any sequence of one or more vowels be represented by V and let ! 169: ** C be sequence of one or more consonants. Then every word can be ! 170: ** represented as: ! 171: ** ! 172: ** [C] (VC){m} [V] ! 173: ** ! 174: ** In prose: A word is an optional consonant followed by zero or ! 175: ** vowel-consonant pairs followed by an optional vowel. "m" is the ! 176: ** number of vowel consonant pairs. This routine computes the value ! 177: ** of m for the first i bytes of a word. ! 178: ** ! 179: ** Return true if the m-value for z is 1 or more. In other words, ! 180: ** return true if z contains at least one vowel that is followed ! 181: ** by a consonant. ! 182: ** ! 183: ** In this routine z[] is in reverse order. So we are really looking ! 184: ** for an instance of of a consonant followed by a vowel. ! 185: */ ! 186: static int m_gt_0(const char *z){ ! 187: while( isVowel(z) ){ z++; } ! 188: if( *z==0 ) return 0; ! 189: while( isConsonant(z) ){ z++; } ! 190: return *z!=0; ! 191: } ! 192: ! 193: /* Like mgt0 above except we are looking for a value of m which is ! 194: ** exactly 1 ! 195: */ ! 196: static int m_eq_1(const char *z){ ! 197: while( isVowel(z) ){ z++; } ! 198: if( *z==0 ) return 0; ! 199: while( isConsonant(z) ){ z++; } ! 200: if( *z==0 ) return 0; ! 201: while( isVowel(z) ){ z++; } ! 202: if( *z==0 ) return 1; ! 203: while( isConsonant(z) ){ z++; } ! 204: return *z==0; ! 205: } ! 206: ! 207: /* Like mgt0 above except we are looking for a value of m>1 instead ! 208: ** or m>0 ! 209: */ ! 210: static int m_gt_1(const char *z){ ! 211: while( isVowel(z) ){ z++; } ! 212: if( *z==0 ) return 0; ! 213: while( isConsonant(z) ){ z++; } ! 214: if( *z==0 ) return 0; ! 215: while( isVowel(z) ){ z++; } ! 216: if( *z==0 ) return 0; ! 217: while( isConsonant(z) ){ z++; } ! 218: return *z!=0; ! 219: } ! 220: ! 221: /* ! 222: ** Return TRUE if there is a vowel anywhere within z[0..n-1] ! 223: */ ! 224: static int hasVowel(const char *z){ ! 225: while( isConsonant(z) ){ z++; } ! 226: return *z!=0; ! 227: } ! 228: ! 229: /* ! 230: ** Return TRUE if the word ends in a double consonant. ! 231: ** ! 232: ** The text is reversed here. So we are really looking at ! 233: ** the first two characters of z[]. ! 234: */ ! 235: static int doubleConsonant(const char *z){ ! 236: return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); ! 237: } ! 238: ! 239: /* ! 240: ** Return TRUE if the word ends with three letters which ! 241: ** are consonant-vowel-consonent and where the final consonant ! 242: ** is not 'w', 'x', or 'y'. ! 243: ** ! 244: ** The word is reversed here. So we are really checking the ! 245: ** first three letters and the first one cannot be in [wxy]. ! 246: */ ! 247: static int star_oh(const char *z){ ! 248: return ! 249: z[0]!=0 && isConsonant(z) && ! 250: z[0]!='w' && z[0]!='x' && z[0]!='y' && ! 251: z[1]!=0 && isVowel(z+1) && ! 252: z[2]!=0 && isConsonant(z+2); ! 253: } ! 254: ! 255: /* ! 256: ** If the word ends with zFrom and xCond() is true for the stem ! 257: ** of the word that preceeds the zFrom ending, then change the ! 258: ** ending to zTo. ! 259: ** ! 260: ** The input word *pz and zFrom are both in reverse order. zTo ! 261: ** is in normal order. ! 262: ** ! 263: ** Return TRUE if zFrom matches. Return FALSE if zFrom does not ! 264: ** match. Not that TRUE is returned even if xCond() fails and ! 265: ** no substitution occurs. ! 266: */ ! 267: static int stem( ! 268: char **pz, /* The word being stemmed (Reversed) */ ! 269: const char *zFrom, /* If the ending matches this... (Reversed) */ ! 270: const char *zTo, /* ... change the ending to this (not reversed) */ ! 271: int (*xCond)(const char*) /* Condition that must be true */ ! 272: ){ ! 273: char *z = *pz; ! 274: while( *zFrom && *zFrom==*z ){ z++; zFrom++; } ! 275: if( *zFrom!=0 ) return 0; ! 276: if( xCond && !xCond(z) ) return 1; ! 277: while( *zTo ){ ! 278: *(--z) = *(zTo++); ! 279: } ! 280: *pz = z; ! 281: return 1; ! 282: } ! 283: ! 284: /* ! 285: ** This is the fallback stemmer used when the porter stemmer is ! 286: ** inappropriate. The input word is copied into the output with ! 287: ** US-ASCII case folding. If the input word is too long (more ! 288: ** than 20 bytes if it contains no digits or more than 6 bytes if ! 289: ** it contains digits) then word is truncated to 20 or 6 bytes ! 290: ** by taking 10 or 3 bytes from the beginning and end. ! 291: */ ! 292: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ ! 293: int i, mx, j; ! 294: int hasDigit = 0; ! 295: for(i=0; i<nIn; i++){ ! 296: int c = zIn[i]; ! 297: if( c>='A' && c<='Z' ){ ! 298: zOut[i] = c - 'A' + 'a'; ! 299: }else{ ! 300: if( c>='0' && c<='9' ) hasDigit = 1; ! 301: zOut[i] = c; ! 302: } ! 303: } ! 304: mx = hasDigit ? 3 : 10; ! 305: if( nIn>mx*2 ){ ! 306: for(j=mx, i=nIn-mx; i<nIn; i++, j++){ ! 307: zOut[j] = zOut[i]; ! 308: } ! 309: i = j; ! 310: } ! 311: zOut[i] = 0; ! 312: *pnOut = i; ! 313: } ! 314: ! 315: ! 316: /* ! 317: ** Stem the input word zIn[0..nIn-1]. Store the output in zOut. ! 318: ** zOut is at least big enough to hold nIn bytes. Write the actual ! 319: ** size of the output word (exclusive of the '\0' terminator) into *pnOut. ! 320: ** ! 321: ** Any upper-case characters in the US-ASCII character set ([A-Z]) ! 322: ** are converted to lower case. Upper-case UTF characters are ! 323: ** unchanged. ! 324: ** ! 325: ** Words that are longer than about 20 bytes are stemmed by retaining ! 326: ** a few bytes from the beginning and the end of the word. If the ! 327: ** word contains digits, 3 bytes are taken from the beginning and ! 328: ** 3 bytes from the end. For long words without digits, 10 bytes ! 329: ** are taken from each end. US-ASCII case folding still applies. ! 330: ** ! 331: ** If the input word contains not digits but does characters not ! 332: ** in [a-zA-Z] then no stemming is attempted and this routine just ! 333: ** copies the input into the input into the output with US-ASCII ! 334: ** case folding. ! 335: ** ! 336: ** Stemming never increases the length of the word. So there is ! 337: ** no chance of overflowing the zOut buffer. ! 338: */ ! 339: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ ! 340: int i, j, c; ! 341: char zReverse[28]; ! 342: char *z, *z2; ! 343: if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ ! 344: /* The word is too big or too small for the porter stemmer. ! 345: ** Fallback to the copy stemmer */ ! 346: copy_stemmer(zIn, nIn, zOut, pnOut); ! 347: return; ! 348: } ! 349: for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ ! 350: c = zIn[i]; ! 351: if( c>='A' && c<='Z' ){ ! 352: zReverse[j] = c + 'a' - 'A'; ! 353: }else if( c>='a' && c<='z' ){ ! 354: zReverse[j] = c; ! 355: }else{ ! 356: /* The use of a character not in [a-zA-Z] means that we fallback ! 357: ** to the copy stemmer */ ! 358: copy_stemmer(zIn, nIn, zOut, pnOut); ! 359: return; ! 360: } ! 361: } ! 362: memset(&zReverse[sizeof(zReverse)-5], 0, 5); ! 363: z = &zReverse[j+1]; ! 364: ! 365: ! 366: /* Step 1a */ ! 367: if( z[0]=='s' ){ ! 368: if( ! 369: !stem(&z, "sess", "ss", 0) && ! 370: !stem(&z, "sei", "i", 0) && ! 371: !stem(&z, "ss", "ss", 0) ! 372: ){ ! 373: z++; ! 374: } ! 375: } ! 376: ! 377: /* Step 1b */ ! 378: z2 = z; ! 379: if( stem(&z, "dee", "ee", m_gt_0) ){ ! 380: /* Do nothing. The work was all in the test */ ! 381: }else if( ! 382: (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) ! 383: && z!=z2 ! 384: ){ ! 385: if( stem(&z, "ta", "ate", 0) || ! 386: stem(&z, "lb", "ble", 0) || ! 387: stem(&z, "zi", "ize", 0) ){ ! 388: /* Do nothing. The work was all in the test */ ! 389: }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){ ! 390: z++; ! 391: }else if( m_eq_1(z) && star_oh(z) ){ ! 392: *(--z) = 'e'; ! 393: } ! 394: } ! 395: ! 396: /* Step 1c */ ! 397: if( z[0]=='y' && hasVowel(z+1) ){ ! 398: z[0] = 'i'; ! 399: } ! 400: ! 401: /* Step 2 */ ! 402: switch( z[1] ){ ! 403: case 'a': ! 404: stem(&z, "lanoita", "ate", m_gt_0) || ! 405: stem(&z, "lanoit", "tion", m_gt_0); ! 406: break; ! 407: case 'c': ! 408: stem(&z, "icne", "ence", m_gt_0) || ! 409: stem(&z, "icna", "ance", m_gt_0); ! 410: break; ! 411: case 'e': ! 412: stem(&z, "rezi", "ize", m_gt_0); ! 413: break; ! 414: case 'g': ! 415: stem(&z, "igol", "log", m_gt_0); ! 416: break; ! 417: case 'l': ! 418: stem(&z, "ilb", "ble", m_gt_0) || ! 419: stem(&z, "illa", "al", m_gt_0) || ! 420: stem(&z, "iltne", "ent", m_gt_0) || ! 421: stem(&z, "ile", "e", m_gt_0) || ! 422: stem(&z, "ilsuo", "ous", m_gt_0); ! 423: break; ! 424: case 'o': ! 425: stem(&z, "noitazi", "ize", m_gt_0) || ! 426: stem(&z, "noita", "ate", m_gt_0) || ! 427: stem(&z, "rota", "ate", m_gt_0); ! 428: break; ! 429: case 's': ! 430: stem(&z, "msila", "al", m_gt_0) || ! 431: stem(&z, "ssenevi", "ive", m_gt_0) || ! 432: stem(&z, "ssenluf", "ful", m_gt_0) || ! 433: stem(&z, "ssensuo", "ous", m_gt_0); ! 434: break; ! 435: case 't': ! 436: stem(&z, "itila", "al", m_gt_0) || ! 437: stem(&z, "itivi", "ive", m_gt_0) || ! 438: stem(&z, "itilib", "ble", m_gt_0); ! 439: break; ! 440: } ! 441: ! 442: /* Step 3 */ ! 443: switch( z[0] ){ ! 444: case 'e': ! 445: stem(&z, "etaci", "ic", m_gt_0) || ! 446: stem(&z, "evita", "", m_gt_0) || ! 447: stem(&z, "ezila", "al", m_gt_0); ! 448: break; ! 449: case 'i': ! 450: stem(&z, "itici", "ic", m_gt_0); ! 451: break; ! 452: case 'l': ! 453: stem(&z, "laci", "ic", m_gt_0) || ! 454: stem(&z, "luf", "", m_gt_0); ! 455: break; ! 456: case 's': ! 457: stem(&z, "ssen", "", m_gt_0); ! 458: break; ! 459: } ! 460: ! 461: /* Step 4 */ ! 462: switch( z[1] ){ ! 463: case 'a': ! 464: if( z[0]=='l' && m_gt_1(z+2) ){ ! 465: z += 2; ! 466: } ! 467: break; ! 468: case 'c': ! 469: if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){ ! 470: z += 4; ! 471: } ! 472: break; ! 473: case 'e': ! 474: if( z[0]=='r' && m_gt_1(z+2) ){ ! 475: z += 2; ! 476: } ! 477: break; ! 478: case 'i': ! 479: if( z[0]=='c' && m_gt_1(z+2) ){ ! 480: z += 2; ! 481: } ! 482: break; ! 483: case 'l': ! 484: if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){ ! 485: z += 4; ! 486: } ! 487: break; ! 488: case 'n': ! 489: if( z[0]=='t' ){ ! 490: if( z[2]=='a' ){ ! 491: if( m_gt_1(z+3) ){ ! 492: z += 3; ! 493: } ! 494: }else if( z[2]=='e' ){ ! 495: stem(&z, "tneme", "", m_gt_1) || ! 496: stem(&z, "tnem", "", m_gt_1) || ! 497: stem(&z, "tne", "", m_gt_1); ! 498: } ! 499: } ! 500: break; ! 501: case 'o': ! 502: if( z[0]=='u' ){ ! 503: if( m_gt_1(z+2) ){ ! 504: z += 2; ! 505: } ! 506: }else if( z[3]=='s' || z[3]=='t' ){ ! 507: stem(&z, "noi", "", m_gt_1); ! 508: } ! 509: break; ! 510: case 's': ! 511: if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ ! 512: z += 3; ! 513: } ! 514: break; ! 515: case 't': ! 516: stem(&z, "eta", "", m_gt_1) || ! 517: stem(&z, "iti", "", m_gt_1); ! 518: break; ! 519: case 'u': ! 520: if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ ! 521: z += 3; ! 522: } ! 523: break; ! 524: case 'v': ! 525: case 'z': ! 526: if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ ! 527: z += 3; ! 528: } ! 529: break; ! 530: } ! 531: ! 532: /* Step 5a */ ! 533: if( z[0]=='e' ){ ! 534: if( m_gt_1(z+1) ){ ! 535: z++; ! 536: }else if( m_eq_1(z+1) && !star_oh(z+1) ){ ! 537: z++; ! 538: } ! 539: } ! 540: ! 541: /* Step 5b */ ! 542: if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ ! 543: z++; ! 544: } ! 545: ! 546: /* z[] is now the stemmed word in reverse order. Flip it back ! 547: ** around into forward order and return. ! 548: */ ! 549: *pnOut = i = strlen(z); ! 550: zOut[i] = 0; ! 551: while( *z ){ ! 552: zOut[--i] = *(z++); ! 553: } ! 554: } ! 555: ! 556: /* ! 557: ** Characters that can be part of a token. We assume any character ! 558: ** whose value is greater than 0x80 (any UTF character) can be ! 559: ** part of a token. In other words, delimiters all must have ! 560: ** values of 0x7f or lower. ! 561: */ ! 562: static const char porterIdChar[] = { ! 563: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ ! 564: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ ! 565: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ ! 566: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ ! 567: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ ! 568: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ ! 569: }; ! 570: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) ! 571: ! 572: /* ! 573: ** Extract the next token from a tokenization cursor. The cursor must ! 574: ** have been opened by a prior call to porterOpen(). ! 575: */ ! 576: static int porterNext( ! 577: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */ ! 578: const char **pzToken, /* OUT: *pzToken is the token text */ ! 579: int *pnBytes, /* OUT: Number of bytes in token */ ! 580: int *piStartOffset, /* OUT: Starting offset of token */ ! 581: int *piEndOffset, /* OUT: Ending offset of token */ ! 582: int *piPosition /* OUT: Position integer of token */ ! 583: ){ ! 584: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor; ! 585: const char *z = c->zInput; ! 586: ! 587: while( c->iOffset<c->nInput ){ ! 588: int iStartOffset, ch; ! 589: ! 590: /* Scan past delimiter characters */ ! 591: while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){ ! 592: c->iOffset++; ! 593: } ! 594: ! 595: /* Count non-delimiter characters. */ ! 596: iStartOffset = c->iOffset; ! 597: while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ ! 598: c->iOffset++; ! 599: } ! 600: ! 601: if( c->iOffset>iStartOffset ){ ! 602: int n = c->iOffset-iStartOffset; ! 603: if( n>c->nAllocated ){ ! 604: c->nAllocated = n+20; ! 605: c->zToken = sqlite3_realloc(c->zToken, c->nAllocated); ! 606: if( c->zToken==NULL ) return SQLITE_NOMEM; ! 607: } ! 608: porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); ! 609: *pzToken = c->zToken; ! 610: *piStartOffset = iStartOffset; ! 611: *piEndOffset = c->iOffset; ! 612: *piPosition = c->iToken++; ! 613: return SQLITE_OK; ! 614: } ! 615: } ! 616: return SQLITE_DONE; ! 617: } ! 618: ! 619: /* ! 620: ** The set of routines that implement the porter-stemmer tokenizer ! 621: */ ! 622: static const sqlite3_tokenizer_module porterTokenizerModule = { ! 623: 0, ! 624: porterCreate, ! 625: porterDestroy, ! 626: porterOpen, ! 627: porterClose, ! 628: porterNext, ! 629: }; ! 630: ! 631: /* ! 632: ** Allocate a new porter tokenizer. Return a pointer to the new ! 633: ** tokenizer in *ppModule ! 634: */ ! 635: void sqlite3Fts2PorterTokenizerModule( ! 636: sqlite3_tokenizer_module const**ppModule ! 637: ){ ! 638: *ppModule = &porterTokenizerModule; ! 639: } ! 640: ! 641: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */