Annotation of embedaddon/sqlite3/ext/fts3/fts3_porter.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2006 September 30
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: *************************************************************************
! 12: ** Implementation of the full-text-search tokenizer that implements
! 13: ** a Porter stemmer.
! 14: */
! 15:
! 16: /*
! 17: ** The code in this file is only compiled if:
! 18: **
! 19: ** * The FTS3 module is being built as an extension
! 20: ** (in which case SQLITE_CORE is not defined), or
! 21: **
! 22: ** * The FTS3 module is being built into the core of
! 23: ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
! 24: */
! 25: #include "fts3Int.h"
! 26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
! 27:
! 28: #include <assert.h>
! 29: #include <stdlib.h>
! 30: #include <stdio.h>
! 31: #include <string.h>
! 32:
! 33: #include "fts3_tokenizer.h"
! 34:
! 35: /*
! 36: ** Class derived from sqlite3_tokenizer
! 37: */
! 38: typedef struct porter_tokenizer {
! 39: sqlite3_tokenizer base; /* Base class */
! 40: } porter_tokenizer;
! 41:
! 42: /*
! 43: ** Class derived from sqlite3_tokenizer_cursor
! 44: */
! 45: typedef struct porter_tokenizer_cursor {
! 46: sqlite3_tokenizer_cursor base;
! 47: const char *zInput; /* input we are tokenizing */
! 48: int nInput; /* size of the input */
! 49: int iOffset; /* current position in zInput */
! 50: int iToken; /* index of next token to be returned */
! 51: char *zToken; /* storage for current token */
! 52: int nAllocated; /* space allocated to zToken buffer */
! 53: } porter_tokenizer_cursor;
! 54:
! 55:
! 56: /*
! 57: ** Create a new tokenizer instance.
! 58: */
! 59: static int porterCreate(
! 60: int argc, const char * const *argv,
! 61: sqlite3_tokenizer **ppTokenizer
! 62: ){
! 63: porter_tokenizer *t;
! 64:
! 65: UNUSED_PARAMETER(argc);
! 66: UNUSED_PARAMETER(argv);
! 67:
! 68: t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
! 69: if( t==NULL ) return SQLITE_NOMEM;
! 70: memset(t, 0, sizeof(*t));
! 71: *ppTokenizer = &t->base;
! 72: return SQLITE_OK;
! 73: }
! 74:
! 75: /*
! 76: ** Destroy a tokenizer
! 77: */
! 78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
! 79: sqlite3_free(pTokenizer);
! 80: return SQLITE_OK;
! 81: }
! 82:
! 83: /*
! 84: ** Prepare to begin tokenizing a particular string. The input
! 85: ** string to be tokenized is zInput[0..nInput-1]. A cursor
! 86: ** used to incrementally tokenize this string is returned in
! 87: ** *ppCursor.
! 88: */
! 89: static int porterOpen(
! 90: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 91: const char *zInput, int nInput, /* String to be tokenized */
! 92: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 93: ){
! 94: porter_tokenizer_cursor *c;
! 95:
! 96: UNUSED_PARAMETER(pTokenizer);
! 97:
! 98: c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
! 99: if( c==NULL ) return SQLITE_NOMEM;
! 100:
! 101: c->zInput = zInput;
! 102: if( zInput==0 ){
! 103: c->nInput = 0;
! 104: }else if( nInput<0 ){
! 105: c->nInput = (int)strlen(zInput);
! 106: }else{
! 107: c->nInput = nInput;
! 108: }
! 109: c->iOffset = 0; /* start tokenizing at the beginning */
! 110: c->iToken = 0;
! 111: c->zToken = NULL; /* no space allocated, yet. */
! 112: c->nAllocated = 0;
! 113:
! 114: *ppCursor = &c->base;
! 115: return SQLITE_OK;
! 116: }
! 117:
! 118: /*
! 119: ** Close a tokenization cursor previously opened by a call to
! 120: ** porterOpen() above.
! 121: */
! 122: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
! 123: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 124: sqlite3_free(c->zToken);
! 125: sqlite3_free(c);
! 126: return SQLITE_OK;
! 127: }
/*
** Letter classification table, indexed by (letter - 'a'):
**
**    0   vowel ('a', 'e', 'i', 'o', 'u')
**    1   consonant
**    2   'y', whose class depends on the adjacent letter
*/
static const char cType[] = {
  /* a  b  c  d  e  f  g  h  i  j  k  l  m */
     0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
  /* n  o  p  q  r  s  t  u  v  w  x  y  z */
     1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
};
! 135:
! 136: /*
! 137: ** isConsonant() and isVowel() determine if their first character in
! 138: ** the string they point to is a consonant or a vowel, according
! 139: ** to Porter ruls.
! 140: **
! 141: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
! 142: ** 'Y' is a consonant unless it follows another consonant,
! 143: ** in which case it is a vowel.
! 144: **
! 145: ** In these routine, the letters are in reverse order. So the 'y' rule
! 146: ** is that 'y' is a consonant unless it is followed by another
! 147: ** consonent.
! 148: */
! 149: static int isVowel(const char*);
! 150: static int isConsonant(const char *z){
! 151: int j;
! 152: char x = *z;
! 153: if( x==0 ) return 0;
! 154: assert( x>='a' && x<='z' );
! 155: j = cType[x-'a'];
! 156: if( j<2 ) return j;
! 157: return z[1]==0 || isVowel(z + 1);
! 158: }
! 159: static int isVowel(const char *z){
! 160: int j;
! 161: char x = *z;
! 162: if( x==0 ) return 0;
! 163: assert( x>='a' && x<='z' );
! 164: j = cType[x-'a'];
! 165: if( j<2 ) return 1-j;
! 166: return isConsonant(z + 1);
! 167: }
! 168:
/*
** Let V stand for a sequence of one or more vowels and C for a
** sequence of one or more consonants.  Then every word can be
** written as:
**
**           [C] (VC){m} [V]
**
** In prose:  A word is an optional consonant sequence, followed by
** zero or more vowel-consonant pairs, followed by an optional vowel
** sequence.  "m" is the number of vowel-consonant pairs.
**
** Return true if the m-value of z is 1 or more.  In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order.  So we are really looking
** for an instance of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}         /* trailing [V] of the word */
  if( p[0]==0 ) return 0;
  for(; isConsonant(p); p++){}     /* the C of a VC pair */
  return p[0]!=0;                  /* anything left must start with a vowel */
}
! 194:
/*
** Like m_gt_0() above, except that this routine returns true only if
** the m-value of the (reversed) word z is exactly 1.
*/
static int m_eq_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}         /* trailing [V] */
  if( p[0]==0 ) return 0;
  for(; isConsonant(p); p++){}     /* C of the one VC pair */
  if( p[0]==0 ) return 0;
  for(; isVowel(p); p++){}         /* V of the one VC pair */
  if( p[0]==0 ) return 1;
  for(; isConsonant(p); p++){}     /* leading [C] */
  return p[0]==0;                  /* more text here would mean m>1 */
}
! 208:
/*
** Like m_gt_0() above, except that this routine looks for m>1
** instead of m>0.
*/
static int m_gt_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}         /* trailing [V] */
  if( p[0]==0 ) return 0;
  for(; isConsonant(p); p++){}     /* C of the last VC pair */
  if( p[0]==0 ) return 0;
  for(; isVowel(p); p++){}         /* V of the last VC pair */
  if( p[0]==0 ) return 0;
  for(; isConsonant(p); p++){}     /* C of the pair before that */
  return p[0]!=0;                  /* a vowel remains: second VC pair */
}
! 222:
/*
** Return TRUE if there is a vowel anywhere within the nul-terminated
** (reversed) word z.
*/
static int hasVowel(const char *z){
  const char *p = z;
  for(; isConsonant(p); p++){}
  return p[0]!=0;
}
! 230:
/*
** Return TRUE if the word ends in a double consonant, i.e. the same
** consonant twice in a row.
**
** The text is reversed here.  So we are really looking at the first
** two characters of z[].
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ) return 0;
  return z[0]==z[1];
}
! 240:
/*
** Return TRUE if the word ends with three letters which are
** consonant-vowel-consonant and where the final consonant is not
** 'w', 'x', or 'y'.
**
** The word is reversed here.  So we are really checking the first
** three letters, and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  if( !isConsonant(z) ) return 0;
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
  if( !isVowel(z+1) ) return 0;
  return isConsonant(z+2);
}
! 256:
/*
** If the word ends with zFrom and xCond() is true for the stem of the
** word that precedes the zFrom ending, then change the ending to zTo.
** A NULL xCond means "no condition".
**
** The input word *pz and zFrom are both in reverse order.  zTo is in
** normal order.  The replacement is written backwards into the bytes
** just before the remaining stem, and *pz is updated to point at the
** new start of the word.  If zTo is longer than zFrom, bytes before
** the original *pz are written, so the caller must leave room there.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not match.
** Note that TRUE is returned even if xCond() fails and no substitution
** occurs.
*/
static int stem(
  char **pz,                    /* The word being stemmed (Reversed) */
  const char *zFrom,            /* If the ending matches this... (Reversed) */
  const char *zTo,              /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)     /* Condition that must be true */
){
  char *zWord = *pz;
  const char *zSuffix;
  const char *zNew;

  /* Bail out at the first byte where the word and the suffix differ */
  for(zSuffix=zFrom; *zSuffix!=0; zSuffix++, zWord++){
    if( *zSuffix!=*zWord ) return 0;
  }

  /* zWord now points at the stem that precedes the matched ending */
  if( xCond!=0 && xCond(zWord)==0 ){
    return 1;
  }

  /* Prepend zTo.  Walking zTo forwards while stepping zWord backwards
  ** stores it in reversed order, as the rest of the word is. */
  for(zNew=zTo; *zNew!=0; zNew++){
    zWord--;
    *zWord = *zNew;
  }
  *pz = zWord;
  return 1;
}
! 285:
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word zIn[0..nIn-1] is copied into zOut
** with US-ASCII case folding and a nul terminator is appended.  The
** output length, not counting the terminator, is written to *pnOut.
**
** If the input word is too long (more than 20 bytes if it contains no
** digits, or more than 6 bytes if it contains digits) then the word is
** truncated to 20 or 6 bytes by keeping 10 or 3 bytes from the
** beginning and the same number from the end.
**
** zOut must have room for at least nIn bytes, because the whole word
** is copied before any truncation takes place.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int k;
  int nOut;
  int nKeep = 10;             /* Bytes kept from each end of a long word */

  for(k=0; k<nIn; k++){
    char ch = zIn[k];
    if( ch>='0' && ch<='9' ){
      nKeep = 3;              /* Words containing digits are cut shorter */
    }else if( ch>='A' && ch<='Z' ){
      ch = ch - 'A' + 'a';
    }
    zOut[k] = ch;
  }
  nOut = k;

  if( nIn>2*nKeep ){
    /* Slide the last nKeep bytes down so they follow the first nKeep.
    ** The source always lies above the destination, so a forward copy
    ** is safe even where the two ranges overlap. */
    const char *zTail = &zOut[nIn-nKeep];
    for(k=0; k<nKeep; k++){
      zOut[nKeep+k] = zTail[k];
    }
    nOut = 2*nKeep;
  }

  zOut[nOut] = 0;
  *pnOut = nOut;
}
! 316:
! 317:
! 318: /*
! 319: ** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
! 320: ** zOut is at least big enough to hold nIn bytes. Write the actual
! 321: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
! 322: **
! 323: ** Any upper-case characters in the US-ASCII character set ([A-Z])
! 324: ** are converted to lower case. Upper-case UTF characters are
! 325: ** unchanged.
! 326: **
! 327: ** Words that are longer than about 20 bytes are stemmed by retaining
! 328: ** a few bytes from the beginning and the end of the word. If the
! 329: ** word contains digits, 3 bytes are taken from the beginning and
! 330: ** 3 bytes from the end. For long words without digits, 10 bytes
! 331: ** are taken from each end. US-ASCII case folding still applies.
! 332: **
! 333: ** If the input word contains not digits but does characters not
! 334: ** in [a-zA-Z] then no stemming is attempted and this routine just
! 335: ** copies the input into the input into the output with US-ASCII
! 336: ** case folding.
! 337: **
! 338: ** Stemming never increases the length of the word. So there is
! 339: ** no chance of overflowing the zOut buffer.
! 340: */
! 341: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
! 342: int i, j;
! 343: char zReverse[28];
! 344: char *z, *z2;
! 345: if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){
! 346: /* The word is too big or too small for the porter stemmer.
! 347: ** Fallback to the copy stemmer */
! 348: copy_stemmer(zIn, nIn, zOut, pnOut);
! 349: return;
! 350: }
! 351: for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
! 352: char c = zIn[i];
! 353: if( c>='A' && c<='Z' ){
! 354: zReverse[j] = c + 'a' - 'A';
! 355: }else if( c>='a' && c<='z' ){
! 356: zReverse[j] = c;
! 357: }else{
! 358: /* The use of a character not in [a-zA-Z] means that we fallback
! 359: ** to the copy stemmer */
! 360: copy_stemmer(zIn, nIn, zOut, pnOut);
! 361: return;
! 362: }
! 363: }
! 364: memset(&zReverse[sizeof(zReverse)-5], 0, 5);
! 365: z = &zReverse[j+1];
! 366:
! 367:
! 368: /* Step 1a */
! 369: if( z[0]=='s' ){
! 370: if(
! 371: !stem(&z, "sess", "ss", 0) &&
! 372: !stem(&z, "sei", "i", 0) &&
! 373: !stem(&z, "ss", "ss", 0)
! 374: ){
! 375: z++;
! 376: }
! 377: }
! 378:
! 379: /* Step 1b */
! 380: z2 = z;
! 381: if( stem(&z, "dee", "ee", m_gt_0) ){
! 382: /* Do nothing. The work was all in the test */
! 383: }else if(
! 384: (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
! 385: && z!=z2
! 386: ){
! 387: if( stem(&z, "ta", "ate", 0) ||
! 388: stem(&z, "lb", "ble", 0) ||
! 389: stem(&z, "zi", "ize", 0) ){
! 390: /* Do nothing. The work was all in the test */
! 391: }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
! 392: z++;
! 393: }else if( m_eq_1(z) && star_oh(z) ){
! 394: *(--z) = 'e';
! 395: }
! 396: }
! 397:
! 398: /* Step 1c */
! 399: if( z[0]=='y' && hasVowel(z+1) ){
! 400: z[0] = 'i';
! 401: }
! 402:
! 403: /* Step 2 */
! 404: switch( z[1] ){
! 405: case 'a':
! 406: stem(&z, "lanoita", "ate", m_gt_0) ||
! 407: stem(&z, "lanoit", "tion", m_gt_0);
! 408: break;
! 409: case 'c':
! 410: stem(&z, "icne", "ence", m_gt_0) ||
! 411: stem(&z, "icna", "ance", m_gt_0);
! 412: break;
! 413: case 'e':
! 414: stem(&z, "rezi", "ize", m_gt_0);
! 415: break;
! 416: case 'g':
! 417: stem(&z, "igol", "log", m_gt_0);
! 418: break;
! 419: case 'l':
! 420: stem(&z, "ilb", "ble", m_gt_0) ||
! 421: stem(&z, "illa", "al", m_gt_0) ||
! 422: stem(&z, "iltne", "ent", m_gt_0) ||
! 423: stem(&z, "ile", "e", m_gt_0) ||
! 424: stem(&z, "ilsuo", "ous", m_gt_0);
! 425: break;
! 426: case 'o':
! 427: stem(&z, "noitazi", "ize", m_gt_0) ||
! 428: stem(&z, "noita", "ate", m_gt_0) ||
! 429: stem(&z, "rota", "ate", m_gt_0);
! 430: break;
! 431: case 's':
! 432: stem(&z, "msila", "al", m_gt_0) ||
! 433: stem(&z, "ssenevi", "ive", m_gt_0) ||
! 434: stem(&z, "ssenluf", "ful", m_gt_0) ||
! 435: stem(&z, "ssensuo", "ous", m_gt_0);
! 436: break;
! 437: case 't':
! 438: stem(&z, "itila", "al", m_gt_0) ||
! 439: stem(&z, "itivi", "ive", m_gt_0) ||
! 440: stem(&z, "itilib", "ble", m_gt_0);
! 441: break;
! 442: }
! 443:
! 444: /* Step 3 */
! 445: switch( z[0] ){
! 446: case 'e':
! 447: stem(&z, "etaci", "ic", m_gt_0) ||
! 448: stem(&z, "evita", "", m_gt_0) ||
! 449: stem(&z, "ezila", "al", m_gt_0);
! 450: break;
! 451: case 'i':
! 452: stem(&z, "itici", "ic", m_gt_0);
! 453: break;
! 454: case 'l':
! 455: stem(&z, "laci", "ic", m_gt_0) ||
! 456: stem(&z, "luf", "", m_gt_0);
! 457: break;
! 458: case 's':
! 459: stem(&z, "ssen", "", m_gt_0);
! 460: break;
! 461: }
! 462:
! 463: /* Step 4 */
! 464: switch( z[1] ){
! 465: case 'a':
! 466: if( z[0]=='l' && m_gt_1(z+2) ){
! 467: z += 2;
! 468: }
! 469: break;
! 470: case 'c':
! 471: if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
! 472: z += 4;
! 473: }
! 474: break;
! 475: case 'e':
! 476: if( z[0]=='r' && m_gt_1(z+2) ){
! 477: z += 2;
! 478: }
! 479: break;
! 480: case 'i':
! 481: if( z[0]=='c' && m_gt_1(z+2) ){
! 482: z += 2;
! 483: }
! 484: break;
! 485: case 'l':
! 486: if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
! 487: z += 4;
! 488: }
! 489: break;
! 490: case 'n':
! 491: if( z[0]=='t' ){
! 492: if( z[2]=='a' ){
! 493: if( m_gt_1(z+3) ){
! 494: z += 3;
! 495: }
! 496: }else if( z[2]=='e' ){
! 497: stem(&z, "tneme", "", m_gt_1) ||
! 498: stem(&z, "tnem", "", m_gt_1) ||
! 499: stem(&z, "tne", "", m_gt_1);
! 500: }
! 501: }
! 502: break;
! 503: case 'o':
! 504: if( z[0]=='u' ){
! 505: if( m_gt_1(z+2) ){
! 506: z += 2;
! 507: }
! 508: }else if( z[3]=='s' || z[3]=='t' ){
! 509: stem(&z, "noi", "", m_gt_1);
! 510: }
! 511: break;
! 512: case 's':
! 513: if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
! 514: z += 3;
! 515: }
! 516: break;
! 517: case 't':
! 518: stem(&z, "eta", "", m_gt_1) ||
! 519: stem(&z, "iti", "", m_gt_1);
! 520: break;
! 521: case 'u':
! 522: if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
! 523: z += 3;
! 524: }
! 525: break;
! 526: case 'v':
! 527: case 'z':
! 528: if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
! 529: z += 3;
! 530: }
! 531: break;
! 532: }
! 533:
! 534: /* Step 5a */
! 535: if( z[0]=='e' ){
! 536: if( m_gt_1(z+1) ){
! 537: z++;
! 538: }else if( m_eq_1(z+1) && !star_oh(z+1) ){
! 539: z++;
! 540: }
! 541: }
! 542:
! 543: /* Step 5b */
! 544: if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
! 545: z++;
! 546: }
! 547:
! 548: /* z[] is now the stemmed word in reverse order. Flip it back
! 549: ** around into forward order and return.
! 550: */
! 551: *pnOut = i = (int)strlen(z);
! 552: zOut[i] = 0;
! 553: while( *z ){
! 554: zOut[--i] = *(z++);
! 555: }
! 556: }
! 557:
/*
** Characters that can be part of a token.  Any byte with the high bit
** set (0x80 or greater, i.e. any byte of a multi-byte UTF character)
** is always part of a token.  In other words, delimiters all have
** values of 0x7f or lower.
**
** porterIdChar[] covers 0x30 through 0x7f: entry (ch-0x30) is 1 if ch
** may appear in a token.  That is the digits, the ASCII letters and
** '_'.  Every byte below 0x30 is a delimiter.
**
** isDelim(C) evaluates to true if C is a delimiter.  The macro stores
** its argument into a variable named "ch", which must be declared as
** an int in the scope where the macro is used.  The argument is
** evaluated exactly once and is parenthesized so that any expression
** may be passed safely.
*/
static const char porterIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define isDelim(C) (((ch=(C))&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
! 573:
! 574: /*
! 575: ** Extract the next token from a tokenization cursor. The cursor must
! 576: ** have been opened by a prior call to porterOpen().
! 577: */
! 578: static int porterNext(
! 579: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
! 580: const char **pzToken, /* OUT: *pzToken is the token text */
! 581: int *pnBytes, /* OUT: Number of bytes in token */
! 582: int *piStartOffset, /* OUT: Starting offset of token */
! 583: int *piEndOffset, /* OUT: Ending offset of token */
! 584: int *piPosition /* OUT: Position integer of token */
! 585: ){
! 586: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 587: const char *z = c->zInput;
! 588:
! 589: while( c->iOffset<c->nInput ){
! 590: int iStartOffset, ch;
! 591:
! 592: /* Scan past delimiter characters */
! 593: while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
! 594: c->iOffset++;
! 595: }
! 596:
! 597: /* Count non-delimiter characters. */
! 598: iStartOffset = c->iOffset;
! 599: while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
! 600: c->iOffset++;
! 601: }
! 602:
! 603: if( c->iOffset>iStartOffset ){
! 604: int n = c->iOffset-iStartOffset;
! 605: if( n>c->nAllocated ){
! 606: char *pNew;
! 607: c->nAllocated = n+20;
! 608: pNew = sqlite3_realloc(c->zToken, c->nAllocated);
! 609: if( !pNew ) return SQLITE_NOMEM;
! 610: c->zToken = pNew;
! 611: }
! 612: porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
! 613: *pzToken = c->zToken;
! 614: *piStartOffset = iStartOffset;
! 615: *piEndOffset = c->iOffset;
! 616: *piPosition = c->iToken++;
! 617: return SQLITE_OK;
! 618: }
! 619: }
! 620: return SQLITE_DONE;
! 621: }
! 622:
! 623: /*
! 624: ** The set of routines that implement the porter-stemmer tokenizer
! 625: */
! 626: static const sqlite3_tokenizer_module porterTokenizerModule = {
! 627: 0,
! 628: porterCreate,
! 629: porterDestroy,
! 630: porterOpen,
! 631: porterClose,
! 632: porterNext,
! 633: };
! 634:
! 635: /*
! 636: ** Allocate a new porter tokenizer. Return a pointer to the new
! 637: ** tokenizer in *ppModule
! 638: */
! 639: void sqlite3Fts3PorterTokenizerModule(
! 640: sqlite3_tokenizer_module const**ppModule
! 641: ){
! 642: *ppModule = &porterTokenizerModule;
! 643: }
! 644:
! 645: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>