Annotation of embedaddon/sqlite3/ext/fts1/fts1_porter.c, revision 1.1
1.1 ! misho 1: /*
! 2: ** 2006 September 30
! 3: **
! 4: ** The author disclaims copyright to this source code. In place of
! 5: ** a legal notice, here is a blessing:
! 6: **
! 7: ** May you do good and not evil.
! 8: ** May you find forgiveness for yourself and forgive others.
! 9: ** May you share freely, never taking more than you give.
! 10: **
! 11: *************************************************************************
! 12: ** Implementation of the full-text-search tokenizer that implements
! 13: ** a Porter stemmer.
! 14: */
! 15:
! 16: /*
! 17: ** The code in this file is only compiled if:
! 18: **
! 19: ** * The FTS1 module is being built as an extension
! 20: ** (in which case SQLITE_CORE is not defined), or
! 21: **
! 22: ** * The FTS1 module is being built into the core of
! 23: ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
! 24: */
! 25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
! 26:
! 27:
! 28: #include <assert.h>
! 29: #include <stdlib.h>
! 30: #include <stdio.h>
! 31: #include <string.h>
! 32: #include <ctype.h>
! 33:
! 34: #include "fts1_tokenizer.h"
! 35:
! 36: /*
! 37: ** Class derived from sqlite3_tokenizer
! 38: */
! 39: typedef struct porter_tokenizer {
! 40: sqlite3_tokenizer base; /* Base class */
! 41: } porter_tokenizer;
! 42:
! 43: /*
! 44: ** Class derived from sqlit3_tokenizer_cursor
! 45: */
! 46: typedef struct porter_tokenizer_cursor {
! 47: sqlite3_tokenizer_cursor base;
! 48: const char *zInput; /* input we are tokenizing */
! 49: int nInput; /* size of the input */
! 50: int iOffset; /* current position in zInput */
! 51: int iToken; /* index of next token to be returned */
! 52: char *zToken; /* storage for current token */
! 53: int nAllocated; /* space allocated to zToken buffer */
! 54: } porter_tokenizer_cursor;
! 55:
! 56:
! 57: /* Forward declaration */
! 58: static const sqlite3_tokenizer_module porterTokenizerModule;
! 59:
! 60:
! 61: /*
! 62: ** Create a new tokenizer instance.
! 63: */
! 64: static int porterCreate(
! 65: int argc, const char * const *argv,
! 66: sqlite3_tokenizer **ppTokenizer
! 67: ){
! 68: porter_tokenizer *t;
! 69: t = (porter_tokenizer *) calloc(sizeof(*t), 1);
! 70: if( t==NULL ) return SQLITE_NOMEM;
! 71:
! 72: *ppTokenizer = &t->base;
! 73: return SQLITE_OK;
! 74: }
! 75:
! 76: /*
! 77: ** Destroy a tokenizer
! 78: */
! 79: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
! 80: free(pTokenizer);
! 81: return SQLITE_OK;
! 82: }
! 83:
! 84: /*
! 85: ** Prepare to begin tokenizing a particular string. The input
! 86: ** string to be tokenized is zInput[0..nInput-1]. A cursor
! 87: ** used to incrementally tokenize this string is returned in
! 88: ** *ppCursor.
! 89: */
! 90: static int porterOpen(
! 91: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
! 92: const char *zInput, int nInput, /* String to be tokenized */
! 93: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
! 94: ){
! 95: porter_tokenizer_cursor *c;
! 96:
! 97: c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
! 98: if( c==NULL ) return SQLITE_NOMEM;
! 99:
! 100: c->zInput = zInput;
! 101: if( zInput==0 ){
! 102: c->nInput = 0;
! 103: }else if( nInput<0 ){
! 104: c->nInput = (int)strlen(zInput);
! 105: }else{
! 106: c->nInput = nInput;
! 107: }
! 108: c->iOffset = 0; /* start tokenizing at the beginning */
! 109: c->iToken = 0;
! 110: c->zToken = NULL; /* no space allocated, yet. */
! 111: c->nAllocated = 0;
! 112:
! 113: *ppCursor = &c->base;
! 114: return SQLITE_OK;
! 115: }
! 116:
! 117: /*
! 118: ** Close a tokenization cursor previously opened by a call to
! 119: ** porterOpen() above.
! 120: */
! 121: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
! 122: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 123: free(c->zToken);
! 124: free(c);
! 125: return SQLITE_OK;
! 126: }
/*
** Class of each lower-case letter 'a' through 'z':
**
**    0   the letter is always a vowel
**    1   the letter is always a consonant
**    2   the letter is 'y', whose class depends on its neighbor
*/
static const char cType[] = {
  /* a  b  c  d  e  f  g  h  i  j  k  l  m */
     0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
  /* n  o  p  q  r  s  t  u  v  w  x  y  z */
     1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
};

/*
** isConsonant() and isVowel() determine whether the first character
** of the string they are given is a consonant or a vowel according
** to the Porter rules.  Both return 0 for an empty string.  The
** first character must otherwise be a lower-case letter 'a'..'z'.
**
** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
** 'y' is a consonant unless it follows another consonant, in which
** case it is a vowel.
**
** In these routines the letters are in reverse order.  So the 'y'
** rule becomes: 'y' is a consonant unless the character after it in
** z[] is a consonant.  A 'y' that is the last character of z[]
** (the first letter of the word) is a consonant.
*/
static int isVowel(const char*);
static int isConsonant(const char *z){
  const char ch = z[0];
  if( ch=='\0' ){
    return 0;
  }
  assert( ch>='a' && ch<='z' );
  switch( cType[ch-'a'] ){
    case 0:  return 0;
    case 1:  return 1;
    default: break;
  }
  /* Only 'y' reaches this point. */
  if( z[1]=='\0' ){
    return 1;
  }
  return isVowel(&z[1]);
}
static int isVowel(const char *z){
  const char ch = z[0];
  if( ch=='\0' ){
    return 0;
  }
  assert( ch>='a' && ch<='z' );
  switch( cType[ch-'a'] ){
    case 0:  return 1;
    case 1:  return 0;
    default: break;
  }
  /* Only 'y' reaches this point. */
  return isConsonant(&z[1]);
}
! 167:
/*
** Let V stand for a sequence of one or more vowels and C for a
** sequence of one or more consonants.  Then every word can be
** written as:
**
**           [C] (VC){m} [V]
**
** In prose:  A word is an optional consonant sequence followed by
** zero or more vowel-consonant pairs followed by an optional vowel
** sequence.  "m" is the number of vowel-consonant pairs.
**
** Return true if the m-value for z is 1 or more.  In other words,
** return true if z contains at least one vowel that is followed
** by a consonant.
**
** In this routine z[] is in reverse order.  So we are really looking
** for an instance of a consonant followed by a vowel.
*/
static int m_gt_0(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}        /* optional trailing [V] */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isConsonant(p); p++){}    /* the C of a VC pair */
  return p[0]!='\0';              /* anything left starts with a vowel */
}
! 193:
/*
** Like m_gt_0() except that this routine returns true only if the
** m-value of the reversed word z[] is exactly 1.
*/
static int m_eq_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}        /* optional trailing [V] */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isConsonant(p); p++){}    /* C of the one VC pair */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isVowel(p); p++){}        /* V of the one VC pair */
  if( p[0]=='\0' ){
    return 1;
  }
  for(; isConsonant(p); p++){}    /* optional leading [C] */
  return p[0]=='\0';
}
! 207:
/*
** Like m_gt_0() except that this routine returns true only if the
** m-value of the reversed word z[] is greater than 1, instead of
** greater than 0.
*/
static int m_gt_1(const char *z){
  const char *p = z;
  for(; isVowel(p); p++){}        /* optional trailing [V] */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isConsonant(p); p++){}    /* C of the last VC pair */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isVowel(p); p++){}        /* V of the last VC pair */
  if( p[0]=='\0' ){
    return 0;
  }
  for(; isConsonant(p); p++){}    /* C of the pair before that */
  return p[0]!='\0';              /* anything left starts with a vowel */
}
! 221:
/*
** Return TRUE if there is a vowel anywhere within the
** zero-terminated string z[].
*/
static int hasVowel(const char *z){
  const char *p = z;
  for(; isConsonant(p); p++){}
  /* The scan stops at a vowel or at the terminator. */
  return p[0]!='\0';
}
! 229:
/*
** Return TRUE if the word ends in a double consonant, meaning the
** same consonant twice in a row.
**
** The text is reversed here.  So we are really looking at
** the first two characters of z[].
*/
static int doubleConsonant(const char *z){
  if( !isConsonant(z) ){
    return 0;
  }
  if( z[0]!=z[1] ){
    return 0;
  }
  return isConsonant(z+1);
}
! 239:
/*
** Return TRUE if the word ends with three letters which
** are consonant-vowel-consonant and where the final consonant
** is not 'w', 'x', or 'y'.
**
** The word is reversed here.  So we are really checking the
** first three letters and the first one cannot be in [wxy].
*/
static int star_oh(const char *z){
  if( z[0]==0 || !isConsonant(z) ){
    return 0;
  }
  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ){
    return 0;
  }
  if( z[1]==0 || !isVowel(z+1) ){
    return 0;
  }
  if( z[2]==0 ){
    return 0;
  }
  return isConsonant(z+2);
}
! 255:
/*
** If the word ends with zFrom and xCond() is true for the stem
** of the word that precedes the zFrom ending, then change the
** ending to zTo.  A NULL xCond means the change is unconditional.
**
** The input word *pz and zFrom are both in reverse order.  zTo
** is in normal order.  The replacement is written backwards into
** the bytes just before the stem, so when zTo is longer than zFrom
** the caller must have spare bytes in front of *pz.  On a
** substitution *pz is updated to the new start of the word.
**
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
** match.  Note that TRUE is returned even if xCond() fails and
** no substitution occurs.
*/
static int stem(
  char **pz,                  /* The word being stemmed (Reversed) */
  const char *zFrom,          /* If the ending matches this... (Reversed) */
  const char *zTo,            /* ... change the ending to this (not reversed) */
  int (*xCond)(const char*)   /* Condition that must be true */
){
  char *zWord = *pz;

  /* Match the ending.  Bail out at the first difference. */
  for(; *zFrom!=0; zFrom++, zWord++){
    if( *zFrom!=*zWord ){
      return 0;
    }
  }

  /* zWord now points at the stem.  Test the condition, if any. */
  if( xCond!=0 && !xCond(zWord) ){
    return 1;
  }

  /* Prepend the new ending, reversing it in the process. */
  for(; *zTo!=0; zTo++){
    *(--zWord) = *zTo;
  }
  *pz = zWord;
  return 1;
}
! 284:
/*
** This is the fallback stemmer used when the porter stemmer is
** inappropriate.  The input word zIn[0..nIn-1] is copied into zOut
** with US-ASCII case folding and a zero terminator is appended.  The
** length of the result, not counting the terminator, is written
** into *pnOut.
**
** If the input word is too long (more than 20 bytes if it contains
** no digits or more than 6 bytes if it contains digits) then the
** word is truncated to 20 or 6 bytes by taking 10 or 3 bytes from
** the beginning and from the end.
**
** The whole word is written to zOut before any truncation, so zOut
** must have room for nIn bytes plus the terminator.
*/
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int sawDigit = 0;       /* True once a digit has been seen */
  int nKeep;              /* Bytes retained from each end of a long word */
  int nOut;               /* Length of the result */
  int k;

  for(k=0; k<nIn; k++){
    int ch = zIn[k];
    if( ch>='A' && ch<='Z' ){
      ch += 'a' - 'A';
    }else if( ch>='0' && ch<='9' ){
      sawDigit = 1;
    }
    zOut[k] = ch;
  }
  nOut = k;

  nKeep = sawDigit ? 3 : 10;
  if( nIn>2*nKeep ){
    /* Slide the last nKeep bytes down so they follow the first nKeep. */
    memmove(&zOut[nKeep], &zOut[nIn-nKeep], (size_t)nKeep);
    nOut = 2*nKeep;
  }
  zOut[nOut] = 0;
  *pnOut = nOut;
}
! 315:
! 316:
! 317: /*
! 318: ** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
! 319: ** zOut is at least big enough to hold nIn bytes. Write the actual
! 320: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
! 321: **
! 322: ** Any upper-case characters in the US-ASCII character set ([A-Z])
! 323: ** are converted to lower case. Upper-case UTF characters are
! 324: ** unchanged.
! 325: **
! 326: ** Words that are longer than about 20 bytes are stemmed by retaining
! 327: ** a few bytes from the beginning and the end of the word. If the
! 328: ** word contains digits, 3 bytes are taken from the beginning and
! 329: ** 3 bytes from the end. For long words without digits, 10 bytes
! 330: ** are taken from each end. US-ASCII case folding still applies.
! 331: **
! 332: ** If the input word contains not digits but does characters not
! 333: ** in [a-zA-Z] then no stemming is attempted and this routine just
! 334: ** copies the input into the input into the output with US-ASCII
! 335: ** case folding.
! 336: **
! 337: ** Stemming never increases the length of the word. So there is
! 338: ** no chance of overflowing the zOut buffer.
! 339: */
! 340: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
! 341: int i, j, c;
! 342: char zReverse[28];
! 343: char *z, *z2;
! 344: if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
! 345: /* The word is too big or too small for the porter stemmer.
! 346: ** Fallback to the copy stemmer */
! 347: copy_stemmer(zIn, nIn, zOut, pnOut);
! 348: return;
! 349: }
! 350: for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
! 351: c = zIn[i];
! 352: if( c>='A' && c<='Z' ){
! 353: zReverse[j] = c + 'a' - 'A';
! 354: }else if( c>='a' && c<='z' ){
! 355: zReverse[j] = c;
! 356: }else{
! 357: /* The use of a character not in [a-zA-Z] means that we fallback
! 358: ** to the copy stemmer */
! 359: copy_stemmer(zIn, nIn, zOut, pnOut);
! 360: return;
! 361: }
! 362: }
! 363: memset(&zReverse[sizeof(zReverse)-5], 0, 5);
! 364: z = &zReverse[j+1];
! 365:
! 366:
! 367: /* Step 1a */
! 368: if( z[0]=='s' ){
! 369: if(
! 370: !stem(&z, "sess", "ss", 0) &&
! 371: !stem(&z, "sei", "i", 0) &&
! 372: !stem(&z, "ss", "ss", 0)
! 373: ){
! 374: z++;
! 375: }
! 376: }
! 377:
! 378: /* Step 1b */
! 379: z2 = z;
! 380: if( stem(&z, "dee", "ee", m_gt_0) ){
! 381: /* Do nothing. The work was all in the test */
! 382: }else if(
! 383: (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
! 384: && z!=z2
! 385: ){
! 386: if( stem(&z, "ta", "ate", 0) ||
! 387: stem(&z, "lb", "ble", 0) ||
! 388: stem(&z, "zi", "ize", 0) ){
! 389: /* Do nothing. The work was all in the test */
! 390: }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
! 391: z++;
! 392: }else if( m_eq_1(z) && star_oh(z) ){
! 393: *(--z) = 'e';
! 394: }
! 395: }
! 396:
! 397: /* Step 1c */
! 398: if( z[0]=='y' && hasVowel(z+1) ){
! 399: z[0] = 'i';
! 400: }
! 401:
! 402: /* Step 2 */
! 403: switch( z[1] ){
! 404: case 'a':
! 405: stem(&z, "lanoita", "ate", m_gt_0) ||
! 406: stem(&z, "lanoit", "tion", m_gt_0);
! 407: break;
! 408: case 'c':
! 409: stem(&z, "icne", "ence", m_gt_0) ||
! 410: stem(&z, "icna", "ance", m_gt_0);
! 411: break;
! 412: case 'e':
! 413: stem(&z, "rezi", "ize", m_gt_0);
! 414: break;
! 415: case 'g':
! 416: stem(&z, "igol", "log", m_gt_0);
! 417: break;
! 418: case 'l':
! 419: stem(&z, "ilb", "ble", m_gt_0) ||
! 420: stem(&z, "illa", "al", m_gt_0) ||
! 421: stem(&z, "iltne", "ent", m_gt_0) ||
! 422: stem(&z, "ile", "e", m_gt_0) ||
! 423: stem(&z, "ilsuo", "ous", m_gt_0);
! 424: break;
! 425: case 'o':
! 426: stem(&z, "noitazi", "ize", m_gt_0) ||
! 427: stem(&z, "noita", "ate", m_gt_0) ||
! 428: stem(&z, "rota", "ate", m_gt_0);
! 429: break;
! 430: case 's':
! 431: stem(&z, "msila", "al", m_gt_0) ||
! 432: stem(&z, "ssenevi", "ive", m_gt_0) ||
! 433: stem(&z, "ssenluf", "ful", m_gt_0) ||
! 434: stem(&z, "ssensuo", "ous", m_gt_0);
! 435: break;
! 436: case 't':
! 437: stem(&z, "itila", "al", m_gt_0) ||
! 438: stem(&z, "itivi", "ive", m_gt_0) ||
! 439: stem(&z, "itilib", "ble", m_gt_0);
! 440: break;
! 441: }
! 442:
! 443: /* Step 3 */
! 444: switch( z[0] ){
! 445: case 'e':
! 446: stem(&z, "etaci", "ic", m_gt_0) ||
! 447: stem(&z, "evita", "", m_gt_0) ||
! 448: stem(&z, "ezila", "al", m_gt_0);
! 449: break;
! 450: case 'i':
! 451: stem(&z, "itici", "ic", m_gt_0);
! 452: break;
! 453: case 'l':
! 454: stem(&z, "laci", "ic", m_gt_0) ||
! 455: stem(&z, "luf", "", m_gt_0);
! 456: break;
! 457: case 's':
! 458: stem(&z, "ssen", "", m_gt_0);
! 459: break;
! 460: }
! 461:
! 462: /* Step 4 */
! 463: switch( z[1] ){
! 464: case 'a':
! 465: if( z[0]=='l' && m_gt_1(z+2) ){
! 466: z += 2;
! 467: }
! 468: break;
! 469: case 'c':
! 470: if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
! 471: z += 4;
! 472: }
! 473: break;
! 474: case 'e':
! 475: if( z[0]=='r' && m_gt_1(z+2) ){
! 476: z += 2;
! 477: }
! 478: break;
! 479: case 'i':
! 480: if( z[0]=='c' && m_gt_1(z+2) ){
! 481: z += 2;
! 482: }
! 483: break;
! 484: case 'l':
! 485: if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
! 486: z += 4;
! 487: }
! 488: break;
! 489: case 'n':
! 490: if( z[0]=='t' ){
! 491: if( z[2]=='a' ){
! 492: if( m_gt_1(z+3) ){
! 493: z += 3;
! 494: }
! 495: }else if( z[2]=='e' ){
! 496: stem(&z, "tneme", "", m_gt_1) ||
! 497: stem(&z, "tnem", "", m_gt_1) ||
! 498: stem(&z, "tne", "", m_gt_1);
! 499: }
! 500: }
! 501: break;
! 502: case 'o':
! 503: if( z[0]=='u' ){
! 504: if( m_gt_1(z+2) ){
! 505: z += 2;
! 506: }
! 507: }else if( z[3]=='s' || z[3]=='t' ){
! 508: stem(&z, "noi", "", m_gt_1);
! 509: }
! 510: break;
! 511: case 's':
! 512: if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
! 513: z += 3;
! 514: }
! 515: break;
! 516: case 't':
! 517: stem(&z, "eta", "", m_gt_1) ||
! 518: stem(&z, "iti", "", m_gt_1);
! 519: break;
! 520: case 'u':
! 521: if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
! 522: z += 3;
! 523: }
! 524: break;
! 525: case 'v':
! 526: case 'z':
! 527: if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
! 528: z += 3;
! 529: }
! 530: break;
! 531: }
! 532:
! 533: /* Step 5a */
! 534: if( z[0]=='e' ){
! 535: if( m_gt_1(z+1) ){
! 536: z++;
! 537: }else if( m_eq_1(z+1) && !star_oh(z+1) ){
! 538: z++;
! 539: }
! 540: }
! 541:
! 542: /* Step 5b */
! 543: if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
! 544: z++;
! 545: }
! 546:
! 547: /* z[] is now the stemmed word in reverse order. Flip it back
! 548: ** around into forward order and return.
! 549: */
! 550: *pnOut = i = strlen(z);
! 551: zOut[i] = 0;
! 552: while( *z ){
! 553: zOut[--i] = *(z++);
! 554: }
! 555: }
! 556:
/*
** Characters that can be part of a token.  We assume any character
** whose value is 0x80 or greater (any byte of a multi-byte UTF
** character) can be part of a token.  In other words, delimiters all
** must have values of 0x7f or lower.
**
** isIdChar[] covers the characters 0x30 through 0x7f.  Digits,
** letters and '_' are token characters.  Everything below 0x30 is a
** delimiter.
**
** The idChar() and isDelim() macros assign their argument to a
** variable named "ch", which must be declared (as an integer type)
** in the scope where the macros are used.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x  0-9       */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x  A-O       */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x  P-Z and _ */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x  a-o       */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0   /* 7x  p-z       */
};
#define idChar(C)  (((ch=(C))&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
#define isDelim(C) (((ch=(C))&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
! 573:
! 574: /*
! 575: ** Extract the next token from a tokenization cursor. The cursor must
! 576: ** have been opened by a prior call to porterOpen().
! 577: */
! 578: static int porterNext(
! 579: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
! 580: const char **pzToken, /* OUT: *pzToken is the token text */
! 581: int *pnBytes, /* OUT: Number of bytes in token */
! 582: int *piStartOffset, /* OUT: Starting offset of token */
! 583: int *piEndOffset, /* OUT: Ending offset of token */
! 584: int *piPosition /* OUT: Position integer of token */
! 585: ){
! 586: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
! 587: const char *z = c->zInput;
! 588:
! 589: while( c->iOffset<c->nInput ){
! 590: int iStartOffset, ch;
! 591:
! 592: /* Scan past delimiter characters */
! 593: while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
! 594: c->iOffset++;
! 595: }
! 596:
! 597: /* Count non-delimiter characters. */
! 598: iStartOffset = c->iOffset;
! 599: while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
! 600: c->iOffset++;
! 601: }
! 602:
! 603: if( c->iOffset>iStartOffset ){
! 604: int n = c->iOffset-iStartOffset;
! 605: if( n>c->nAllocated ){
! 606: c->nAllocated = n+20;
! 607: c->zToken = realloc(c->zToken, c->nAllocated);
! 608: if( c->zToken==NULL ) return SQLITE_NOMEM;
! 609: }
! 610: porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
! 611: *pzToken = c->zToken;
! 612: *piStartOffset = iStartOffset;
! 613: *piEndOffset = c->iOffset;
! 614: *piPosition = c->iToken++;
! 615: return SQLITE_OK;
! 616: }
! 617: }
! 618: return SQLITE_DONE;
! 619: }
! 620:
! 621: /*
! 622: ** The set of routines that implement the porter-stemmer tokenizer
! 623: */
! 624: static const sqlite3_tokenizer_module porterTokenizerModule = {
! 625: 0,
! 626: porterCreate,
! 627: porterDestroy,
! 628: porterOpen,
! 629: porterClose,
! 630: porterNext,
! 631: };
! 632:
! 633: /*
! 634: ** Allocate a new porter tokenizer. Return a pointer to the new
! 635: ** tokenizer in *ppModule
! 636: */
! 637: void sqlite3Fts1PorterTokenizerModule(
! 638: sqlite3_tokenizer_module const**ppModule
! 639: ){
! 640: *ppModule = &porterTokenizerModule;
! 641: }
! 642:
! 643: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>