Annotation of embedaddon/sqlite3/ext/fts3/fts3_porter.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2006 September 30
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: *************************************************************************
                     12: ** Implementation of the full-text-search tokenizer that implements
                     13: ** a Porter stemmer.
                     14: */
                     15: 
                     16: /*
                     17: ** The code in this file is only compiled if:
                     18: **
                     19: **     * The FTS3 module is being built as an extension
                     20: **       (in which case SQLITE_CORE is not defined), or
                     21: **
                     22: **     * The FTS3 module is being built into the core of
                     23: **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
                     24: */
                     25: #include "fts3Int.h"
                     26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
                     27: 
                     28: #include <assert.h>
                     29: #include <stdlib.h>
                     30: #include <stdio.h>
                     31: #include <string.h>
                     32: 
                     33: #include "fts3_tokenizer.h"
                     34: 
                     35: /*
                     36: ** Class derived from sqlite3_tokenizer
                     37: */
                     38: typedef struct porter_tokenizer {
                     39:   sqlite3_tokenizer base;      /* Base class */
                     40: } porter_tokenizer;
                     41: 
                     42: /*
                     43: ** Class derived from sqlite3_tokenizer_cursor
                     44: */
                     45: typedef struct porter_tokenizer_cursor {
                     46:   sqlite3_tokenizer_cursor base;
                     47:   const char *zInput;          /* input we are tokenizing */
                     48:   int nInput;                  /* size of the input */
                     49:   int iOffset;                 /* current position in zInput */
                     50:   int iToken;                  /* index of next token to be returned */
                     51:   char *zToken;                /* storage for current token */
                     52:   int nAllocated;              /* space allocated to zToken buffer */
                     53: } porter_tokenizer_cursor;
                     54: 
                     55: 
                     56: /*
                     57: ** Create a new tokenizer instance.
                     58: */
                     59: static int porterCreate(
                     60:   int argc, const char * const *argv,
                     61:   sqlite3_tokenizer **ppTokenizer
                     62: ){
                     63:   porter_tokenizer *t;
                     64: 
                     65:   UNUSED_PARAMETER(argc);
                     66:   UNUSED_PARAMETER(argv);
                     67: 
                     68:   t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
                     69:   if( t==NULL ) return SQLITE_NOMEM;
                     70:   memset(t, 0, sizeof(*t));
                     71:   *ppTokenizer = &t->base;
                     72:   return SQLITE_OK;
                     73: }
                     74: 
                     75: /*
                     76: ** Destroy a tokenizer
                     77: */
                     78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
                     79:   sqlite3_free(pTokenizer);
                     80:   return SQLITE_OK;
                     81: }
                     82: 
                     83: /*
                     84: ** Prepare to begin tokenizing a particular string.  The input
                     85: ** string to be tokenized is zInput[0..nInput-1].  A cursor
                     86: ** used to incrementally tokenize this string is returned in 
                     87: ** *ppCursor.
                     88: */
                     89: static int porterOpen(
                     90:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
                     91:   const char *zInput, int nInput,        /* String to be tokenized */
                     92:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
                     93: ){
                     94:   porter_tokenizer_cursor *c;
                     95: 
                     96:   UNUSED_PARAMETER(pTokenizer);
                     97: 
                     98:   c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
                     99:   if( c==NULL ) return SQLITE_NOMEM;
                    100: 
                    101:   c->zInput = zInput;
                    102:   if( zInput==0 ){
                    103:     c->nInput = 0;
                    104:   }else if( nInput<0 ){
                    105:     c->nInput = (int)strlen(zInput);
                    106:   }else{
                    107:     c->nInput = nInput;
                    108:   }
                    109:   c->iOffset = 0;                 /* start tokenizing at the beginning */
                    110:   c->iToken = 0;
                    111:   c->zToken = NULL;               /* no space allocated, yet. */
                    112:   c->nAllocated = 0;
                    113: 
                    114:   *ppCursor = &c->base;
                    115:   return SQLITE_OK;
                    116: }
                    117: 
                    118: /*
                    119: ** Close a tokenization cursor previously opened by a call to
                    120: ** porterOpen() above.
                    121: */
                    122: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
                    123:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    124:   sqlite3_free(c->zToken);
                    125:   sqlite3_free(c);
                    126:   return SQLITE_OK;
                    127: }
                    128: /*
                    129: ** Vowel or consonant
                    130: */
                    131: static const char cType[] = {
                    132:    0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
                    133:    1, 1, 1, 2, 1
                    134: };
                    135: 
                    136: /*
                    137: ** isConsonant() and isVowel() determine if their first character in
                    138: ** the string they point to is a consonant or a vowel, according
                    139: ** to Porter ruls.  
                    140: **
                    141: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
                    142: ** 'Y' is a consonant unless it follows another consonant,
                    143: ** in which case it is a vowel.
                    144: **
                    145: ** In these routine, the letters are in reverse order.  So the 'y' rule
                    146: ** is that 'y' is a consonant unless it is followed by another
                    147: ** consonent.
                    148: */
                    149: static int isVowel(const char*);
                    150: static int isConsonant(const char *z){
                    151:   int j;
                    152:   char x = *z;
                    153:   if( x==0 ) return 0;
                    154:   assert( x>='a' && x<='z' );
                    155:   j = cType[x-'a'];
                    156:   if( j<2 ) return j;
                    157:   return z[1]==0 || isVowel(z + 1);
                    158: }
                    159: static int isVowel(const char *z){
                    160:   int j;
                    161:   char x = *z;
                    162:   if( x==0 ) return 0;
                    163:   assert( x>='a' && x<='z' );
                    164:   j = cType[x-'a'];
                    165:   if( j<2 ) return 1-j;
                    166:   return isConsonant(z + 1);
                    167: }
                    168: 
                    169: /*
                    170: ** Let any sequence of one or more vowels be represented by V and let
                    171: ** C be sequence of one or more consonants.  Then every word can be
                    172: ** represented as:
                    173: **
                    174: **           [C] (VC){m} [V]
                    175: **
                    176: ** In prose:  A word is an optional consonant run followed by zero or
                    177: ** more vowel-consonant pairs followed by an optional vowel run.  "m"
                    178: ** is the number of vowel-consonant pairs.  The routines below test
                    179: ** the value of m for a nul-terminated word.
                    180: **
                    181: ** Return true if the m-value for z is 1 or more.  In other words,
                    182: ** return true if z contains at least one vowel that is followed
                    183: ** by a consonant.
                    184: **
                    185: ** In this routine z[] is in reverse order.  So we are really looking
                    186: ** for an instance of a consonant followed by a vowel.
                    187: */
                    188: static int m_gt_0(const char *z){
                    189:   while( isVowel(z) ){ z++; }
                    190:   if( *z==0 ) return 0;
                    191:   while( isConsonant(z) ){ z++; }
                    192:   return *z!=0;
                    193: }
                    194: 
                    195: /* Like m_gt_0 above except we are looking for a value of m which is
                    196: ** exactly 1
                    197: */
                    198: static int m_eq_1(const char *z){
                    199:   while( isVowel(z) ){ z++; }
                    200:   if( *z==0 ) return 0;
                    201:   while( isConsonant(z) ){ z++; }
                    202:   if( *z==0 ) return 0;
                    203:   while( isVowel(z) ){ z++; }
                    204:   if( *z==0 ) return 1;
                    205:   while( isConsonant(z) ){ z++; }
                    206:   return *z==0;
                    207: }
                    208: 
                    209: /* Like m_gt_0 above except we are looking for a value of m>1 instead
                    210: ** of m>0
                    211: */
                    212: static int m_gt_1(const char *z){
                    213:   while( isVowel(z) ){ z++; }
                    214:   if( *z==0 ) return 0;
                    215:   while( isConsonant(z) ){ z++; }
                    216:   if( *z==0 ) return 0;
                    217:   while( isVowel(z) ){ z++; }
                    218:   if( *z==0 ) return 0;
                    219:   while( isConsonant(z) ){ z++; }
                    220:   return *z!=0;
                    221: }
                    222: 
                    223: /*
                    224: ** Return TRUE if there is a vowel anywhere within the string z[].
                    225: */
                    226: static int hasVowel(const char *z){
                    227:   while( isConsonant(z) ){ z++; }
                    228:   return *z!=0;
                    229: }
                    230: 
                    231: /*
                    232: ** Return TRUE if the word ends in a double consonant.
                    233: **
                    234: ** The text is reversed here. So we are really looking at
                    235: ** the first two characters of z[].
                    236: */
                    237: static int doubleConsonant(const char *z){
                    238:   return isConsonant(z) && z[0]==z[1];
                    239: }
                    240: 
                    241: /*
                    242: ** Return TRUE if the word ends with three letters which
                    243: ** are consonant-vowel-consonant and where the final consonant
                    244: ** is not 'w', 'x', or 'y'.
                    245: **
                    246: ** The word is reversed here.  So we are really checking the
                    247: ** first three letters and the first one cannot be in [wxy].
                    248: */
                    249: static int star_oh(const char *z){
                    250:   return
                    251:     isConsonant(z) &&
                    252:     z[0]!='w' && z[0]!='x' && z[0]!='y' &&
                    253:     isVowel(z+1) &&
                    254:     isConsonant(z+2);
                    255: }
                    256: 
                    257: /*
                    258: ** If the word ends with zFrom and xCond() is true for the stem
                    259: ** of the word that precedes the zFrom ending, then change the
                    260: ** ending to zTo.
                    261: **
                    262: ** The input word *pz and zFrom are both in reverse order.  zTo
                    263: ** is in normal order. 
                    264: **
                    265: ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
                    266: ** match.  Note that TRUE is returned even if xCond() fails and
                    267: ** no substitution occurs.
                    268: */
                    269: static int stem(
                    270:   char **pz,             /* The word being stemmed (Reversed) */
                    271:   const char *zFrom,     /* If the ending matches this... (Reversed) */
                    272:   const char *zTo,       /* ... change the ending to this (not reversed) */
                    273:   int (*xCond)(const char*)   /* Condition that must be true */
                    274: ){
                    275:   char *z = *pz;
                    276:   while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
                    277:   if( *zFrom!=0 ) return 0;
                    278:   if( xCond && !xCond(z) ) return 1;
                    279:   while( *zTo ){
                    280:     *(--z) = *(zTo++);
                    281:   }
                    282:   *pz = z;
                    283:   return 1;
                    284: }
                    285: 
                    286: /*
                    287: ** This is the fallback stemmer used when the porter stemmer is
                    288: ** inappropriate.  The input word is copied into the output with
                    289: ** US-ASCII case folding.  If the input word is too long (more
                    290: ** than 20 bytes if it contains no digits or more than 6 bytes if
                    291: ** it contains digits) then word is truncated to 20 or 6 bytes
                    292: ** by taking 10 or 3 bytes from the beginning and end.
                    293: */
                    294: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                    295:   int i, mx, j;
                    296:   int hasDigit = 0;
                    297:   for(i=0; i<nIn; i++){
                    298:     char c = zIn[i];
                    299:     if( c>='A' && c<='Z' ){
                    300:       zOut[i] = c - 'A' + 'a';
                    301:     }else{
                    302:       if( c>='0' && c<='9' ) hasDigit = 1;
                    303:       zOut[i] = c;
                    304:     }
                    305:   }
                    306:   mx = hasDigit ? 3 : 10;
                    307:   if( nIn>mx*2 ){
                    308:     for(j=mx, i=nIn-mx; i<nIn; i++, j++){
                    309:       zOut[j] = zOut[i];
                    310:     }
                    311:     i = j;
                    312:   }
                    313:   zOut[i] = 0;
                    314:   *pnOut = i;
                    315: }
                    316: 
                    317: 
                    318: /*
                    319: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
                    320: ** zOut is at least big enough to hold nIn bytes.  Write the actual
                    321: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
                    322: **
                    323: ** Any upper-case characters in the US-ASCII character set ([A-Z])
                    324: ** are converted to lower case.  Upper-case UTF characters are
                    325: ** unchanged.
                    326: **
                    327: ** Words that are longer than about 20 bytes are stemmed by retaining
                    328: ** a few bytes from the beginning and the end of the word.  If the
                    329: ** word contains digits, 3 bytes are taken from the beginning and
                    330: ** 3 bytes from the end.  For long words without digits, 10 bytes
                    331: ** are taken from each end.  US-ASCII case folding still applies.
                    332: ** 
                    333: ** If the input word contains characters that are not in [a-zA-Z]
                    334: ** (digits included) then no stemming is attempted and this routine
                    335: ** just copies the input into the output with US-ASCII case folding
                    336: ** (and the long-word truncation described above).
                    337: **
                    338: ** Stemming never increases the length of the word.  So there is
                    339: ** no chance of overflowing the zOut buffer.
                    340: */
                    341: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                    342:   int i, j;
                    343:   char zReverse[28];
                    344:   char *z, *z2;
                    345:   if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){
                    346:     /* The word is too big or too small for the porter stemmer.
                    347:     ** Fallback to the copy stemmer */
                    348:     copy_stemmer(zIn, nIn, zOut, pnOut);
                    349:     return;
                    350:   }
                    351:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
                    352:     char c = zIn[i];
                    353:     if( c>='A' && c<='Z' ){
                    354:       zReverse[j] = c + 'a' - 'A';
                    355:     }else if( c>='a' && c<='z' ){
                    356:       zReverse[j] = c;
                    357:     }else{
                    358:       /* The use of a character not in [a-zA-Z] means that we fallback
                    359:       ** to the copy stemmer */
                    360:       copy_stemmer(zIn, nIn, zOut, pnOut);
                    361:       return;
                    362:     }
                    363:   }
                    364:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
                    365:   z = &zReverse[j+1];
                    366: 
                    367: 
                    368:   /* Step 1a */
                    369:   if( z[0]=='s' ){
                    370:     if(
                    371:      !stem(&z, "sess", "ss", 0) &&
                    372:      !stem(&z, "sei", "i", 0)  &&
                    373:      !stem(&z, "ss", "ss", 0)
                    374:     ){
                    375:       z++;
                    376:     }
                    377:   }
                    378: 
                    379:   /* Step 1b */  
                    380:   z2 = z;
                    381:   if( stem(&z, "dee", "ee", m_gt_0) ){
                    382:     /* Do nothing.  The work was all in the test */
                    383:   }else if( 
                    384:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
                    385:       && z!=z2
                    386:   ){
                    387:      if( stem(&z, "ta", "ate", 0) ||
                    388:          stem(&z, "lb", "ble", 0) ||
                    389:          stem(&z, "zi", "ize", 0) ){
                    390:        /* Do nothing.  The work was all in the test */
                    391:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
                    392:        z++;
                    393:      }else if( m_eq_1(z) && star_oh(z) ){
                    394:        *(--z) = 'e';
                    395:      }
                    396:   }
                    397: 
                    398:   /* Step 1c */
                    399:   if( z[0]=='y' && hasVowel(z+1) ){
                    400:     z[0] = 'i';
                    401:   }
                    402: 
                    403:   /* Step 2 */
                    404:   switch( z[1] ){
                    405:    case 'a':
                    406:      stem(&z, "lanoita", "ate", m_gt_0) ||
                    407:      stem(&z, "lanoit", "tion", m_gt_0);
                    408:      break;
                    409:    case 'c':
                    410:      stem(&z, "icne", "ence", m_gt_0) ||
                    411:      stem(&z, "icna", "ance", m_gt_0);
                    412:      break;
                    413:    case 'e':
                    414:      stem(&z, "rezi", "ize", m_gt_0);
                    415:      break;
                    416:    case 'g':
                    417:      stem(&z, "igol", "log", m_gt_0);
                    418:      break;
                    419:    case 'l':
                    420:      stem(&z, "ilb", "ble", m_gt_0) ||
                    421:      stem(&z, "illa", "al", m_gt_0) ||
                    422:      stem(&z, "iltne", "ent", m_gt_0) ||
                    423:      stem(&z, "ile", "e", m_gt_0) ||
                    424:      stem(&z, "ilsuo", "ous", m_gt_0);
                    425:      break;
                    426:    case 'o':
                    427:      stem(&z, "noitazi", "ize", m_gt_0) ||
                    428:      stem(&z, "noita", "ate", m_gt_0) ||
                    429:      stem(&z, "rota", "ate", m_gt_0);
                    430:      break;
                    431:    case 's':
                    432:      stem(&z, "msila", "al", m_gt_0) ||
                    433:      stem(&z, "ssenevi", "ive", m_gt_0) ||
                    434:      stem(&z, "ssenluf", "ful", m_gt_0) ||
                    435:      stem(&z, "ssensuo", "ous", m_gt_0);
                    436:      break;
                    437:    case 't':
                    438:      stem(&z, "itila", "al", m_gt_0) ||
                    439:      stem(&z, "itivi", "ive", m_gt_0) ||
                    440:      stem(&z, "itilib", "ble", m_gt_0);
                    441:      break;
                    442:   }
                    443: 
                    444:   /* Step 3 */
                    445:   switch( z[0] ){
                    446:    case 'e':
                    447:      stem(&z, "etaci", "ic", m_gt_0) ||
                    448:      stem(&z, "evita", "", m_gt_0)   ||
                    449:      stem(&z, "ezila", "al", m_gt_0);
                    450:      break;
                    451:    case 'i':
                    452:      stem(&z, "itici", "ic", m_gt_0);
                    453:      break;
                    454:    case 'l':
                    455:      stem(&z, "laci", "ic", m_gt_0) ||
                    456:      stem(&z, "luf", "", m_gt_0);
                    457:      break;
                    458:    case 's':
                    459:      stem(&z, "ssen", "", m_gt_0);
                    460:      break;
                    461:   }
                    462: 
                    463:   /* Step 4 */
                    464:   switch( z[1] ){
                    465:    case 'a':
                    466:      if( z[0]=='l' && m_gt_1(z+2) ){
                    467:        z += 2;
                    468:      }
                    469:      break;
                    470:    case 'c':
                    471:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
                    472:        z += 4;
                    473:      }
                    474:      break;
                    475:    case 'e':
                    476:      if( z[0]=='r' && m_gt_1(z+2) ){
                    477:        z += 2;
                    478:      }
                    479:      break;
                    480:    case 'i':
                    481:      if( z[0]=='c' && m_gt_1(z+2) ){
                    482:        z += 2;
                    483:      }
                    484:      break;
                    485:    case 'l':
                    486:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
                    487:        z += 4;
                    488:      }
                    489:      break;
                    490:    case 'n':
                    491:      if( z[0]=='t' ){
                    492:        if( z[2]=='a' ){
                    493:          if( m_gt_1(z+3) ){
                    494:            z += 3;
                    495:          }
                    496:        }else if( z[2]=='e' ){
                    497:          stem(&z, "tneme", "", m_gt_1) ||
                    498:          stem(&z, "tnem", "", m_gt_1) ||
                    499:          stem(&z, "tne", "", m_gt_1);
                    500:        }
                    501:      }
                    502:      break;
                    503:    case 'o':
                    504:      if( z[0]=='u' ){
                    505:        if( m_gt_1(z+2) ){
                    506:          z += 2;
                    507:        }
                    508:      }else if( z[3]=='s' || z[3]=='t' ){
                    509:        stem(&z, "noi", "", m_gt_1);
                    510:      }
                    511:      break;
                    512:    case 's':
                    513:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
                    514:        z += 3;
                    515:      }
                    516:      break;
                    517:    case 't':
                    518:      stem(&z, "eta", "", m_gt_1) ||
                    519:      stem(&z, "iti", "", m_gt_1);
                    520:      break;
                    521:    case 'u':
                    522:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
                    523:        z += 3;
                    524:      }
                    525:      break;
                    526:    case 'v':
                    527:    case 'z':
                    528:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
                    529:        z += 3;
                    530:      }
                    531:      break;
                    532:   }
                    533: 
                    534:   /* Step 5a */
                    535:   if( z[0]=='e' ){
                    536:     if( m_gt_1(z+1) ){
                    537:       z++;
                    538:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
                    539:       z++;
                    540:     }
                    541:   }
                    542: 
                    543:   /* Step 5b */
                    544:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
                    545:     z++;
                    546:   }
                    547: 
                    548:   /* z[] is now the stemmed word in reverse order.  Flip it back
                    549:   ** around into forward order and return.
                    550:   */
                    551:   *pnOut = i = (int)strlen(z);
                    552:   zOut[i] = 0;
                    553:   while( *z ){
                    554:     zOut[--i] = *(z++);
                    555:   }
                    556: }
                    557: 
                    558: /*
                    559: ** Characters that can be part of a token.  We assume any character
                    560: ** whose value is 0x80 or greater (any byte of a multi-byte UTF-8
                    561: ** character) can be part of a token.  In other words, delimiters
                    562: ** all must have values of 0x7f or lower.
                    563: */
                    564: static const char porterIdChar[] = {
                    565: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
                    566:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
                    567:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
                    568:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
                    569:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
                    570:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
                    571: };
                    572: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
                    573: 
                    574: /*
                    575: ** Extract the next token from a tokenization cursor.  The cursor must
                    576: ** have been opened by a prior call to porterOpen().
                    577: */
                    578: static int porterNext(
                    579:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
                    580:   const char **pzToken,               /* OUT: *pzToken is the token text */
                    581:   int *pnBytes,                       /* OUT: Number of bytes in token */
                    582:   int *piStartOffset,                 /* OUT: Starting offset of token */
                    583:   int *piEndOffset,                   /* OUT: Ending offset of token */
                    584:   int *piPosition                     /* OUT: Position integer of token */
                    585: ){
                    586:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    587:   const char *z = c->zInput;
                    588: 
                    589:   while( c->iOffset<c->nInput ){
                    590:     int iStartOffset, ch;
                    591: 
                    592:     /* Scan past delimiter characters */
                    593:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
                    594:       c->iOffset++;
                    595:     }
                    596: 
                    597:     /* Count non-delimiter characters. */
                    598:     iStartOffset = c->iOffset;
                    599:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
                    600:       c->iOffset++;
                    601:     }
                    602: 
                    603:     if( c->iOffset>iStartOffset ){
                    604:       int n = c->iOffset-iStartOffset;
                    605:       if( n>c->nAllocated ){
                    606:         char *pNew;
                    607:         c->nAllocated = n+20;
                    608:         pNew = sqlite3_realloc(c->zToken, c->nAllocated);
                    609:         if( !pNew ) return SQLITE_NOMEM;
                    610:         c->zToken = pNew;
                    611:       }
                    612:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
                    613:       *pzToken = c->zToken;
                    614:       *piStartOffset = iStartOffset;
                    615:       *piEndOffset = c->iOffset;
                    616:       *piPosition = c->iToken++;
                    617:       return SQLITE_OK;
                    618:     }
                    619:   }
                    620:   return SQLITE_DONE;
                    621: }
                    622: 
                    623: /*
                    624: ** The set of routines that implement the porter-stemmer tokenizer
                    625: */
                    626: static const sqlite3_tokenizer_module porterTokenizerModule = {
                    627:   0,
                    628:   porterCreate,
                    629:   porterDestroy,
                    630:   porterOpen,
                    631:   porterClose,
                    632:   porterNext,
                    633: };
                    634: 
                    635: /*
                    636: ** Return a pointer to the porter tokenizer module in *ppModule.
                    637: ** The module is a static object; nothing is allocated here.
                    638: */
                    639: void sqlite3Fts3PorterTokenizerModule(
                    640:   sqlite3_tokenizer_module const**ppModule
                    641: ){
                    642:   *ppModule = &porterTokenizerModule;
                    643: }
                    644: 
                    645: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>