embedaddon/sqlite3/ext/fts2/fts2_porter.c - annotate

Return to fts2_porter.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts2
Annotation of embedaddon/sqlite3/ext/fts2/fts2_porter.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2006 September 30
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: *************************************************************************
                     12: ** Implementation of the full-text-search tokenizer that implements
                     13: ** a Porter stemmer.
                     14: */
                     15: 
                     16: /*
                     17: ** The code in this file is only compiled if:
                     18: **
                     19: **     * The FTS2 module is being built as an extension
                     20: **       (in which case SQLITE_CORE is not defined), or
                     21: **
                     22: **     * The FTS2 module is being built into the core of
                     23: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
                     24: */
                     25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
                     26: 
                     27: 
                     28: #include <assert.h>
                     29: #include <stdlib.h>
                     30: #include <stdio.h>
                     31: #include <string.h>
                     32: 
                     33: #include "fts2_tokenizer.h"
                     34: 
                     35: /*
                     36: ** Class derived from sqlite3_tokenizer
                     37: */
                     38: typedef struct porter_tokenizer {
                     39:   sqlite3_tokenizer base;      /* Base class */
                     40: } porter_tokenizer;
                     41: 
                     42: /*
                     43: ** Class derived from sqlit3_tokenizer_cursor
                     44: */
                     45: typedef struct porter_tokenizer_cursor {
                     46:   sqlite3_tokenizer_cursor base;
                     47:   const char *zInput;          /* input we are tokenizing */
                     48:   int nInput;                  /* size of the input */
                     49:   int iOffset;                 /* current position in zInput */
                     50:   int iToken;                  /* index of next token to be returned */
                     51:   char *zToken;                /* storage for current token */
                     52:   int nAllocated;              /* space allocated to zToken buffer */
                     53: } porter_tokenizer_cursor;
                     54: 
                     55: 
                     56: /* Forward declaration */
                     57: static const sqlite3_tokenizer_module porterTokenizerModule;
                     58: 
                     59: 
                     60: /*
                     61: ** Create a new tokenizer instance.
                     62: */
                     63: static int porterCreate(
                     64:   int argc, const char * const *argv,
                     65:   sqlite3_tokenizer **ppTokenizer
                     66: ){
                     67:   porter_tokenizer *t;
                     68:   t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
                     69:   if( t==NULL ) return SQLITE_NOMEM;
                     70:   memset(t, 0, sizeof(*t));
                     71:   *ppTokenizer = &t->base;
                     72:   return SQLITE_OK;
                     73: }
                     74: 
                     75: /*
                     76: ** Destroy a tokenizer
                     77: */
                     78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
                     79:   sqlite3_free(pTokenizer);
                     80:   return SQLITE_OK;
                     81: }
                     82: 
                     83: /*
                     84: ** Prepare to begin tokenizing a particular string.  The input
                     85: ** string to be tokenized is zInput[0..nInput-1].  A cursor
                     86: ** used to incrementally tokenize this string is returned in 
                     87: ** *ppCursor.
                     88: */
                     89: static int porterOpen(
                     90:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
                     91:   const char *zInput, int nInput,        /* String to be tokenized */
                     92:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
                     93: ){
                     94:   porter_tokenizer_cursor *c;
                     95: 
                     96:   c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
                     97:   if( c==NULL ) return SQLITE_NOMEM;
                     98: 
                     99:   c->zInput = zInput;
                    100:   if( zInput==0 ){
                    101:     c->nInput = 0;
                    102:   }else if( nInput<0 ){
                    103:     c->nInput = (int)strlen(zInput);
                    104:   }else{
                    105:     c->nInput = nInput;
                    106:   }
                    107:   c->iOffset = 0;                 /* start tokenizing at the beginning */
                    108:   c->iToken = 0;
                    109:   c->zToken = NULL;               /* no space allocated, yet. */
                    110:   c->nAllocated = 0;
                    111: 
                    112:   *ppCursor = &c->base;
                    113:   return SQLITE_OK;
                    114: }
                    115: 
                    116: /*
                    117: ** Close a tokenization cursor previously opened by a call to
                    118: ** porterOpen() above.
                    119: */
                    120: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
                    121:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    122:   sqlite3_free(c->zToken);
                    123:   sqlite3_free(c);
                    124:   return SQLITE_OK;
                    125: }
                    /*
                    ** Letter classes, indexed by (letter - 'a'):
                    **    0  vowel
                    **    1  consonant
                    **    2  'y', whose class depends on the neighboring letter
                    */
                    static const char cType[] = {
                      /* a  b  c  d  e  f  g  h  i  j  k  l  m */
                         0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
                      /* n  o  p  q  r  s  t  u  v  w  x  y  z */
                         1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
                    };
                    133: 
                    134: /*
                    135: ** isConsonant() and isVowel() determine if their first character in
                    136: ** the string they point to is a consonant or a vowel, according
                    137: ** to Porter ruls.  
                    138: **
                    139: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
                    140: ** 'Y' is a consonant unless it follows another consonant,
                    141: ** in which case it is a vowel.
                    142: **
                    143: ** In these routine, the letters are in reverse order.  So the 'y' rule
                    144: ** is that 'y' is a consonant unless it is followed by another
                    145: ** consonent.
                    146: */
                    147: static int isVowel(const char*);
                    148: static int isConsonant(const char *z){
                    149:   int j;
                    150:   char x = *z;
                    151:   if( x==0 ) return 0;
                    152:   assert( x>='a' && x<='z' );
                    153:   j = cType[x-'a'];
                    154:   if( j<2 ) return j;
                    155:   return z[1]==0 || isVowel(z + 1);
                    156: }
                    157: static int isVowel(const char *z){
                    158:   int j;
                    159:   char x = *z;
                    160:   if( x==0 ) return 0;
                    161:   assert( x>='a' && x<='z' );
                    162:   j = cType[x-'a'];
                    163:   if( j<2 ) return 1-j;
                    164:   return isConsonant(z + 1);
                    165: }
                    166: 
                    /*
                    ** Let V stand for a run of one or more vowels and C for a run of one
                    ** or more consonants.  Every word then has the shape
                    **
                    **           [C] (VC){m} [V]
                    **
                    ** that is: an optional consonant run, zero or more vowel-consonant
                    ** pairs, and an optional trailing vowel run.  "m" is the number of
                    ** vowel-consonant pairs.
                    **
                    ** Return true if m is 1 or more for z, i.e. the word contains at
                    ** least one vowel that is followed by a consonant.
                    **
                    ** z[] is stored in reverse order, so the scan below really looks for
                    ** a consonant that is followed by a vowel.
                    */
                    static int m_gt_0(const char *z){
                      const char *p = z;
                      for(; isVowel(p); p++){}        /* skip the optional trailing [V] */
                      if( *p=='\0' ){
                        return 0;
                      }
                      for(; isConsonant(p); p++){}    /* the C of a VC pair */
                      return *p!='\0';                /* anything left begins with a vowel */
                    }
                    192: 
                    /*
                    ** Like m_gt_0() except that this returns true only when m is exactly 1.
                    ** z[] is in reverse order.
                    */
                    static int m_eq_1(const char *z){
                      const char *p = z;
                      for(; isVowel(p); p++){}        /* optional trailing [V] */
                      if( *p=='\0' ){
                        return 0;                     /* no consonant at all: m==0 */
                      }
                      for(; isConsonant(p); p++){}    /* C of the one VC pair */
                      if( *p=='\0' ){
                        return 0;                     /* no vowel before it: m==0 */
                      }
                      for(; isVowel(p); p++){}        /* V of the one VC pair */
                      if( *p=='\0' ){
                        return 1;
                      }
                      for(; isConsonant(p); p++){}    /* optional leading [C] */
                      return *p=='\0';                /* more text would mean m>1 */
                    }
                    206: 
                    /*
                    ** Like m_gt_0() except that this returns true when m>1 rather than
                    ** when m>0.  z[] is in reverse order.
                    */
                    static int m_gt_1(const char *z){
                      const char *p = z;
                      for(; isVowel(p); p++){}        /* optional trailing [V] */
                      if( *p=='\0' ){
                        return 0;
                      }
                      for(; isConsonant(p); p++){}    /* C of the last VC pair */
                      if( *p=='\0' ){
                        return 0;
                      }
                      for(; isVowel(p); p++){}        /* V of the last VC pair */
                      if( *p=='\0' ){
                        return 0;
                      }
                      for(; isConsonant(p); p++){}    /* C of the pair before it */
                      return *p!='\0';                /* a vowel remains: second pair exists */
                    }
                    220: 
                    /*
                    ** Return TRUE if there is a vowel anywhere in the NUL-terminated
                    ** string z[].  Consonants are skipped; if anything is left, it starts
                    ** with a vowel.
                    */
                    static int hasVowel(const char *z){
                      const char *p = z;
                      for(; isConsonant(p); p++){}
                      return *p!='\0';
                    }
                    228: 
                    /*
                    ** Return TRUE if the word ends in a double consonant, that is, the
                    ** same consonant twice.
                    **
                    ** The text is reversed, so the test is applied to the first two
                    ** characters of z[].
                    */
                    static int doubleConsonant(const char *z){
                      if( !isConsonant(z) ){
                        return 0;
                      }
                      if( z[0]!=z[1] ){
                        return 0;
                      }
                      return isConsonant(z+1)!=0;
                    }
                    238: 
                    /*
                    ** Return TRUE if the word ends with three letters that are
                    ** consonant-vowel-consonant and the final consonant is not 'w', 'x',
                    ** or 'y'.
                    **
                    ** The word is reversed, so the first three letters of z[] are checked
                    ** and it is z[0] that must not be one of [wxy].
                    */
                    static int star_oh(const char *z){
                      if( z[0]==0 || !isConsonant(z) ){
                        return 0;
                      }
                      if( z[0]=='w' || z[0]=='x' || z[0]=='y' ){
                        return 0;
                      }
                      if( z[1]==0 || !isVowel(z+1) ){
                        return 0;
                      }
                      if( z[2]==0 ){
                        return 0;
                      }
                      return isConsonant(z+2)!=0;
                    }
                    254: 
                    /*
                    ** If the word ends with zFrom and xCond() is true for the part of the
                    ** word that precedes the zFrom ending, replace the ending with zTo.
                    ** A NULL xCond means "no condition".
                    **
                    ** The word *pz and zFrom are both in reverse order; zTo is in normal
                    ** order.  The replacement is written backwards in front of the
                    ** remaining stem and *pz is updated to point at the new start, so the
                    ** buffer needs free space before *pz whenever zTo is longer than zFrom.
                    **
                    ** Return TRUE if zFrom matches and FALSE if it does not.  Note that
                    ** TRUE is returned even when xCond() fails and nothing is changed.
                    */
                    static int stem(
                      char **pz,             /* The word being stemmed (Reversed) */
                      const char *zFrom,     /* If the ending matches this... (Reversed) */
                      const char *zTo,       /* ... change the ending to this (not reversed) */
                      int (*xCond)(const char*)   /* Condition that must be true */
                    ){
                      char *zWord = *pz;
                      const char *zSuffix;

                      /* Every byte of zFrom must match the start of the reversed word */
                      for(zSuffix=zFrom; *zSuffix!=0; zSuffix++, zWord++){
                        if( *zSuffix!=*zWord ){
                          return 0;
                        }
                      }

                      /* zWord now points at the stem that is left once the ending is removed */
                      if( xCond!=0 && xCond(zWord)==0 ){
                        return 1;
                      }

                      /* Prepend zTo, one byte at a time, which reverses it */
                      for(; *zTo!=0; zTo++){
                        zWord--;
                        *zWord = *zTo;
                      }
                      *pz = zWord;
                      return 1;
                    }
                    283: 
                    /*
                    ** Fallback stemmer, used when the Porter stemmer is inappropriate.
                    ** The input word zIn[0..nIn-1] is copied into zOut with US-ASCII case
                    ** folding and a '\0' terminator, and the output length (not counting
                    ** the terminator) is stored in *pnOut.
                    **
                    ** Over-long words are shortened by keeping bytes from each end and
                    ** dropping the middle: a word with no digits that is longer than 20
                    ** bytes keeps its first 10 and last 10 bytes; a word containing a
                    ** digit that is longer than 6 bytes keeps its first 3 and last 3.
                    **
                    ** zOut must have room for nIn+1 bytes.
                    */
                    static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                      int sawDigit = 0;
                      int nKeep;
                      int nOut;

                      /* Case-fold the whole word into zOut, noting any digit on the way */
                      for(nOut=0; nOut<nIn; nOut++){
                        char ch = zIn[nOut];
                        if( ch>='0' && ch<='9' ){
                          sawDigit = 1;
                        }
                        zOut[nOut] = (ch>='A' && ch<='Z') ? (char)(ch - 'A' + 'a') : ch;
                      }

                      nKeep = sawDigit ? 3 : 10;
                      if( nIn>2*nKeep ){
                        /* Slide the last nKeep bytes down so they follow the first nKeep.
                        ** The source is always above the destination, so a forward copy
                        ** is safe even when the two ranges overlap. */
                        const char *zSrc = &zOut[nIn-nKeep];
                        char *zDst = &zOut[nKeep];
                        int k;
                        for(k=0; k<nKeep; k++){
                          zDst[k] = zSrc[k];
                        }
                        nOut = 2*nKeep;
                      }

                      zOut[nOut] = 0;
                      *pnOut = nOut;
                    }
                    314: 
                    315: 
                    316: /*
                    317: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
                    318: ** zOut must have room for nIn bytes plus a '\0' terminator.  Write the
                    319: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
                    320: **
                    321: ** Any upper-case characters in the US-ASCII character set ([A-Z])
                    322: ** are converted to lower case.  Upper-case UTF characters are
                    323: ** unchanged.
                    324: **
                    325: ** Words that are longer than about 20 bytes are stemmed by retaining
                    326: ** a few bytes from the beginning and the end of the word.  If the
                    327: ** word contains digits, 3 bytes are taken from the beginning and
                    328: ** 3 bytes from the end.  For long words without digits, 10 bytes
                    329: ** are taken from each end.  US-ASCII case folding still applies.
                    330: ** 
                    331: ** If the input word contains any character that is not in [a-zA-Z]
                    332: ** (digits included), or is shorter than 3 bytes, then no stemming is
                    333: ** attempted and this routine just copies the input into the output
                    334: ** with US-ASCII case folding (see copy_stemmer()).
                    335: **
                    336: ** Stemming never increases the length of the word.  So there is
                    337: ** no chance of overflowing the zOut buffer.
                    338: */
                    339: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                    340:   int i, j, c;
                           /* Working copy of the word, stored back-to-front.  The last five
                           ** bytes are zeroed below to terminate and pad the word, and the
                           ** unused low bytes give stem() room to prepend a replacement. */
                    341:   char zReverse[28];
                    342:   char *z, *z2;
                           /* sizeof(zReverse)-7 is 21: only words of 3..20 bytes are stemmed */
                    343:   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
                    344:     /* The word is too big or too small for the porter stemmer.
                    345:     ** Fallback to the copy stemmer */
                    346:     copy_stemmer(zIn, nIn, zOut, pnOut);
                    347:     return;
                    348:   }
                           /* Copy the word into zReverse in reverse: zIn[0] goes to index 22
                           ** and each later byte one slot lower.  A-Z is folded to a-z. */
                    349:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
                    350:     c = zIn[i];
                    351:     if( c>='A' && c<='Z' ){
                    352:       zReverse[j] = c + 'a' - 'A';
                    353:     }else if( c>='a' && c<='z' ){
                    354:       zReverse[j] = c;
                    355:     }else{
                    356:       /* The use of a character not in [a-zA-Z] means that we fallback
                    357:       ** to the copy stemmer */
                    358:       copy_stemmer(zIn, nIn, zOut, pnOut);
                    359:       return;
                    360:     }
                    361:   }
                           /* Zero indexes 23..27.  This terminates the reversed word and leaves
                           ** extra NUL bytes behind it; the steps below read up to z[3]
                           ** without first checking the word length. */
                    362:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
                           /* j stopped one below the last byte written, so z now points at the
                           ** final letter of the input, which is the first letter of the
                           ** reversed word. */
                    363:   z = &zReverse[j+1];
                    364: 
                    365: 
                           /* From here on every suffix literal passed to stem() is spelled
                           ** backwards, because z[] is reversed. */
                    366:   /* Step 1a */
                           /* "sses"->"ss", "ies"->"i", "ss" stays; any other trailing "s" is
                           ** dropped by advancing z. */
                    367:   if( z[0]=='s' ){
                    368:     if(
                    369:      !stem(&z, "sess", "ss", 0) &&
                    370:      !stem(&z, "sei", "i", 0)  &&
                    371:      !stem(&z, "ss", "ss", 0)
                    372:     ){
                    373:       z++;
                    374:     }
                    375:   }
                    376: 
                    377:   /* Step 1b */  
                           /* z2 remembers where z was.  stem() returns true on a suffix match
                           ** even if the condition failed, so z!=z2 is what shows that "ing"
                           ** or "ed" was really removed. */
                    378:   z2 = z;
                    379:   if( stem(&z, "dee", "ee", m_gt_0) ){
                    380:     /* Do nothing.  The work was all in the test */
                    381:   }else if( 
                    382:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
                    383:       && z!=z2
                    384:   ){
                             /* Tidy up after removing "ing"/"ed": restore a final 'e' for
                             ** at/bl/iz, drop one letter of a double consonant (except l, s,
                             ** z), or append 'e' to a short consonant-vowel-consonant stem. */
                    385:      if( stem(&z, "ta", "ate", 0) ||
                    386:          stem(&z, "lb", "ble", 0) ||
                    387:          stem(&z, "zi", "ize", 0) ){
                    388:        /* Do nothing.  The work was all in the test */
                    389:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
                    390:        z++;
                    391:      }else if( m_eq_1(z) && star_oh(z) ){
                    392:        *(--z) = 'e';
                    393:      }
                    394:   }
                    395: 
                    396:   /* Step 1c */
                           /* A final 'y' becomes 'i' when the rest of the word has a vowel */
                    397:   if( z[0]=='y' && hasVowel(z+1) ){
                    398:     z[0] = 'i';
                    399:   }
                    400: 
                    401:   /* Step 2 */
                           /* Dispatch on the second-to-last letter (z[1]) so that only the
                           ** suffixes that can match are tried.  Within each case the first
                           ** stem() call whose suffix matches ends the || chain. */
                    402:   switch( z[1] ){
                    403:    case 'a':
                    404:      stem(&z, "lanoita", "ate", m_gt_0) ||
                    405:      stem(&z, "lanoit", "tion", m_gt_0);
                    406:      break;
                    407:    case 'c':
                    408:      stem(&z, "icne", "ence", m_gt_0) ||
                    409:      stem(&z, "icna", "ance", m_gt_0);
                    410:      break;
                    411:    case 'e':
                    412:      stem(&z, "rezi", "ize", m_gt_0);
                    413:      break;
                    414:    case 'g':
                    415:      stem(&z, "igol", "log", m_gt_0);
                    416:      break;
                    417:    case 'l':
                    418:      stem(&z, "ilb", "ble", m_gt_0) ||
                    419:      stem(&z, "illa", "al", m_gt_0) ||
                    420:      stem(&z, "iltne", "ent", m_gt_0) ||
                    421:      stem(&z, "ile", "e", m_gt_0) ||
                    422:      stem(&z, "ilsuo", "ous", m_gt_0);
                    423:      break;
                    424:    case 'o':
                    425:      stem(&z, "noitazi", "ize", m_gt_0) ||
                    426:      stem(&z, "noita", "ate", m_gt_0) ||
                    427:      stem(&z, "rota", "ate", m_gt_0);
                    428:      break;
                    429:    case 's':
                    430:      stem(&z, "msila", "al", m_gt_0) ||
                    431:      stem(&z, "ssenevi", "ive", m_gt_0) ||
                    432:      stem(&z, "ssenluf", "ful", m_gt_0) ||
                    433:      stem(&z, "ssensuo", "ous", m_gt_0);
                    434:      break;
                    435:    case 't':
                    436:      stem(&z, "itila", "al", m_gt_0) ||
                    437:      stem(&z, "itivi", "ive", m_gt_0) ||
                    438:      stem(&z, "itilib", "ble", m_gt_0);
                    439:      break;
                    440:   }
                    441: 
                    442:   /* Step 3 */
                           /* Same idea as step 2, but dispatching on the last letter (z[0]) */
                    443:   switch( z[0] ){
                    444:    case 'e':
                    445:      stem(&z, "etaci", "ic", m_gt_0) ||
                    446:      stem(&z, "evita", "", m_gt_0)   ||
                    447:      stem(&z, "ezila", "al", m_gt_0);
                    448:      break;
                    449:    case 'i':
                    450:      stem(&z, "itici", "ic", m_gt_0);
                    451:      break;
                    452:    case 'l':
                    453:      stem(&z, "laci", "ic", m_gt_0) ||
                    454:      stem(&z, "luf", "", m_gt_0);
                    455:      break;
                    456:    case 's':
                    457:      stem(&z, "ssen", "", m_gt_0);
                    458:      break;
                    459:   }
                    460: 
                    461:   /* Step 4 */
                           /* Remove a suffix outright when the remaining stem has m>1.  Short
                           ** suffixes are matched by comparing letters directly and removed
                           ** by advancing z past them. */
                    462:   switch( z[1] ){
                    463:    case 'a':
                    464:      if( z[0]=='l' && m_gt_1(z+2) ){
                    465:        z += 2;
                    466:      }
                    467:      break;
                    468:    case 'c':
                    469:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
                    470:        z += 4;
                    471:      }
                    472:      break;
                    473:    case 'e':
                    474:      if( z[0]=='r' && m_gt_1(z+2) ){
                    475:        z += 2;
                    476:      }
                    477:      break;
                    478:    case 'i':
                    479:      if( z[0]=='c' && m_gt_1(z+2) ){
                    480:        z += 2;
                    481:      }
                    482:      break;
                    483:    case 'l':
                    484:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
                    485:        z += 4;
                    486:      }
                    487:      break;
                    488:    case 'n':
                    489:      if( z[0]=='t' ){
                    490:        if( z[2]=='a' ){
                    491:          if( m_gt_1(z+3) ){
                    492:            z += 3;
                    493:          }
                    494:        }else if( z[2]=='e' ){
                    495:          stem(&z, "tneme", "", m_gt_1) ||
                    496:          stem(&z, "tnem", "", m_gt_1) ||
                    497:          stem(&z, "tne", "", m_gt_1);
                    498:        }
                    499:      }
                    500:      break;
                    501:    case 'o':
                    502:      if( z[0]=='u' ){
                    503:        if( m_gt_1(z+2) ){
                    504:          z += 2;
                    505:        }
                    506:      }else if( z[3]=='s' || z[3]=='t' ){
                               /* "ion" is removed only when preceded by 's' or 't' */
                    507:        stem(&z, "noi", "", m_gt_1);
                    508:      }
                    509:      break;
                    510:    case 's':
                    511:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
                    512:        z += 3;
                    513:      }
                    514:      break;
                    515:    case 't':
                    516:      stem(&z, "eta", "", m_gt_1) ||
                    517:      stem(&z, "iti", "", m_gt_1);
                    518:      break;
                    519:    case 'u':
                    520:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
                    521:        z += 3;
                    522:      }
                    523:      break;
                    524:    case 'v':
                    525:    case 'z':
                    526:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
                    527:        z += 3;
                    528:      }
                    529:      break;
                    530:   }
                    531: 
                    532:   /* Step 5a */
                           /* Drop a final 'e' if the stem has m>1, or has m==1 and does not
                           ** end consonant-vowel-consonant (see star_oh()). */
                    533:   if( z[0]=='e' ){
                    534:     if( m_gt_1(z+1) ){
                    535:       z++;
                    536:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
                    537:       z++;
                    538:     }
                    539:   }
                    540: 
                    541:   /* Step 5b */
                           /* A final "ll" is reduced to "l" when m>1 */
                    542:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
                    543:     z++;
                    544:   }
                    545: 
                    546:   /* z[] is now the stemmed word in reverse order.  Flip it back
                    547:   ** around into forward order and return.
                    548:   */
                    549:   *pnOut = i = strlen(z);
                    550:   zOut[i] = 0;
                           /* Fill zOut from the back while reading z from the front */
                    551:   while( *z ){
                    552:     zOut[--i] = *(z++);
                    553:   }
                    554: }
                    555: 
                    /*
                    ** Characters that can be part of a token.  Any byte with the high bit
                    ** set (0x80 or above, i.e. part of a multi-byte UTF character) is
                    ** always treated as a token character, so delimiters all have values
                    ** of 0x7f or lower.
                    **
                    ** porterIdChar[] covers 0x30..0x7f and is indexed by (byte - 0x30); a
                    ** 1 marks a token character.  Everything below 0x30 is a delimiter.
                    **
                    ** isDelim(C) assigns C to an int variable named "ch", which must be
                    ** declared in the scope where the macro is used.
                    */
                    static const char porterIdChar[] = {
                      /* 0x30..0x3f:  0-9 are token characters,  : ; < = > ?  are not */
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                      /* 0x40..0x4f:  @ is a delimiter, then A-O */
                      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      /* 0x50..0x5f:  P-Z, then  [ \ ] ^  are delimiters, _ is not */
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
                      /* 0x60..0x6f:  ` is a delimiter, then a-o */
                      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      /* 0x70..0x7f:  p-z, then  { | } ~ DEL  are delimiters */
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
                    };
                    #define isDelim(C) \
                      ( ((ch=(C))&0x80)==0 && (ch<0x30 || porterIdChar[ch-0x30]==0) )
                    571: 
                    572: /*
                    573: ** Extract the next token from a tokenization cursor.  The cursor must
                    574: ** have been opened by a prior call to porterOpen().
                    575: */
                    576: static int porterNext(
                    577:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
                    578:   const char **pzToken,               /* OUT: *pzToken is the token text */
                    579:   int *pnBytes,                       /* OUT: Number of bytes in token */
                    580:   int *piStartOffset,                 /* OUT: Starting offset of token */
                    581:   int *piEndOffset,                   /* OUT: Ending offset of token */
                    582:   int *piPosition                     /* OUT: Position integer of token */
                    583: ){
                    584:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    585:   const char *z = c->zInput;
                    586: 
                    587:   while( c->iOffset<c->nInput ){
                    588:     int iStartOffset, ch;
                    589: 
                    590:     /* Scan past delimiter characters */
                    591:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
                    592:       c->iOffset++;
                    593:     }
                    594: 
                    595:     /* Count non-delimiter characters. */
                    596:     iStartOffset = c->iOffset;
                    597:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
                    598:       c->iOffset++;
                    599:     }
                    600: 
                    601:     if( c->iOffset>iStartOffset ){
                    602:       int n = c->iOffset-iStartOffset;
                    603:       if( n>c->nAllocated ){
                    604:         c->nAllocated = n+20;
                    605:         c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
                    606:         if( c->zToken==NULL ) return SQLITE_NOMEM;
                    607:       }
                    608:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
                    609:       *pzToken = c->zToken;
                    610:       *piStartOffset = iStartOffset;
                    611:       *piEndOffset = c->iOffset;
                    612:       *piPosition = c->iToken++;
                    613:       return SQLITE_OK;
                    614:     }
                    615:   }
                    616:   return SQLITE_DONE;
                    617: }
                    618: 
                    619: /*
                    620: ** The set of routines that implement the porter-stemmer tokenizer
                    621: */
                    622: static const sqlite3_tokenizer_module porterTokenizerModule = {
                    623:   0,
                    624:   porterCreate,
                    625:   porterDestroy,
                    626:   porterOpen,
                    627:   porterClose,
                    628:   porterNext,
                    629: };
                    630: 
                    631: /*
                    632: ** Allocate a new porter tokenizer.  Return a pointer to the new
                    633: ** tokenizer in *ppModule
                    634: */
                    635: void sqlite3Fts2PorterTokenizerModule(
                    636:   sqlite3_tokenizer_module const**ppModule
                    637: ){
                    638:   *ppModule = &porterTokenizerModule;
                    639: }
                    640: 
                    641: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>