embedaddon/sqlite3/ext/fts1/fts1_porter.c - annotate

Return to fts1_porter.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1
Annotation of embedaddon/sqlite3/ext/fts1/fts1_porter.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2006 September 30
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: *************************************************************************
                     12: ** Implementation of the full-text-search tokenizer that implements
                     13: ** a Porter stemmer.
                     14: */
                     15: 
                     16: /*
                     17: ** The code in this file is only compiled if:
                     18: **
                     19: **     * The FTS1 module is being built as an extension
                     20: **       (in which case SQLITE_CORE is not defined), or
                     21: **
                     22: **     * The FTS1 module is being built into the core of
                     23: **       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
                     24: */
                     25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
                     26: 
                     27: 
                     28: #include <assert.h>
                     29: #include <stdlib.h>
                     30: #include <stdio.h>
                     31: #include <string.h>
                     32: #include <ctype.h>
                     33: 
                     34: #include "fts1_tokenizer.h"
                     35: 
                     36: /*
                     37: ** Class derived from sqlite3_tokenizer
                         **
                         ** The Porter tokenizer keeps no state of its own: the only member is
                         ** the embedded base object, whose address porterCreate() hands back
                         ** to the caller and porterDestroy() later frees.
                     38: */
                     39: typedef struct porter_tokenizer {
                     40:   sqlite3_tokenizer base;      /* Base class */
                     41: } porter_tokenizer;
                     42: 
                     43: /*
                     44: ** Class derived from sqlite3_tokenizer_cursor
                         **
                         ** All fields other than "base" are initialized by porterOpen().
                         ** zInput is borrowed from the caller (it is never freed here), while
                         ** zToken is owned by the cursor: porterNext() grows it on demand and
                         ** porterClose() frees it.
                     45: */
                     46: typedef struct porter_tokenizer_cursor {
                     47:   sqlite3_tokenizer_cursor base;
                     48:   const char *zInput;          /* input we are tokenizing */
                     49:   int nInput;                  /* size of the input */
                     50:   int iOffset;                 /* current position in zInput */
                     51:   int iToken;                  /* index of next token to be returned */
                     52:   char *zToken;                /* storage for current token */
                     53:   int nAllocated;              /* space allocated to zToken buffer */
                     54: } porter_tokenizer_cursor;
                     55: 
                     56: 
                     57: /* Forward declaration.  The module object itself is defined near the
                         ** end of this file, after the functions that it points to. */
                     58: static const sqlite3_tokenizer_module porterTokenizerModule;
                     59: 
                     60: 
                     61: /*
                     62: ** Create a new tokenizer instance.
                         **
                         ** argc and argv are accepted but not examined.  On success a
                         ** zero-filled porter_tokenizer is allocated, a pointer to its base
                         ** object is written to *ppTokenizer and SQLITE_OK is returned.  If
                         ** the allocation fails, SQLITE_NOMEM is returned and *ppTokenizer is
                         ** left untouched.
                     63: */
                     64: static int porterCreate(
                     65:   int argc, const char * const *argv,
                     66:   sqlite3_tokenizer **ppTokenizer
                     67: ){
                     68:   porter_tokenizer *t;
                     69:   t = (porter_tokenizer *) calloc(sizeof(*t), 1);
                     70:   if( t==NULL ) return SQLITE_NOMEM;
                     71: 
                     72:   *ppTokenizer = &t->base;
                     73:   return SQLITE_OK;
                     74: }
                     75: 
                     76: /*
                     77: ** Destroy a tokenizer
                         **
                         ** pTokenizer is passed straight to free(), so it must be a pointer
                         ** obtained from porterCreate() (the base object is the first member
                         ** of the allocation).  Always returns SQLITE_OK.
                     78: */
                     79: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
                     80:   free(pTokenizer);
                     81:   return SQLITE_OK;
                     82: }
                     83: 
                     84: /*
                     85: ** Prepare to begin tokenizing a particular string.  The input
                     86: ** string to be tokenized is zInput[0..nInput-1].  A cursor
                     87: ** used to incrementally tokenize this string is returned in
                     88: ** *ppCursor.
                         **
                         ** A NULL zInput is treated as an empty string.  A negative nInput
                         ** means zInput is NUL-terminated and its length is measured with
                         ** strlen().  The input is not copied; the cursor only stores the
                         ** pointer.  pTokenizer is not used.
                         **
                         ** Returns SQLITE_OK, or SQLITE_NOMEM if the cursor cannot be
                         ** allocated.
                     89: */
                     90: static int porterOpen(
                     91:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
                     92:   const char *zInput, int nInput,        /* String to be tokenized */
                     93:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
                     94: ){
                     95:   porter_tokenizer_cursor *c;
                     96: 
                         /* The cursor comes from malloc(), so c->base is left uninitialized
                         ** by this routine; only the fields below are set. */
                     97:   c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
                     98:   if( c==NULL ) return SQLITE_NOMEM;
                     99: 
                    100:   c->zInput = zInput;
                    101:   if( zInput==0 ){
                    102:     c->nInput = 0;
                    103:   }else if( nInput<0 ){
                    104:     c->nInput = (int)strlen(zInput);
                    105:   }else{
                    106:     c->nInput = nInput;
                    107:   }
                    108:   c->iOffset = 0;                 /* start tokenizing at the beginning */
                    109:   c->iToken = 0;
                    110:   c->zToken = NULL;               /* no space allocated, yet. */
                    111:   c->nAllocated = 0;
                    112: 
                    113:   *ppCursor = &c->base;
                    114:   return SQLITE_OK;
                    115: }
                    116: 
                    117: /*
                    118: ** Close a tokenization cursor previously opened by a call to
                    119: ** porterOpen() above.
                         **
                         ** Frees the token buffer (which may still be NULL if no token was
                         ** ever produced) and then the cursor itself.  The input string is
                         ** not freed.  Always returns SQLITE_OK.
                    120: */
                    121: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
                    122:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    123:   free(c->zToken);
                    124:   free(c);
                    125:   return SQLITE_OK;
                    126: }
                    127: /*
                    128: ** Vowel or consonant
                         **
                         ** Indexed by (letter - 'a') for the lower-case letters 'a'..'z':
                         **
                         **     0   the letter is a vowel ('a', 'e', 'i', 'o', 'u')
                         **     1   the letter is a consonant
                         **     2   the letter is 'y', whose class depends on its neighbor
                    129: */
                    130: static const char cType[] = {
                    131:    0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
                    132:    1, 1, 1, 2, 1
                    133: };
                    134: 
                    135: /*
                    136: ** isConsonant() and isVowel() determine if the first character in
                    137: ** the string they point to is a consonant or a vowel, according
                    138: ** to Porter rules.
                    139: **
                    140: ** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
                    141: ** 'Y' is a consonant unless it follows another consonant,
                    142: ** in which case it is a vowel.
                    143: **
                    144: ** In these routines, the letters are in reverse order.  So the 'y' rule
                    145: ** is that 'y' is a consonant unless it is followed by another
                    146: ** consonant.
                         **
                         ** Both routines return 0 when z points at the NUL terminator, so a
                         ** scanning loop built on either of them stops at the end of the word.
                         ** The input must consist of lower-case letters 'a'..'z' only.
                    147: */
                    148: static int isVowel(const char*);
                    149: static int isConsonant(const char *z){
                    150:   int j;
                    151:   char x = *z;
                    152:   if( x==0 ) return 0;
                    153:   assert( x>='a' && x<='z' );
                    154:   j = cType[x-'a'];
                    155:   if( j<2 ) return j;
                         /* 'y': a consonant when it is the first letter of the word (nothing
                         ** follows it in the reversed text) or when the letter before it in
                         ** the word is a vowel. */
                    156:   return z[1]==0 || isVowel(z + 1);
                    157: }
                    158: static int isVowel(const char *z){
                         /* Counterpart of isConsonant(): returns 1 if the first character of
                         ** the reversed text z is a vowel, 0 if it is a consonant or if z
                         ** points at the NUL terminator. */
                    159:   int j;
                    160:   char x = *z;
                    161:   if( x==0 ) return 0;
                    162:   assert( x>='a' && x<='z' );
                    163:   j = cType[x-'a'];
                    164:   if( j<2 ) return 1-j;
                         /* 'y' is a vowel exactly when the letter before it in the word
                         ** (the next one in the reversed text) is a consonant. */
                    165:   return isConsonant(z + 1);
                    166: }
                    167: 
                    168: /*
                    169: ** Let any sequence of one or more vowels be represented by V and let
                    170: ** C be sequence of one or more consonants.  Then every word can be
                    171: ** represented as:
                    172: **
                    173: **           [C] (VC){m} [V]
                    174: **
                    175: ** In prose:  A word is an optional consonant followed by zero or
                    176: ** more vowel-consonant pairs followed by an optional vowel.  "m" is
                    177: ** the number of vowel consonant pairs.  This routine does not compute
                    178: ** m itself; it only tests whether m is at least 1.
                    179: **
                    180: ** Return true if the m-value for z is 1 or more.  In other words,
                    181: ** return true if z contains at least one vowel that is followed
                    182: ** by a consonant.
                    183: **
                    184: ** In this routine z[] is in reverse order.  So we are really looking
                    185: ** for an instance of a consonant followed by a vowel.
                    186: */
                    187: static int m_gt_0(const char *z){
                    188:   while( isVowel(z) ){ z++; }
                    189:   if( *z==0 ) return 0;
                    190:   while( isConsonant(z) ){ z++; }
                    191:   return *z!=0;
                    192: }
                    193: 
                    194: /* Like m_gt_0() above except we are looking for a value of m which is
                    195: ** exactly 1.  z[] is in reverse order: after skipping trailing vowels
                         ** there must be exactly one run of consonants followed by one run of
                         ** vowels, optionally followed by a leading run of consonants.
                    196: */
                    197: static int m_eq_1(const char *z){
                    198:   while( isVowel(z) ){ z++; }
                    199:   if( *z==0 ) return 0;
                    200:   while( isConsonant(z) ){ z++; }
                    201:   if( *z==0 ) return 0;
                    202:   while( isVowel(z) ){ z++; }
                    203:   if( *z==0 ) return 1;
                    204:   while( isConsonant(z) ){ z++; }
                    205:   return *z==0;
                    206: }
                    207: 
                    208: /* Like m_gt_0() above except we are looking for a value of m>1 instead
                    209: ** of m>0.  Returns true only if something is left of the reversed
                         ** text after two consonant runs have been consumed.
                    210: */
                    211: static int m_gt_1(const char *z){
                    212:   while( isVowel(z) ){ z++; }
                    213:   if( *z==0 ) return 0;
                    214:   while( isConsonant(z) ){ z++; }
                    215:   if( *z==0 ) return 0;
                    216:   while( isVowel(z) ){ z++; }
                    217:   if( *z==0 ) return 0;
                    218:   while( isConsonant(z) ){ z++; }
                    219:   return *z!=0;
                    220: }
                    221: 
                    222: /*
                    223: ** Return TRUE if there is a vowel anywhere within the NUL-terminated
                         ** (reversed) text z.  The scan skips consonants; since isConsonant()
                         ** returns 0 at the terminator, it stops at the first vowel or at the
                         ** end of the text.
                    224: */
                    225: static int hasVowel(const char *z){
                    226:   while( isConsonant(z) ){ z++; }
                    227:   return *z!=0;
                    228: }
                    229: 
                    230: /*
                    231: ** Return TRUE if the word ends in a double consonant, that is, the
                         ** same consonant letter twice in a row.
                    232: **
                    233: ** The text is reversed here. So we are really looking at
                    234: ** the first two characters of z[].
                    235: */
                    236: static int doubleConsonant(const char *z){
                    237:   return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
                    238: }
                    239: 
                    240: /*
                    241: ** Return TRUE if the word ends with three letters which
                    242: ** are consonant-vowel-consonant and where the final consonant
                    243: ** is not 'w', 'x', or 'y'.
                    244: **
                    245: ** The word is reversed here.  So we are really checking the
                    246: ** first three letters and the first one cannot be in [wxy].
                         ** Each position is checked for the NUL terminator before it is
                         ** classified, so words shorter than three letters yield FALSE.
                    247: */
                    248: static int star_oh(const char *z){
                    249:   return
                    250:     z[0]!=0 && isConsonant(z) &&
                    251:     z[0]!='w' && z[0]!='x' && z[0]!='y' &&
                    252:     z[1]!=0 && isVowel(z+1) &&
                    253:     z[2]!=0 && isConsonant(z+2);
                    254: }
                    255: 
                    256: /*
                    257: ** If the word ends with zFrom and xCond() is true for the stem
                    258: ** of the word that precedes the zFrom ending, then change the
                    259: ** ending to zTo.
                    260: **
                    261: ** The input word *pz and zFrom are both in reverse order.  zTo
                    262: ** is in normal order.
                    263: **
                    264: ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
                    265: ** match.  Note that TRUE is returned even if xCond() fails and
                    266: ** no substitution occurs.
                         **
                         ** xCond may be NULL, in which case the substitution is unconditional.
                         ** On substitution *pz is moved to the new start of the reversed word;
                         ** otherwise *pz is left unchanged.
                    267: */
                    268: static int stem(
                    269:   char **pz,             /* The word being stemmed (Reversed) */
                    270:   const char *zFrom,     /* If the ending matches this... (Reversed) */
                    271:   const char *zTo,       /* ... change the ending to this (not reversed) */
                    272:   int (*xCond)(const char*)   /* Condition that must be true */
                    273: ){
                    274:   char *z = *pz;
                    275:   while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
                    276:   if( *zFrom!=0 ) return 0;
                    277:   if( xCond && !xCond(z) ) return 1;
                         /* z now points just past the matched ending.  Writing zTo front to
                         ** back while stepping z downward stores it reversed, in front of
                         ** the remaining stem. */
                    278:   while( *zTo ){
                    279:     *(--z) = *(zTo++);
                    280:   }
                    281:   *pz = z;
                    282:   return 1;
                    283: }
                    284: 
                    285: /*
                    286: ** This is the fallback stemmer used when the porter stemmer is
                    287: ** inappropriate.  The input word is copied into the output with
                    288: ** US-ASCII case folding.  If the input word is too long (more
                    289: ** than 20 bytes if it contains no digits or more than 6 bytes if
                    290: ** it contains digits) then word is truncated to 20 or 6 bytes
                    291: ** by taking 10 or 3 bytes from the beginning and end.
                         **
                         ** The whole input is copied to zOut before any truncation and a NUL
                         ** terminator is appended, so zOut must have room for nIn+1 bytes.
                         ** The output length (excluding the terminator) is stored in *pnOut.
                    292: */
                    293: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                    294:   int i, mx, j;
                    295:   int hasDigit = 0;
                    296:   for(i=0; i<nIn; i++){
                    297:     int c = zIn[i];
                    298:     if( c>='A' && c<='Z' ){
                    299:       zOut[i] = c - 'A' + 'a';
                    300:     }else{
                    301:       if( c>='0' && c<='9' ) hasDigit = 1;
                    302:       zOut[i] = c;
                    303:     }
                    304:   }
                    305:   mx = hasDigit ? 3 : 10;
                    306:   if( nIn>mx*2 ){
                         /* Keep the first mx bytes in place and move the last mx bytes
                         ** down so that they follow immediately. */
                    307:     for(j=mx, i=nIn-mx; i<nIn; i++, j++){
                    308:       zOut[j] = zOut[i];
                    309:     }
                    310:     i = j;
                    311:   }
                    312:   zOut[i] = 0;
                    313:   *pnOut = i;
                    314: }
                    315: 
                    316: 
                    317: /*
                    318: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
                    319: ** zOut must be big enough to hold nIn bytes plus a '\0' terminator.
                    320: ** Write the actual size of the output word (exclusive of the '\0'
                         ** terminator) into *pnOut.
                    321: **
                    322: ** Any upper-case characters in the US-ASCII character set ([A-Z])
                    323: ** are converted to lower case.  Upper-case UTF characters are
                    324: ** unchanged.
                    325: **
                    326: ** Words that are longer than about 20 bytes are stemmed by retaining
                    327: ** a few bytes from the beginning and the end of the word.  If the
                    328: ** word contains digits, 3 bytes are taken from the beginning and
                    329: ** 3 bytes from the end.  For long words without digits, 10 bytes
                    330: ** are taken from each end.  US-ASCII case folding still applies.
                    331: **
                    332: ** If the input word contains any character not in [a-zA-Z], or is
                    333: ** shorter than 3 bytes, then no stemming is attempted and this
                    334: ** routine just copies the input into the output with US-ASCII
                    335: ** case folding (see copy_stemmer()).
                    336: **
                    337: ** Stemming never increases the length of the word.  So there is
                    338: ** no chance of overflowing the zOut buffer.
                    339: */
                    340: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
                    341:   int i, j, c;
                    342:   char zReverse[28];
                    343:   char *z, *z2;
                         /* Only words of 3 to 20 bytes go through the Porter rules. */
                    344:   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
                    345:     /* The word is too big or too small for the porter stemmer.
                    346:     ** Fallback to the copy stemmer */
                    347:     copy_stemmer(zIn, nIn, zOut, pnOut);
                    348:     return;
                    349:   }
                         /* Store the word reversed and lower-cased, with its first letter at
                         ** zReverse[22] and later letters at lower indexes.  For the longest
                         ** accepted word (20 bytes) the reversed text starts at zReverse[3],
                         ** which leaves a few bytes below it for rules that lengthen the
                         ** ending. */
                    350:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
                    351:     c = zIn[i];
                    352:     if( c>='A' && c<='Z' ){
                    353:       zReverse[j] = c + 'a' - 'A';
                    354:     }else if( c>='a' && c<='z' ){
                    355:       zReverse[j] = c;
                    356:     }else{
                    357:       /* The use of a character not in [a-zA-Z] means that we fallback
                    358:       ** to the copy stemmer */
                    359:       copy_stemmer(zIn, nIn, zOut, pnOut);
                    360:       return;
                    361:     }
                    362:   }
                         /* Five NUL bytes follow the reversed word, so look-ahead such as
                         ** z[3] or z+4 in the steps below stays inside the buffer. */
                    363:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
                    364:   z = &zReverse[j+1];
                    365: 
                    366: 
                    367:   /* Step 1a */
                    368:   if( z[0]=='s' ){
                    369:     if(
                    370:      !stem(&z, "sess", "ss", 0) &&
                    371:      !stem(&z, "sei", "i", 0)  &&
                    372:      !stem(&z, "ss", "ss", 0)
                    373:     ){
                           /* None of "sses", "ies" or "ss" matched: drop the final "s". */
                    374:       z++;
                    375:     }
                    376:   }
                    377: 
                    378:   /* Step 1b */
                         /* stem() returns true even when its condition fails, so z2 is kept
                         ** to detect whether "ing" or "ed" was really removed. */
                    379:   z2 = z;
                    380:   if( stem(&z, "dee", "ee", m_gt_0) ){
                    381:     /* Do nothing.  The work was all in the test */
                    382:   }else if( 
                    383:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
                    384:       && z!=z2
                    385:   ){
                    386:      if( stem(&z, "ta", "ate", 0) ||
                    387:          stem(&z, "lb", "ble", 0) ||
                    388:          stem(&z, "zi", "ize", 0) ){
                    389:        /* Do nothing.  The work was all in the test */
                    390:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
                    391:        z++;
                    392:      }else if( m_eq_1(z) && star_oh(z) ){
                    393:        *(--z) = 'e';
                    394:      }
                    395:   }
                    396: 
                    397:   /* Step 1c */
                    398:   if( z[0]=='y' && hasVowel(z+1) ){
                    399:     z[0] = 'i';
                    400:   }
                    401: 
                    402:   /* Step 2 */
                         /* Dispatch on the next-to-last letter of the word. */
                    403:   switch( z[1] ){
                    404:    case 'a':
                    405:      stem(&z, "lanoita", "ate", m_gt_0) ||
                    406:      stem(&z, "lanoit", "tion", m_gt_0);
                    407:      break;
                    408:    case 'c':
                    409:      stem(&z, "icne", "ence", m_gt_0) ||
                    410:      stem(&z, "icna", "ance", m_gt_0);
                    411:      break;
                    412:    case 'e':
                    413:      stem(&z, "rezi", "ize", m_gt_0);
                    414:      break;
                    415:    case 'g':
                    416:      stem(&z, "igol", "log", m_gt_0);
                    417:      break;
                    418:    case 'l':
                    419:      stem(&z, "ilb", "ble", m_gt_0) ||
                    420:      stem(&z, "illa", "al", m_gt_0) ||
                    421:      stem(&z, "iltne", "ent", m_gt_0) ||
                    422:      stem(&z, "ile", "e", m_gt_0) ||
                    423:      stem(&z, "ilsuo", "ous", m_gt_0);
                    424:      break;
                    425:    case 'o':
                    426:      stem(&z, "noitazi", "ize", m_gt_0) ||
                    427:      stem(&z, "noita", "ate", m_gt_0) ||
                    428:      stem(&z, "rota", "ate", m_gt_0);
                    429:      break;
                    430:    case 's':
                    431:      stem(&z, "msila", "al", m_gt_0) ||
                    432:      stem(&z, "ssenevi", "ive", m_gt_0) ||
                    433:      stem(&z, "ssenluf", "ful", m_gt_0) ||
                    434:      stem(&z, "ssensuo", "ous", m_gt_0);
                    435:      break;
                    436:    case 't':
                    437:      stem(&z, "itila", "al", m_gt_0) ||
                    438:      stem(&z, "itivi", "ive", m_gt_0) ||
                    439:      stem(&z, "itilib", "ble", m_gt_0);
                    440:      break;
                    441:   }
                    442: 
                    443:   /* Step 3 */
                         /* Dispatch on the last letter of the word. */
                    444:   switch( z[0] ){
                    445:    case 'e':
                    446:      stem(&z, "etaci", "ic", m_gt_0) ||
                    447:      stem(&z, "evita", "", m_gt_0)   ||
                    448:      stem(&z, "ezila", "al", m_gt_0);
                    449:      break;
                    450:    case 'i':
                    451:      stem(&z, "itici", "ic", m_gt_0);
                    452:      break;
                    453:    case 'l':
                    454:      stem(&z, "laci", "ic", m_gt_0) ||
                    455:      stem(&z, "luf", "", m_gt_0);
                    456:      break;
                    457:    case 's':
                    458:      stem(&z, "ssen", "", m_gt_0);
                    459:      break;
                    460:   }
                    461: 
                    462:   /* Step 4 */
                         /* Dispatch on the next-to-last letter again.  Where an ending is
                         ** matched letter by letter, it is removed by advancing z past it. */
                    463:   switch( z[1] ){
                    464:    case 'a':
                    465:      if( z[0]=='l' && m_gt_1(z+2) ){
                    466:        z += 2;
                    467:      }
                    468:      break;
                    469:    case 'c':
                    470:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
                    471:        z += 4;
                    472:      }
                    473:      break;
                    474:    case 'e':
                    475:      if( z[0]=='r' && m_gt_1(z+2) ){
                    476:        z += 2;
                    477:      }
                    478:      break;
                    479:    case 'i':
                    480:      if( z[0]=='c' && m_gt_1(z+2) ){
                    481:        z += 2;
                    482:      }
                    483:      break;
                    484:    case 'l':
                    485:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
                    486:        z += 4;
                    487:      }
                    488:      break;
                    489:    case 'n':
                    490:      if( z[0]=='t' ){
                    491:        if( z[2]=='a' ){
                    492:          if( m_gt_1(z+3) ){
                    493:            z += 3;
                    494:          }
                    495:        }else if( z[2]=='e' ){
                    496:          stem(&z, "tneme", "", m_gt_1) ||
                    497:          stem(&z, "tnem", "", m_gt_1) ||
                    498:          stem(&z, "tne", "", m_gt_1);
                    499:        }
                    500:      }
                    501:      break;
                    502:    case 'o':
                    503:      if( z[0]=='u' ){
                    504:        if( m_gt_1(z+2) ){
                    505:          z += 2;
                    506:        }
                    507:      }else if( z[3]=='s' || z[3]=='t' ){
                    508:        stem(&z, "noi", "", m_gt_1);
                    509:      }
                    510:      break;
                    511:    case 's':
                    512:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
                    513:        z += 3;
                    514:      }
                    515:      break;
                    516:    case 't':
                    517:      stem(&z, "eta", "", m_gt_1) ||
                    518:      stem(&z, "iti", "", m_gt_1);
                    519:      break;
                    520:    case 'u':
                    521:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
                    522:        z += 3;
                    523:      }
                    524:      break;
                    525:    case 'v':
                    526:    case 'z':
                    527:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
                    528:        z += 3;
                    529:      }
                    530:      break;
                    531:   }
                    532: 
                    533:   /* Step 5a */
                    534:   if( z[0]=='e' ){
                    535:     if( m_gt_1(z+1) ){
                    536:       z++;
                    537:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
                    538:       z++;
                    539:     }
                    540:   }
                    541: 
                    542:   /* Step 5b */
                    543:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
                    544:     z++;
                    545:   }
                    546: 
                    547:   /* z[] is now the stemmed word in reverse order.  Flip it back
                    548:   ** around into forward order and return.
                    549:   */
                    550:   *pnOut = i = strlen(z);
                    551:   zOut[i] = 0;
                    552:   while( *z ){
                    553:     zOut[--i] = *(z++);
                    554:   }
                    555: }
                    556: 
                    557: /*
                    558: ** Characters that can be part of a token.  We assume any character
                    559: ** whose value is 0x80 or greater (any byte of a multi-byte UTF
                    560: ** character) can be part of a token.  In other words, delimiters all
                    561: ** must have values of 0x7f or lower.
                         **
                         ** The table covers 0x30 through 0x7f only; everything below 0x30 is
                         ** a delimiter.  Within the table the digits, the ASCII letters and
                         ** '_' are token characters.
                         **
                         ** Both macros assign their argument to a variable named "ch", which
                         ** must be declared (as an integer type) in the scope that uses them.
                    562: */
                    563: static const char isIdChar[] = {
                    564: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
                    565:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
                    566:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
                    567:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
                    568:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
                    569:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
                    570: };
                    571: #define idChar(C)  (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
                    572: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
                    573: 
                    574: /*
                    575: ** Extract the next token from a tokenization cursor.  The cursor must
                    576: ** have been opened by a prior call to porterOpen().
                         **
                         ** Delimiters are skipped, the following run of token characters is
                         ** passed through porter_stemmer(), and the result is returned in the
                         ** cursor's own buffer.  *pzToken therefore stays valid only until
                         ** the next call on this cursor or until porterClose().  The start and
                         ** end offsets refer to the original, unstemmed input.
                         **
                         ** Returns SQLITE_OK when a token was produced, SQLITE_DONE when the
                         ** input is exhausted, or SQLITE_NOMEM if the token buffer cannot be
                         ** enlarged.  After SQLITE_NOMEM the cursor still owns its previous
                         ** buffer and can be closed normally.
                    577: */
                    578: static int porterNext(
                    579:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
                    580:   const char **pzToken,               /* OUT: *pzToken is the token text */
                    581:   int *pnBytes,                       /* OUT: Number of bytes in token */
                    582:   int *piStartOffset,                 /* OUT: Starting offset of token */
                    583:   int *piEndOffset,                   /* OUT: Ending offset of token */
                    584:   int *piPosition                     /* OUT: Position integer of token */
                    585: ){
                    586:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
                    587:   const char *z = c->zInput;
                    588: 
                    589:   while( c->iOffset<c->nInput ){
                    590:     int iStartOffset, ch;
                    591: 
                    592:     /* Scan past delimiter characters */
                    593:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
                    594:       c->iOffset++;
                    595:     }
                    596: 
                    597:     /* Count non-delimiter characters. */
                    598:     iStartOffset = c->iOffset;
                    599:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
                    600:       c->iOffset++;
                    601:     }
                    602: 
                    603:     if( c->iOffset>iStartOffset ){
                    604:       int n = c->iOffset-iStartOffset;
                    605:       if( n>c->nAllocated ){
                                 /* Grow through a temporary so that a failed realloc() neither
                                 ** leaks the old buffer nor leaves nAllocated larger than the
                                 ** buffer really is.  The extra 20 bytes give room for the
                                 ** terminator and reduce the number of reallocations. */
                                 char *zNew = realloc(c->zToken, n+20);
                                 if( zNew==NULL ) return SQLITE_NOMEM;
                                 c->zToken = zNew;
                                 c->nAllocated = n+20;
                    609:       }
                    610:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
                    611:       *pzToken = c->zToken;
                    612:       *piStartOffset = iStartOffset;
                    613:       *piEndOffset = c->iOffset;
                    614:       *piPosition = c->iToken++;
                    615:       return SQLITE_OK;
                    616:     }
                    617:   }
                    618:   return SQLITE_DONE;
                    619: }
                    620: 
                    621: /*
                    622: ** The set of routines that implement the porter-stemmer tokenizer
                         **
                         ** The entries after the leading 0 are, in order, the create, destroy,
                         ** open, close and next methods defined above.
                         ** NOTE(review): the leading 0 is presumably the interface version
                         ** field of sqlite3_tokenizer_module; confirm against
                         ** fts1_tokenizer.h.
                    623: */
                    624: static const sqlite3_tokenizer_module porterTokenizerModule = {
                    625:   0,
                    626:   porterCreate,
                    627:   porterDestroy,
                    628:   porterOpen,
                    629:   porterClose,
                    630:   porterNext,
                    631: };
                    632: 
                    633: /*
                    634: ** Return, in *ppModule, a pointer to the porter tokenizer module.
                    635: ** Nothing is allocated: the pointer refers to the static
                         ** porterTokenizerModule object defined in this file, so the caller
                         ** must not free it.
                    636: */
                    637: void sqlite3Fts1PorterTokenizerModule(
                    638:   sqlite3_tokenizer_module const**ppModule
                    639: ){
                    640:   *ppModule = &porterTokenizerModule;
                    641: }
                    642: 
                    643: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>