embedaddon/sqlite3/ext/fts3/fts3_porter.c - annotate

Return to fts3_porter.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts3
Annotation of embedaddon/sqlite3/ext/fts3/fts3_porter.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2006 September 30
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: *************************************************************************
        !            12: ** Implementation of the full-text-search tokenizer that implements
        !            13: ** a Porter stemmer.
        !            14: */
        !            15: 
        !            16: /*
        !            17: ** The code in this file is only compiled if:
        !            18: **
        !            19: **     * The FTS3 module is being built as an extension
        !            20: **       (in which case SQLITE_CORE is not defined), or
        !            21: **
        !            22: **     * The FTS3 module is being built into the core of
        !            23: **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
        !            24: */
        !            25: #include "fts3Int.h"
        !            26: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
        !            27: 
        !            28: #include <assert.h>
        !            29: #include <stdlib.h>
        !            30: #include <stdio.h>
        !            31: #include <string.h>
        !            32: 
        !            33: #include "fts3_tokenizer.h"
        !            34: 
        !            35: /*
        !            36: ** Class derived from sqlite3_tokenizer
        !            37: */
        !            38: typedef struct porter_tokenizer {
        !            39:   sqlite3_tokenizer base;      /* Base class */
        !            40: } porter_tokenizer;
        !            41: 
        !            42: /*
        !            43: ** Class derived from sqlite3_tokenizer_cursor
        !            44: */
        !            45: typedef struct porter_tokenizer_cursor {
        !            46:   sqlite3_tokenizer_cursor base;
        !            47:   const char *zInput;          /* input we are tokenizing */
        !            48:   int nInput;                  /* size of the input */
        !            49:   int iOffset;                 /* current position in zInput */
        !            50:   int iToken;                  /* index of next token to be returned */
        !            51:   char *zToken;                /* storage for current token */
        !            52:   int nAllocated;              /* space allocated to zToken buffer */
        !            53: } porter_tokenizer_cursor;
        !            54: 
        !            55: 
        !            56: /*
        !            57: ** Create a new tokenizer instance.
        !            58: */
        !            59: static int porterCreate(
        !            60:   int argc, const char * const *argv,
        !            61:   sqlite3_tokenizer **ppTokenizer
        !            62: ){
        !            63:   porter_tokenizer *t;
        !            64: 
        !            65:   UNUSED_PARAMETER(argc);
        !            66:   UNUSED_PARAMETER(argv);
        !            67: 
        !            68:   t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
        !            69:   if( t==NULL ) return SQLITE_NOMEM;
        !            70:   memset(t, 0, sizeof(*t));
        !            71:   *ppTokenizer = &t->base;
        !            72:   return SQLITE_OK;
        !            73: }
        !            74: 
        !            75: /*
        !            76: ** Destroy a tokenizer
        !            77: */
        !            78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
        !            79:   sqlite3_free(pTokenizer);
        !            80:   return SQLITE_OK;
        !            81: }
        !            82: 
        !            83: /*
        !            84: ** Prepare to begin tokenizing a particular string.  The input
        !            85: ** string to be tokenized is zInput[0..nInput-1].  A cursor
        !            86: ** used to incrementally tokenize this string is returned in 
        !            87: ** *ppCursor.
        !            88: */
        !            89: static int porterOpen(
        !            90:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
        !            91:   const char *zInput, int nInput,        /* String to be tokenized */
        !            92:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
        !            93: ){
        !            94:   porter_tokenizer_cursor *c;
        !            95: 
        !            96:   UNUSED_PARAMETER(pTokenizer);
        !            97: 
        !            98:   c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
        !            99:   if( c==NULL ) return SQLITE_NOMEM;
        !           100: 
        !           101:   c->zInput = zInput;
        !           102:   if( zInput==0 ){
        !           103:     c->nInput = 0;
        !           104:   }else if( nInput<0 ){
        !           105:     c->nInput = (int)strlen(zInput);
        !           106:   }else{
        !           107:     c->nInput = nInput;
        !           108:   }
        !           109:   c->iOffset = 0;                 /* start tokenizing at the beginning */
        !           110:   c->iToken = 0;
        !           111:   c->zToken = NULL;               /* no space allocated, yet. */
        !           112:   c->nAllocated = 0;
        !           113: 
        !           114:   *ppCursor = &c->base;
        !           115:   return SQLITE_OK;
        !           116: }
        !           117: 
        !           118: /*
        !           119: ** Close a tokenization cursor previously opened by a call to
        !           120: ** porterOpen() above.
        !           121: */
        !           122: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
        !           123:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           124:   sqlite3_free(c->zToken);
        !           125:   sqlite3_free(c);
        !           126:   return SQLITE_OK;
        !           127: }
        !           128: /*
        !           129: ** Vowel or consonant
        !           130: */
        !           131: static const char cType[] = {
        !           132:    0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
        !           133:    1, 1, 1, 2, 1
        !           134: };
        !           135: 
        !           136: /*
        !           137: ** isConsonant() and isVowel() determine if their first character in
        !           138: ** the string they point to is a consonant or a vowel, according
        !           139: ** to Porter ruls.  
        !           140: **
        !           141: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
        !           142: ** 'Y' is a consonant unless it follows another consonant,
        !           143: ** in which case it is a vowel.
        !           144: **
        !           145: ** In these routine, the letters are in reverse order.  So the 'y' rule
        !           146: ** is that 'y' is a consonant unless it is followed by another
        !           147: ** consonent.
        !           148: */
        !           149: static int isVowel(const char*);
        !           150: static int isConsonant(const char *z){
        !           151:   int j;
        !           152:   char x = *z;
        !           153:   if( x==0 ) return 0;
        !           154:   assert( x>='a' && x<='z' );
        !           155:   j = cType[x-'a'];
        !           156:   if( j<2 ) return j;
        !           157:   return z[1]==0 || isVowel(z + 1);
        !           158: }
        !           159: static int isVowel(const char *z){
        !           160:   int j;
        !           161:   char x = *z;
        !           162:   if( x==0 ) return 0;
        !           163:   assert( x>='a' && x<='z' );
        !           164:   j = cType[x-'a'];
        !           165:   if( j<2 ) return 1-j;
        !           166:   return isConsonant(z + 1);
        !           167: }
        !           168: 
        !           169: /*
        !           170: ** Let any sequence of one or more vowels be represented by V and let
        !           171: ** C be sequence of one or more consonants.  Then every word can be
        !           172: ** represented as:
        !           173: **
        !           174: **           [C] (VC){m} [V]
        !           175: **
        !           176: ** In prose:  A word is an optional consonant followed by zero or
        !           177: ** vowel-consonant pairs followed by an optional vowel.  "m" is the
        !           178: ** number of vowel consonant pairs.  This routine computes the value
        !           179: ** of m for the first i bytes of a word.
        !           180: **
        !           181: ** Return true if the m-value for z is 1 or more.  In other words,
        !           182: ** return true if z contains at least one vowel that is followed
        !           183: ** by a consonant.
        !           184: **
        !           185: ** In this routine z[] is in reverse order.  So we are really looking
        !           186: ** for an instance of of a consonant followed by a vowel.
        !           187: */
        !           188: static int m_gt_0(const char *z){
        !           189:   while( isVowel(z) ){ z++; }
        !           190:   if( *z==0 ) return 0;
        !           191:   while( isConsonant(z) ){ z++; }
        !           192:   return *z!=0;
        !           193: }
        !           194: 
        !           195: /* Like mgt0 above except we are looking for a value of m which is
        !           196: ** exactly 1
        !           197: */
        !           198: static int m_eq_1(const char *z){
        !           199:   while( isVowel(z) ){ z++; }
        !           200:   if( *z==0 ) return 0;
        !           201:   while( isConsonant(z) ){ z++; }
        !           202:   if( *z==0 ) return 0;
        !           203:   while( isVowel(z) ){ z++; }
        !           204:   if( *z==0 ) return 1;
        !           205:   while( isConsonant(z) ){ z++; }
        !           206:   return *z==0;
        !           207: }
        !           208: 
        !           209: /* Like mgt0 above except we are looking for a value of m>1 instead
        !           210: ** or m>0
        !           211: */
        !           212: static int m_gt_1(const char *z){
        !           213:   while( isVowel(z) ){ z++; }
        !           214:   if( *z==0 ) return 0;
        !           215:   while( isConsonant(z) ){ z++; }
        !           216:   if( *z==0 ) return 0;
        !           217:   while( isVowel(z) ){ z++; }
        !           218:   if( *z==0 ) return 0;
        !           219:   while( isConsonant(z) ){ z++; }
        !           220:   return *z!=0;
        !           221: }
        !           222: 
        !           223: /*
        !           224: ** Return TRUE if there is a vowel anywhere within z[0..n-1]
        !           225: */
        !           226: static int hasVowel(const char *z){
        !           227:   while( isConsonant(z) ){ z++; }
        !           228:   return *z!=0;
        !           229: }
        !           230: 
        !           231: /*
        !           232: ** Return TRUE if the word ends in a double consonant.
        !           233: **
        !           234: ** The text is reversed here. So we are really looking at
        !           235: ** the first two characters of z[].
        !           236: */
        !           237: static int doubleConsonant(const char *z){
        !           238:   return isConsonant(z) && z[0]==z[1];
        !           239: }
        !           240: 
        !           241: /*
        !           242: ** Return TRUE if the word ends with three letters which
        !           243: ** are consonant-vowel-consonent and where the final consonant
        !           244: ** is not 'w', 'x', or 'y'.
        !           245: **
        !           246: ** The word is reversed here.  So we are really checking the
        !           247: ** first three letters and the first one cannot be in [wxy].
        !           248: */
        !           249: static int star_oh(const char *z){
        !           250:   return
        !           251:     isConsonant(z) &&
        !           252:     z[0]!='w' && z[0]!='x' && z[0]!='y' &&
        !           253:     isVowel(z+1) &&
        !           254:     isConsonant(z+2);
        !           255: }
        !           256: 
        !           257: /*
        !           258: ** If the word ends with zFrom and xCond() is true for the stem
        !           259: ** of the word that preceeds the zFrom ending, then change the 
        !           260: ** ending to zTo.
        !           261: **
        !           262: ** The input word *pz and zFrom are both in reverse order.  zTo
        !           263: ** is in normal order. 
        !           264: **
        !           265: ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
        !           266: ** match.  Not that TRUE is returned even if xCond() fails and
        !           267: ** no substitution occurs.
        !           268: */
        !           269: static int stem(
        !           270:   char **pz,             /* The word being stemmed (Reversed) */
        !           271:   const char *zFrom,     /* If the ending matches this... (Reversed) */
        !           272:   const char *zTo,       /* ... change the ending to this (not reversed) */
        !           273:   int (*xCond)(const char*)   /* Condition that must be true */
        !           274: ){
        !           275:   char *z = *pz;
        !           276:   while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
        !           277:   if( *zFrom!=0 ) return 0;
        !           278:   if( xCond && !xCond(z) ) return 1;
        !           279:   while( *zTo ){
        !           280:     *(--z) = *(zTo++);
        !           281:   }
        !           282:   *pz = z;
        !           283:   return 1;
        !           284: }
        !           285: 
        !           286: /*
        !           287: ** This is the fallback stemmer used when the porter stemmer is
        !           288: ** inappropriate.  The input word is copied into the output with
        !           289: ** US-ASCII case folding.  If the input word is too long (more
        !           290: ** than 20 bytes if it contains no digits or more than 6 bytes if
        !           291: ** it contains digits) then word is truncated to 20 or 6 bytes
        !           292: ** by taking 10 or 3 bytes from the beginning and end.
        !           293: */
        !           294: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
        !           295:   int i, mx, j;
        !           296:   int hasDigit = 0;
        !           297:   for(i=0; i<nIn; i++){
        !           298:     char c = zIn[i];
        !           299:     if( c>='A' && c<='Z' ){
        !           300:       zOut[i] = c - 'A' + 'a';
        !           301:     }else{
        !           302:       if( c>='0' && c<='9' ) hasDigit = 1;
        !           303:       zOut[i] = c;
        !           304:     }
        !           305:   }
        !           306:   mx = hasDigit ? 3 : 10;
        !           307:   if( nIn>mx*2 ){
        !           308:     for(j=mx, i=nIn-mx; i<nIn; i++, j++){
        !           309:       zOut[j] = zOut[i];
        !           310:     }
        !           311:     i = j;
        !           312:   }
        !           313:   zOut[i] = 0;
        !           314:   *pnOut = i;
        !           315: }
        !           316: 
        !           317: 
        !           318: /*
        !           319: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
        !           320: ** zOut is at least big enough to hold nIn bytes.  Write the actual
        !           321: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
        !           322: **
        !           323: ** Any upper-case characters in the US-ASCII character set ([A-Z])
        !           324: ** are converted to lower case.  Upper-case UTF characters are
        !           325: ** unchanged.
        !           326: **
        !           327: ** Words that are longer than about 20 bytes are stemmed by retaining
        !           328: ** a few bytes from the beginning and the end of the word.  If the
        !           329: ** word contains digits, 3 bytes are taken from the beginning and
        !           330: ** 3 bytes from the end.  For long words without digits, 10 bytes
        !           331: ** are taken from each end.  US-ASCII case folding still applies.
        !           332: ** 
        !           333: ** If the input word contains not digits but does characters not 
        !           334: ** in [a-zA-Z] then no stemming is attempted and this routine just 
        !           335: ** copies the input into the input into the output with US-ASCII
        !           336: ** case folding.
        !           337: **
        !           338: ** Stemming never increases the length of the word.  So there is
        !           339: ** no chance of overflowing the zOut buffer.
        !           340: */
        !           341: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
        !           342:   int i, j;
        !           343:   char zReverse[28];
        !           344:   char *z, *z2;
        !           345:   if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){
        !           346:     /* The word is too big or too small for the porter stemmer.
        !           347:     ** Fallback to the copy stemmer */
        !           348:     copy_stemmer(zIn, nIn, zOut, pnOut);
        !           349:     return;
        !           350:   }
        !           351:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
        !           352:     char c = zIn[i];
        !           353:     if( c>='A' && c<='Z' ){
        !           354:       zReverse[j] = c + 'a' - 'A';
        !           355:     }else if( c>='a' && c<='z' ){
        !           356:       zReverse[j] = c;
        !           357:     }else{
        !           358:       /* The use of a character not in [a-zA-Z] means that we fallback
        !           359:       ** to the copy stemmer */
        !           360:       copy_stemmer(zIn, nIn, zOut, pnOut);
        !           361:       return;
        !           362:     }
        !           363:   }
        !           364:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
        !           365:   z = &zReverse[j+1];
        !           366: 
        !           367: 
        !           368:   /* Step 1a */
        !           369:   if( z[0]=='s' ){
        !           370:     if(
        !           371:      !stem(&z, "sess", "ss", 0) &&
        !           372:      !stem(&z, "sei", "i", 0)  &&
        !           373:      !stem(&z, "ss", "ss", 0)
        !           374:     ){
        !           375:       z++;
        !           376:     }
        !           377:   }
        !           378: 
        !           379:   /* Step 1b */  
        !           380:   z2 = z;
        !           381:   if( stem(&z, "dee", "ee", m_gt_0) ){
        !           382:     /* Do nothing.  The work was all in the test */
        !           383:   }else if( 
        !           384:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
        !           385:       && z!=z2
        !           386:   ){
        !           387:      if( stem(&z, "ta", "ate", 0) ||
        !           388:          stem(&z, "lb", "ble", 0) ||
        !           389:          stem(&z, "zi", "ize", 0) ){
        !           390:        /* Do nothing.  The work was all in the test */
        !           391:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
        !           392:        z++;
        !           393:      }else if( m_eq_1(z) && star_oh(z) ){
        !           394:        *(--z) = 'e';
        !           395:      }
        !           396:   }
        !           397: 
        !           398:   /* Step 1c */
        !           399:   if( z[0]=='y' && hasVowel(z+1) ){
        !           400:     z[0] = 'i';
        !           401:   }
        !           402: 
        !           403:   /* Step 2 */
        !           404:   switch( z[1] ){
        !           405:    case 'a':
        !           406:      stem(&z, "lanoita", "ate", m_gt_0) ||
        !           407:      stem(&z, "lanoit", "tion", m_gt_0);
        !           408:      break;
        !           409:    case 'c':
        !           410:      stem(&z, "icne", "ence", m_gt_0) ||
        !           411:      stem(&z, "icna", "ance", m_gt_0);
        !           412:      break;
        !           413:    case 'e':
        !           414:      stem(&z, "rezi", "ize", m_gt_0);
        !           415:      break;
        !           416:    case 'g':
        !           417:      stem(&z, "igol", "log", m_gt_0);
        !           418:      break;
        !           419:    case 'l':
        !           420:      stem(&z, "ilb", "ble", m_gt_0) ||
        !           421:      stem(&z, "illa", "al", m_gt_0) ||
        !           422:      stem(&z, "iltne", "ent", m_gt_0) ||
        !           423:      stem(&z, "ile", "e", m_gt_0) ||
        !           424:      stem(&z, "ilsuo", "ous", m_gt_0);
        !           425:      break;
        !           426:    case 'o':
        !           427:      stem(&z, "noitazi", "ize", m_gt_0) ||
        !           428:      stem(&z, "noita", "ate", m_gt_0) ||
        !           429:      stem(&z, "rota", "ate", m_gt_0);
        !           430:      break;
        !           431:    case 's':
        !           432:      stem(&z, "msila", "al", m_gt_0) ||
        !           433:      stem(&z, "ssenevi", "ive", m_gt_0) ||
        !           434:      stem(&z, "ssenluf", "ful", m_gt_0) ||
        !           435:      stem(&z, "ssensuo", "ous", m_gt_0);
        !           436:      break;
        !           437:    case 't':
        !           438:      stem(&z, "itila", "al", m_gt_0) ||
        !           439:      stem(&z, "itivi", "ive", m_gt_0) ||
        !           440:      stem(&z, "itilib", "ble", m_gt_0);
        !           441:      break;
        !           442:   }
        !           443: 
        !           444:   /* Step 3 */
        !           445:   switch( z[0] ){
        !           446:    case 'e':
        !           447:      stem(&z, "etaci", "ic", m_gt_0) ||
        !           448:      stem(&z, "evita", "", m_gt_0)   ||
        !           449:      stem(&z, "ezila", "al", m_gt_0);
        !           450:      break;
        !           451:    case 'i':
        !           452:      stem(&z, "itici", "ic", m_gt_0);
        !           453:      break;
        !           454:    case 'l':
        !           455:      stem(&z, "laci", "ic", m_gt_0) ||
        !           456:      stem(&z, "luf", "", m_gt_0);
        !           457:      break;
        !           458:    case 's':
        !           459:      stem(&z, "ssen", "", m_gt_0);
        !           460:      break;
        !           461:   }
        !           462: 
        !           463:   /* Step 4 */
        !           464:   switch( z[1] ){
        !           465:    case 'a':
        !           466:      if( z[0]=='l' && m_gt_1(z+2) ){
        !           467:        z += 2;
        !           468:      }
        !           469:      break;
        !           470:    case 'c':
        !           471:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
        !           472:        z += 4;
        !           473:      }
        !           474:      break;
        !           475:    case 'e':
        !           476:      if( z[0]=='r' && m_gt_1(z+2) ){
        !           477:        z += 2;
        !           478:      }
        !           479:      break;
        !           480:    case 'i':
        !           481:      if( z[0]=='c' && m_gt_1(z+2) ){
        !           482:        z += 2;
        !           483:      }
        !           484:      break;
        !           485:    case 'l':
        !           486:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
        !           487:        z += 4;
        !           488:      }
        !           489:      break;
        !           490:    case 'n':
        !           491:      if( z[0]=='t' ){
        !           492:        if( z[2]=='a' ){
        !           493:          if( m_gt_1(z+3) ){
        !           494:            z += 3;
        !           495:          }
        !           496:        }else if( z[2]=='e' ){
        !           497:          stem(&z, "tneme", "", m_gt_1) ||
        !           498:          stem(&z, "tnem", "", m_gt_1) ||
        !           499:          stem(&z, "tne", "", m_gt_1);
        !           500:        }
        !           501:      }
        !           502:      break;
        !           503:    case 'o':
        !           504:      if( z[0]=='u' ){
        !           505:        if( m_gt_1(z+2) ){
        !           506:          z += 2;
        !           507:        }
        !           508:      }else if( z[3]=='s' || z[3]=='t' ){
        !           509:        stem(&z, "noi", "", m_gt_1);
        !           510:      }
        !           511:      break;
        !           512:    case 's':
        !           513:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
        !           514:        z += 3;
        !           515:      }
        !           516:      break;
        !           517:    case 't':
        !           518:      stem(&z, "eta", "", m_gt_1) ||
        !           519:      stem(&z, "iti", "", m_gt_1);
        !           520:      break;
        !           521:    case 'u':
        !           522:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
        !           523:        z += 3;
        !           524:      }
        !           525:      break;
        !           526:    case 'v':
        !           527:    case 'z':
        !           528:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
        !           529:        z += 3;
        !           530:      }
        !           531:      break;
        !           532:   }
        !           533: 
        !           534:   /* Step 5a */
        !           535:   if( z[0]=='e' ){
        !           536:     if( m_gt_1(z+1) ){
        !           537:       z++;
        !           538:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
        !           539:       z++;
        !           540:     }
        !           541:   }
        !           542: 
        !           543:   /* Step 5b */
        !           544:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
        !           545:     z++;
        !           546:   }
        !           547: 
        !           548:   /* z[] is now the stemmed word in reverse order.  Flip it back
        !           549:   ** around into forward order and return.
        !           550:   */
        !           551:   *pnOut = i = (int)strlen(z);
        !           552:   zOut[i] = 0;
        !           553:   while( *z ){
        !           554:     zOut[--i] = *(z++);
        !           555:   }
        !           556: }
        !           557: 
        !           558: /*
        !           559: ** Characters that can be part of a token.  We assume any character
        !           560: ** whose value is greater than 0x80 (any UTF character) can be
        !           561: ** part of a token.  In other words, delimiters all must have
        !           562: ** values of 0x7f or lower.
        !           563: */
        !           564: static const char porterIdChar[] = {
        !           565: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
        !           566:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
        !           567:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
        !           568:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
        !           569:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
        !           570:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
        !           571: };
        !           572: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
        !           573: 
        !           574: /*
        !           575: ** Extract the next token from a tokenization cursor.  The cursor must
        !           576: ** have been opened by a prior call to porterOpen().
        !           577: */
        !           578: static int porterNext(
        !           579:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
        !           580:   const char **pzToken,               /* OUT: *pzToken is the token text */
        !           581:   int *pnBytes,                       /* OUT: Number of bytes in token */
        !           582:   int *piStartOffset,                 /* OUT: Starting offset of token */
        !           583:   int *piEndOffset,                   /* OUT: Ending offset of token */
        !           584:   int *piPosition                     /* OUT: Position integer of token */
        !           585: ){
        !           586:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           587:   const char *z = c->zInput;
        !           588: 
        !           589:   while( c->iOffset<c->nInput ){
        !           590:     int iStartOffset, ch;
        !           591: 
        !           592:     /* Scan past delimiter characters */
        !           593:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
        !           594:       c->iOffset++;
        !           595:     }
        !           596: 
        !           597:     /* Count non-delimiter characters. */
        !           598:     iStartOffset = c->iOffset;
        !           599:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
        !           600:       c->iOffset++;
        !           601:     }
        !           602: 
        !           603:     if( c->iOffset>iStartOffset ){
        !           604:       int n = c->iOffset-iStartOffset;
        !           605:       if( n>c->nAllocated ){
        !           606:         char *pNew;
        !           607:         c->nAllocated = n+20;
        !           608:         pNew = sqlite3_realloc(c->zToken, c->nAllocated);
        !           609:         if( !pNew ) return SQLITE_NOMEM;
        !           610:         c->zToken = pNew;
        !           611:       }
        !           612:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
        !           613:       *pzToken = c->zToken;
        !           614:       *piStartOffset = iStartOffset;
        !           615:       *piEndOffset = c->iOffset;
        !           616:       *piPosition = c->iToken++;
        !           617:       return SQLITE_OK;
        !           618:     }
        !           619:   }
        !           620:   return SQLITE_DONE;
        !           621: }
        !           622: 
        !           623: /*
        !           624: ** The set of routines that implement the porter-stemmer tokenizer
        !           625: */
        !           626: static const sqlite3_tokenizer_module porterTokenizerModule = {
        !           627:   0,
        !           628:   porterCreate,
        !           629:   porterDestroy,
        !           630:   porterOpen,
        !           631:   porterClose,
        !           632:   porterNext,
        !           633: };
        !           634: 
        !           635: /*
        !           636: ** Allocate a new porter tokenizer.  Return a pointer to the new
        !           637: ** tokenizer in *ppModule
        !           638: */
        !           639: void sqlite3Fts3PorterTokenizerModule(
        !           640:   sqlite3_tokenizer_module const**ppModule
        !           641: ){
        !           642:   *ppModule = &porterTokenizerModule;
        !           643: }
        !           644: 
        !           645: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>