Annotation of embedaddon/sqlite3/ext/fts2/fts2_porter.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2006 September 30
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: *************************************************************************
        !            12: ** Implementation of the full-text-search tokenizer that implements
        !            13: ** a Porter stemmer.
        !            14: */
        !            15: 
        !            16: /*
        !            17: ** The code in this file is only compiled if:
        !            18: **
        !            19: **     * The FTS2 module is being built as an extension
        !            20: **       (in which case SQLITE_CORE is not defined), or
        !            21: **
        !            22: **     * The FTS2 module is being built into the core of
        !            23: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
        !            24: */
        !            25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
        !            26: 
        !            27: 
        !            28: #include <assert.h>
        !            29: #include <stdlib.h>
        !            30: #include <stdio.h>
        !            31: #include <string.h>
        !            32: 
        !            33: #include "fts2_tokenizer.h"
        !            34: 
        !            35: /*
        !            36: ** Class derived from sqlite3_tokenizer
        !            37: */
        !            38: typedef struct porter_tokenizer {
        !            39:   sqlite3_tokenizer base;      /* Base class */
        !            40: } porter_tokenizer;
        !            41: 
        !            42: /*
        !            43: ** Class derived from sqlit3_tokenizer_cursor
        !            44: */
        !            45: typedef struct porter_tokenizer_cursor {
        !            46:   sqlite3_tokenizer_cursor base;
        !            47:   const char *zInput;          /* input we are tokenizing */
        !            48:   int nInput;                  /* size of the input */
        !            49:   int iOffset;                 /* current position in zInput */
        !            50:   int iToken;                  /* index of next token to be returned */
        !            51:   char *zToken;                /* storage for current token */
        !            52:   int nAllocated;              /* space allocated to zToken buffer */
        !            53: } porter_tokenizer_cursor;
        !            54: 
        !            55: 
        !            56: /* Forward declaration */
        !            57: static const sqlite3_tokenizer_module porterTokenizerModule;
        !            58: 
        !            59: 
        !            60: /*
        !            61: ** Create a new tokenizer instance.
        !            62: */
        !            63: static int porterCreate(
        !            64:   int argc, const char * const *argv,
        !            65:   sqlite3_tokenizer **ppTokenizer
        !            66: ){
        !            67:   porter_tokenizer *t;
        !            68:   t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
        !            69:   if( t==NULL ) return SQLITE_NOMEM;
        !            70:   memset(t, 0, sizeof(*t));
        !            71:   *ppTokenizer = &t->base;
        !            72:   return SQLITE_OK;
        !            73: }
        !            74: 
        !            75: /*
        !            76: ** Destroy a tokenizer
        !            77: */
        !            78: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
        !            79:   sqlite3_free(pTokenizer);
        !            80:   return SQLITE_OK;
        !            81: }
        !            82: 
        !            83: /*
        !            84: ** Prepare to begin tokenizing a particular string.  The input
        !            85: ** string to be tokenized is zInput[0..nInput-1].  A cursor
        !            86: ** used to incrementally tokenize this string is returned in 
        !            87: ** *ppCursor.
        !            88: */
        !            89: static int porterOpen(
        !            90:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
        !            91:   const char *zInput, int nInput,        /* String to be tokenized */
        !            92:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
        !            93: ){
        !            94:   porter_tokenizer_cursor *c;
        !            95: 
        !            96:   c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
        !            97:   if( c==NULL ) return SQLITE_NOMEM;
        !            98: 
        !            99:   c->zInput = zInput;
        !           100:   if( zInput==0 ){
        !           101:     c->nInput = 0;
        !           102:   }else if( nInput<0 ){
        !           103:     c->nInput = (int)strlen(zInput);
        !           104:   }else{
        !           105:     c->nInput = nInput;
        !           106:   }
        !           107:   c->iOffset = 0;                 /* start tokenizing at the beginning */
        !           108:   c->iToken = 0;
        !           109:   c->zToken = NULL;               /* no space allocated, yet. */
        !           110:   c->nAllocated = 0;
        !           111: 
        !           112:   *ppCursor = &c->base;
        !           113:   return SQLITE_OK;
        !           114: }
        !           115: 
        !           116: /*
        !           117: ** Close a tokenization cursor previously opened by a call to
        !           118: ** porterOpen() above.
        !           119: */
        !           120: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
        !           121:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           122:   sqlite3_free(c->zToken);
        !           123:   sqlite3_free(c);
        !           124:   return SQLITE_OK;
        !           125: }
        !           126: /*
        !           127: ** Vowel or consonant
        !           128: */
        !           129: static const char cType[] = {
        !           130:    0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
        !           131:    1, 1, 1, 2, 1
        !           132: };
        !           133: 
        !           134: /*
        !           135: ** isConsonant() and isVowel() determine if their first character in
        !           136: ** the string they point to is a consonant or a vowel, according
        !           137: ** to Porter ruls.  
        !           138: **
        !           139: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
        !           140: ** 'Y' is a consonant unless it follows another consonant,
        !           141: ** in which case it is a vowel.
        !           142: **
        !           143: ** In these routine, the letters are in reverse order.  So the 'y' rule
        !           144: ** is that 'y' is a consonant unless it is followed by another
        !           145: ** consonent.
        !           146: */
        !           147: static int isVowel(const char*);
        !           148: static int isConsonant(const char *z){
        !           149:   int j;
        !           150:   char x = *z;
        !           151:   if( x==0 ) return 0;
        !           152:   assert( x>='a' && x<='z' );
        !           153:   j = cType[x-'a'];
        !           154:   if( j<2 ) return j;
        !           155:   return z[1]==0 || isVowel(z + 1);
        !           156: }
        !           157: static int isVowel(const char *z){
        !           158:   int j;
        !           159:   char x = *z;
        !           160:   if( x==0 ) return 0;
        !           161:   assert( x>='a' && x<='z' );
        !           162:   j = cType[x-'a'];
        !           163:   if( j<2 ) return 1-j;
        !           164:   return isConsonant(z + 1);
        !           165: }
        !           166: 
        !           167: /*
        !           168: ** Let any sequence of one or more vowels be represented by V and let
        !           169: ** C be sequence of one or more consonants.  Then every word can be
        !           170: ** represented as:
        !           171: **
        !           172: **           [C] (VC){m} [V]
        !           173: **
        !           174: ** In prose:  A word is an optional consonant followed by zero or
        !           175: ** vowel-consonant pairs followed by an optional vowel.  "m" is the
        !           176: ** number of vowel consonant pairs.  This routine computes the value
        !           177: ** of m for the first i bytes of a word.
        !           178: **
        !           179: ** Return true if the m-value for z is 1 or more.  In other words,
        !           180: ** return true if z contains at least one vowel that is followed
        !           181: ** by a consonant.
        !           182: **
        !           183: ** In this routine z[] is in reverse order.  So we are really looking
        !           184: ** for an instance of of a consonant followed by a vowel.
        !           185: */
        !           186: static int m_gt_0(const char *z){
        !           187:   while( isVowel(z) ){ z++; }
        !           188:   if( *z==0 ) return 0;
        !           189:   while( isConsonant(z) ){ z++; }
        !           190:   return *z!=0;
        !           191: }
        !           192: 
        !           193: /* Like mgt0 above except we are looking for a value of m which is
        !           194: ** exactly 1
        !           195: */
        !           196: static int m_eq_1(const char *z){
        !           197:   while( isVowel(z) ){ z++; }
        !           198:   if( *z==0 ) return 0;
        !           199:   while( isConsonant(z) ){ z++; }
        !           200:   if( *z==0 ) return 0;
        !           201:   while( isVowel(z) ){ z++; }
        !           202:   if( *z==0 ) return 1;
        !           203:   while( isConsonant(z) ){ z++; }
        !           204:   return *z==0;
        !           205: }
        !           206: 
        !           207: /* Like mgt0 above except we are looking for a value of m>1 instead
        !           208: ** or m>0
        !           209: */
        !           210: static int m_gt_1(const char *z){
        !           211:   while( isVowel(z) ){ z++; }
        !           212:   if( *z==0 ) return 0;
        !           213:   while( isConsonant(z) ){ z++; }
        !           214:   if( *z==0 ) return 0;
        !           215:   while( isVowel(z) ){ z++; }
        !           216:   if( *z==0 ) return 0;
        !           217:   while( isConsonant(z) ){ z++; }
        !           218:   return *z!=0;
        !           219: }
        !           220: 
        !           221: /*
        !           222: ** Return TRUE if there is a vowel anywhere within z[0..n-1]
        !           223: */
        !           224: static int hasVowel(const char *z){
        !           225:   while( isConsonant(z) ){ z++; }
        !           226:   return *z!=0;
        !           227: }
        !           228: 
        !           229: /*
        !           230: ** Return TRUE if the word ends in a double consonant.
        !           231: **
        !           232: ** The text is reversed here. So we are really looking at
        !           233: ** the first two characters of z[].
        !           234: */
        !           235: static int doubleConsonant(const char *z){
        !           236:   return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
        !           237: }
        !           238: 
        !           239: /*
        !           240: ** Return TRUE if the word ends with three letters which
        !           241: ** are consonant-vowel-consonent and where the final consonant
        !           242: ** is not 'w', 'x', or 'y'.
        !           243: **
        !           244: ** The word is reversed here.  So we are really checking the
        !           245: ** first three letters and the first one cannot be in [wxy].
        !           246: */
        !           247: static int star_oh(const char *z){
        !           248:   return
        !           249:     z[0]!=0 && isConsonant(z) &&
        !           250:     z[0]!='w' && z[0]!='x' && z[0]!='y' &&
        !           251:     z[1]!=0 && isVowel(z+1) &&
        !           252:     z[2]!=0 && isConsonant(z+2);
        !           253: }
        !           254: 
        !           255: /*
        !           256: ** If the word ends with zFrom and xCond() is true for the stem
        !           257: ** of the word that preceeds the zFrom ending, then change the 
        !           258: ** ending to zTo.
        !           259: **
        !           260: ** The input word *pz and zFrom are both in reverse order.  zTo
        !           261: ** is in normal order. 
        !           262: **
        !           263: ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
        !           264: ** match.  Not that TRUE is returned even if xCond() fails and
        !           265: ** no substitution occurs.
        !           266: */
        !           267: static int stem(
        !           268:   char **pz,             /* The word being stemmed (Reversed) */
        !           269:   const char *zFrom,     /* If the ending matches this... (Reversed) */
        !           270:   const char *zTo,       /* ... change the ending to this (not reversed) */
        !           271:   int (*xCond)(const char*)   /* Condition that must be true */
        !           272: ){
        !           273:   char *z = *pz;
        !           274:   while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
        !           275:   if( *zFrom!=0 ) return 0;
        !           276:   if( xCond && !xCond(z) ) return 1;
        !           277:   while( *zTo ){
        !           278:     *(--z) = *(zTo++);
        !           279:   }
        !           280:   *pz = z;
        !           281:   return 1;
        !           282: }
        !           283: 
        !           284: /*
        !           285: ** This is the fallback stemmer used when the porter stemmer is
        !           286: ** inappropriate.  The input word is copied into the output with
        !           287: ** US-ASCII case folding.  If the input word is too long (more
        !           288: ** than 20 bytes if it contains no digits or more than 6 bytes if
        !           289: ** it contains digits) then word is truncated to 20 or 6 bytes
        !           290: ** by taking 10 or 3 bytes from the beginning and end.
        !           291: */
        !           292: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
        !           293:   int i, mx, j;
        !           294:   int hasDigit = 0;
        !           295:   for(i=0; i<nIn; i++){
        !           296:     int c = zIn[i];
        !           297:     if( c>='A' && c<='Z' ){
        !           298:       zOut[i] = c - 'A' + 'a';
        !           299:     }else{
        !           300:       if( c>='0' && c<='9' ) hasDigit = 1;
        !           301:       zOut[i] = c;
        !           302:     }
        !           303:   }
        !           304:   mx = hasDigit ? 3 : 10;
        !           305:   if( nIn>mx*2 ){
        !           306:     for(j=mx, i=nIn-mx; i<nIn; i++, j++){
        !           307:       zOut[j] = zOut[i];
        !           308:     }
        !           309:     i = j;
        !           310:   }
        !           311:   zOut[i] = 0;
        !           312:   *pnOut = i;
        !           313: }
        !           314: 
        !           315: 
        !           316: /*
        !           317: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
        !           318: ** zOut is at least big enough to hold nIn bytes.  Write the actual
        !           319: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
        !           320: **
        !           321: ** Any upper-case characters in the US-ASCII character set ([A-Z])
        !           322: ** are converted to lower case.  Upper-case UTF characters are
        !           323: ** unchanged.
        !           324: **
        !           325: ** Words that are longer than about 20 bytes are stemmed by retaining
        !           326: ** a few bytes from the beginning and the end of the word.  If the
        !           327: ** word contains digits, 3 bytes are taken from the beginning and
        !           328: ** 3 bytes from the end.  For long words without digits, 10 bytes
        !           329: ** are taken from each end.  US-ASCII case folding still applies.
        !           330: ** 
        !           331: ** If the input word contains not digits but does characters not 
        !           332: ** in [a-zA-Z] then no stemming is attempted and this routine just 
        !           333: ** copies the input into the input into the output with US-ASCII
        !           334: ** case folding.
        !           335: **
        !           336: ** Stemming never increases the length of the word.  So there is
        !           337: ** no chance of overflowing the zOut buffer.
        !           338: */
        !           339: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
        !           340:   int i, j, c;
        !           341:   char zReverse[28];
        !           342:   char *z, *z2;
        !           343:   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
        !           344:     /* The word is too big or too small for the porter stemmer.
        !           345:     ** Fallback to the copy stemmer */
        !           346:     copy_stemmer(zIn, nIn, zOut, pnOut);
        !           347:     return;
        !           348:   }
        !           349:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
        !           350:     c = zIn[i];
        !           351:     if( c>='A' && c<='Z' ){
        !           352:       zReverse[j] = c + 'a' - 'A';
        !           353:     }else if( c>='a' && c<='z' ){
        !           354:       zReverse[j] = c;
        !           355:     }else{
        !           356:       /* The use of a character not in [a-zA-Z] means that we fallback
        !           357:       ** to the copy stemmer */
        !           358:       copy_stemmer(zIn, nIn, zOut, pnOut);
        !           359:       return;
        !           360:     }
        !           361:   }
        !           362:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
        !           363:   z = &zReverse[j+1];
        !           364: 
        !           365: 
        !           366:   /* Step 1a */
        !           367:   if( z[0]=='s' ){
        !           368:     if(
        !           369:      !stem(&z, "sess", "ss", 0) &&
        !           370:      !stem(&z, "sei", "i", 0)  &&
        !           371:      !stem(&z, "ss", "ss", 0)
        !           372:     ){
        !           373:       z++;
        !           374:     }
        !           375:   }
        !           376: 
        !           377:   /* Step 1b */  
        !           378:   z2 = z;
        !           379:   if( stem(&z, "dee", "ee", m_gt_0) ){
        !           380:     /* Do nothing.  The work was all in the test */
        !           381:   }else if( 
        !           382:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
        !           383:       && z!=z2
        !           384:   ){
        !           385:      if( stem(&z, "ta", "ate", 0) ||
        !           386:          stem(&z, "lb", "ble", 0) ||
        !           387:          stem(&z, "zi", "ize", 0) ){
        !           388:        /* Do nothing.  The work was all in the test */
        !           389:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
        !           390:        z++;
        !           391:      }else if( m_eq_1(z) && star_oh(z) ){
        !           392:        *(--z) = 'e';
        !           393:      }
        !           394:   }
        !           395: 
        !           396:   /* Step 1c */
        !           397:   if( z[0]=='y' && hasVowel(z+1) ){
        !           398:     z[0] = 'i';
        !           399:   }
        !           400: 
        !           401:   /* Step 2 */
        !           402:   switch( z[1] ){
        !           403:    case 'a':
        !           404:      stem(&z, "lanoita", "ate", m_gt_0) ||
        !           405:      stem(&z, "lanoit", "tion", m_gt_0);
        !           406:      break;
        !           407:    case 'c':
        !           408:      stem(&z, "icne", "ence", m_gt_0) ||
        !           409:      stem(&z, "icna", "ance", m_gt_0);
        !           410:      break;
        !           411:    case 'e':
        !           412:      stem(&z, "rezi", "ize", m_gt_0);
        !           413:      break;
        !           414:    case 'g':
        !           415:      stem(&z, "igol", "log", m_gt_0);
        !           416:      break;
        !           417:    case 'l':
        !           418:      stem(&z, "ilb", "ble", m_gt_0) ||
        !           419:      stem(&z, "illa", "al", m_gt_0) ||
        !           420:      stem(&z, "iltne", "ent", m_gt_0) ||
        !           421:      stem(&z, "ile", "e", m_gt_0) ||
        !           422:      stem(&z, "ilsuo", "ous", m_gt_0);
        !           423:      break;
        !           424:    case 'o':
        !           425:      stem(&z, "noitazi", "ize", m_gt_0) ||
        !           426:      stem(&z, "noita", "ate", m_gt_0) ||
        !           427:      stem(&z, "rota", "ate", m_gt_0);
        !           428:      break;
        !           429:    case 's':
        !           430:      stem(&z, "msila", "al", m_gt_0) ||
        !           431:      stem(&z, "ssenevi", "ive", m_gt_0) ||
        !           432:      stem(&z, "ssenluf", "ful", m_gt_0) ||
        !           433:      stem(&z, "ssensuo", "ous", m_gt_0);
        !           434:      break;
        !           435:    case 't':
        !           436:      stem(&z, "itila", "al", m_gt_0) ||
        !           437:      stem(&z, "itivi", "ive", m_gt_0) ||
        !           438:      stem(&z, "itilib", "ble", m_gt_0);
        !           439:      break;
        !           440:   }
        !           441: 
        !           442:   /* Step 3 */
        !           443:   switch( z[0] ){
        !           444:    case 'e':
        !           445:      stem(&z, "etaci", "ic", m_gt_0) ||
        !           446:      stem(&z, "evita", "", m_gt_0)   ||
        !           447:      stem(&z, "ezila", "al", m_gt_0);
        !           448:      break;
        !           449:    case 'i':
        !           450:      stem(&z, "itici", "ic", m_gt_0);
        !           451:      break;
        !           452:    case 'l':
        !           453:      stem(&z, "laci", "ic", m_gt_0) ||
        !           454:      stem(&z, "luf", "", m_gt_0);
        !           455:      break;
        !           456:    case 's':
        !           457:      stem(&z, "ssen", "", m_gt_0);
        !           458:      break;
        !           459:   }
        !           460: 
        !           461:   /* Step 4 */
        !           462:   switch( z[1] ){
        !           463:    case 'a':
        !           464:      if( z[0]=='l' && m_gt_1(z+2) ){
        !           465:        z += 2;
        !           466:      }
        !           467:      break;
        !           468:    case 'c':
        !           469:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
        !           470:        z += 4;
        !           471:      }
        !           472:      break;
        !           473:    case 'e':
        !           474:      if( z[0]=='r' && m_gt_1(z+2) ){
        !           475:        z += 2;
        !           476:      }
        !           477:      break;
        !           478:    case 'i':
        !           479:      if( z[0]=='c' && m_gt_1(z+2) ){
        !           480:        z += 2;
        !           481:      }
        !           482:      break;
        !           483:    case 'l':
        !           484:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
        !           485:        z += 4;
        !           486:      }
        !           487:      break;
        !           488:    case 'n':
        !           489:      if( z[0]=='t' ){
        !           490:        if( z[2]=='a' ){
        !           491:          if( m_gt_1(z+3) ){
        !           492:            z += 3;
        !           493:          }
        !           494:        }else if( z[2]=='e' ){
        !           495:          stem(&z, "tneme", "", m_gt_1) ||
        !           496:          stem(&z, "tnem", "", m_gt_1) ||
        !           497:          stem(&z, "tne", "", m_gt_1);
        !           498:        }
        !           499:      }
        !           500:      break;
        !           501:    case 'o':
        !           502:      if( z[0]=='u' ){
        !           503:        if( m_gt_1(z+2) ){
        !           504:          z += 2;
        !           505:        }
        !           506:      }else if( z[3]=='s' || z[3]=='t' ){
        !           507:        stem(&z, "noi", "", m_gt_1);
        !           508:      }
        !           509:      break;
        !           510:    case 's':
        !           511:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
        !           512:        z += 3;
        !           513:      }
        !           514:      break;
        !           515:    case 't':
        !           516:      stem(&z, "eta", "", m_gt_1) ||
        !           517:      stem(&z, "iti", "", m_gt_1);
        !           518:      break;
        !           519:    case 'u':
        !           520:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
        !           521:        z += 3;
        !           522:      }
        !           523:      break;
        !           524:    case 'v':
        !           525:    case 'z':
        !           526:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
        !           527:        z += 3;
        !           528:      }
        !           529:      break;
        !           530:   }
        !           531: 
        !           532:   /* Step 5a */
        !           533:   if( z[0]=='e' ){
        !           534:     if( m_gt_1(z+1) ){
        !           535:       z++;
        !           536:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
        !           537:       z++;
        !           538:     }
        !           539:   }
        !           540: 
        !           541:   /* Step 5b */
        !           542:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
        !           543:     z++;
        !           544:   }
        !           545: 
        !           546:   /* z[] is now the stemmed word in reverse order.  Flip it back
        !           547:   ** around into forward order and return.
        !           548:   */
        !           549:   *pnOut = i = strlen(z);
        !           550:   zOut[i] = 0;
        !           551:   while( *z ){
        !           552:     zOut[--i] = *(z++);
        !           553:   }
        !           554: }
        !           555: 
        !           556: /*
        !           557: ** Characters that can be part of a token.  We assume any character
        !           558: ** whose value is greater than 0x80 (any UTF character) can be
        !           559: ** part of a token.  In other words, delimiters all must have
        !           560: ** values of 0x7f or lower.
        !           561: */
        !           562: static const char porterIdChar[] = {
        !           563: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
        !           564:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
        !           565:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
        !           566:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
        !           567:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
        !           568:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
        !           569: };
        !           570: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
        !           571: 
        !           572: /*
        !           573: ** Extract the next token from a tokenization cursor.  The cursor must
        !           574: ** have been opened by a prior call to porterOpen().
        !           575: */
        !           576: static int porterNext(
        !           577:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
        !           578:   const char **pzToken,               /* OUT: *pzToken is the token text */
        !           579:   int *pnBytes,                       /* OUT: Number of bytes in token */
        !           580:   int *piStartOffset,                 /* OUT: Starting offset of token */
        !           581:   int *piEndOffset,                   /* OUT: Ending offset of token */
        !           582:   int *piPosition                     /* OUT: Position integer of token */
        !           583: ){
        !           584:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           585:   const char *z = c->zInput;
        !           586: 
        !           587:   while( c->iOffset<c->nInput ){
        !           588:     int iStartOffset, ch;
        !           589: 
        !           590:     /* Scan past delimiter characters */
        !           591:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
        !           592:       c->iOffset++;
        !           593:     }
        !           594: 
        !           595:     /* Count non-delimiter characters. */
        !           596:     iStartOffset = c->iOffset;
        !           597:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
        !           598:       c->iOffset++;
        !           599:     }
        !           600: 
        !           601:     if( c->iOffset>iStartOffset ){
        !           602:       int n = c->iOffset-iStartOffset;
        !           603:       if( n>c->nAllocated ){
        !           604:         c->nAllocated = n+20;
        !           605:         c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
        !           606:         if( c->zToken==NULL ) return SQLITE_NOMEM;
        !           607:       }
        !           608:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
        !           609:       *pzToken = c->zToken;
        !           610:       *piStartOffset = iStartOffset;
        !           611:       *piEndOffset = c->iOffset;
        !           612:       *piPosition = c->iToken++;
        !           613:       return SQLITE_OK;
        !           614:     }
        !           615:   }
        !           616:   return SQLITE_DONE;
        !           617: }
        !           618: 
        !           619: /*
        !           620: ** The set of routines that implement the porter-stemmer tokenizer
        !           621: */
        !           622: static const sqlite3_tokenizer_module porterTokenizerModule = {
        !           623:   0,
        !           624:   porterCreate,
        !           625:   porterDestroy,
        !           626:   porterOpen,
        !           627:   porterClose,
        !           628:   porterNext,
        !           629: };
        !           630: 
        !           631: /*
        !           632: ** Allocate a new porter tokenizer.  Return a pointer to the new
        !           633: ** tokenizer in *ppModule
        !           634: */
        !           635: void sqlite3Fts2PorterTokenizerModule(
        !           636:   sqlite3_tokenizer_module const**ppModule
        !           637: ){
        !           638:   *ppModule = &porterTokenizerModule;
        !           639: }
        !           640: 
        !           641: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>