embedaddon/sqlite3/ext/fts1/fts1_porter.c - annotate

Return to fts1_porter.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1
Annotation of embedaddon/sqlite3/ext/fts1/fts1_porter.c, revision 1.1

1.1     ! misho       1: /*
        !             2: ** 2006 September 30
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: *************************************************************************
        !            12: ** Implementation of the full-text-search tokenizer that implements
        !            13: ** a Porter stemmer.
        !            14: */
        !            15: 
        !            16: /*
        !            17: ** The code in this file is only compiled if:
        !            18: **
        !            19: **     * The FTS1 module is being built as an extension
        !            20: **       (in which case SQLITE_CORE is not defined), or
        !            21: **
        !            22: **     * The FTS1 module is being built into the core of
        !            23: **       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
        !            24: */
        !            25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
        !            26: 
        !            27: 
        !            28: #include <assert.h>
        !            29: #include <stdlib.h>
        !            30: #include <stdio.h>
        !            31: #include <string.h>
        !            32: #include <ctype.h>
        !            33: 
        !            34: #include "fts1_tokenizer.h"
        !            35: 
        !            36: /*
        !            37: ** Class derived from sqlite3_tokenizer
        !            38: */
        !            39: typedef struct porter_tokenizer {
        !            40:   sqlite3_tokenizer base;      /* Base class */
        !            41: } porter_tokenizer;
        !            42: 
        !            43: /*
        !            44: ** Class derived from sqlit3_tokenizer_cursor
        !            45: */
        !            46: typedef struct porter_tokenizer_cursor {
        !            47:   sqlite3_tokenizer_cursor base;
        !            48:   const char *zInput;          /* input we are tokenizing */
        !            49:   int nInput;                  /* size of the input */
        !            50:   int iOffset;                 /* current position in zInput */
        !            51:   int iToken;                  /* index of next token to be returned */
        !            52:   char *zToken;                /* storage for current token */
        !            53:   int nAllocated;              /* space allocated to zToken buffer */
        !            54: } porter_tokenizer_cursor;
        !            55: 
        !            56: 
        !            57: /* Forward declaration */
        !            58: static const sqlite3_tokenizer_module porterTokenizerModule;
        !            59: 
        !            60: 
        !            61: /*
        !            62: ** Create a new tokenizer instance.
        !            63: */
        !            64: static int porterCreate(
        !            65:   int argc, const char * const *argv,
        !            66:   sqlite3_tokenizer **ppTokenizer
        !            67: ){
        !            68:   porter_tokenizer *t;
        !            69:   t = (porter_tokenizer *) calloc(sizeof(*t), 1);
        !            70:   if( t==NULL ) return SQLITE_NOMEM;
        !            71: 
        !            72:   *ppTokenizer = &t->base;
        !            73:   return SQLITE_OK;
        !            74: }
        !            75: 
        !            76: /*
        !            77: ** Destroy a tokenizer
        !            78: */
        !            79: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
        !            80:   free(pTokenizer);
        !            81:   return SQLITE_OK;
        !            82: }
        !            83: 
        !            84: /*
        !            85: ** Prepare to begin tokenizing a particular string.  The input
        !            86: ** string to be tokenized is zInput[0..nInput-1].  A cursor
        !            87: ** used to incrementally tokenize this string is returned in 
        !            88: ** *ppCursor.
        !            89: */
        !            90: static int porterOpen(
        !            91:   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
        !            92:   const char *zInput, int nInput,        /* String to be tokenized */
        !            93:   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
        !            94: ){
        !            95:   porter_tokenizer_cursor *c;
        !            96: 
        !            97:   c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
        !            98:   if( c==NULL ) return SQLITE_NOMEM;
        !            99: 
        !           100:   c->zInput = zInput;
        !           101:   if( zInput==0 ){
        !           102:     c->nInput = 0;
        !           103:   }else if( nInput<0 ){
        !           104:     c->nInput = (int)strlen(zInput);
        !           105:   }else{
        !           106:     c->nInput = nInput;
        !           107:   }
        !           108:   c->iOffset = 0;                 /* start tokenizing at the beginning */
        !           109:   c->iToken = 0;
        !           110:   c->zToken = NULL;               /* no space allocated, yet. */
        !           111:   c->nAllocated = 0;
        !           112: 
        !           113:   *ppCursor = &c->base;
        !           114:   return SQLITE_OK;
        !           115: }
        !           116: 
        !           117: /*
        !           118: ** Close a tokenization cursor previously opened by a call to
        !           119: ** porterOpen() above.
        !           120: */
        !           121: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
        !           122:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           123:   free(c->zToken);
        !           124:   free(c);
        !           125:   return SQLITE_OK;
        !           126: }
        /*
        ** Letter classes, indexed by (letter - 'a'):
        **
        **     0   always a vowel
        **     1   always a consonant
        **     2   'y', whose class depends on the neighbouring letter
        */
        static const char cType[] = {
           /* a  b  c  d  e  f  g  h  i  j  k  l  m */
              0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
           /* n  o  p  q  r  s  t  u  v  w  x  y  z */
              1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1
        };

        /*
        ** isConsonant() and isVowel() determine whether the first character
        ** of the string they point to is a consonant or a vowel, according
        ** to Porter rules.  Both return 0 for an empty string.
        **
        ** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
        ** 'y' is a consonant unless it follows another consonant, in which
        ** case it is a vowel.
        **
        ** In these routines the letters are in reverse order.  So the 'y'
        ** rule becomes: 'y' is a consonant unless the next character in z[]
        ** is a consonant.  A 'y' that is the last character of z[] is a
        ** consonant.
        **
        ** The input must consist of lower-case letters in [a-z] only.
        */
        static int isVowel(const char*);
        static int isConsonant(const char *z){
          char ch = z[0];
          if( ch==0 ) return 0;
          assert( ch>='a' && ch<='z' );
          switch( cType[ch-'a'] ){
            case 0:  return 0;
            case 1:  return 1;
            default: return z[1]==0 || isVowel(&z[1]);
          }
        }
        static int isVowel(const char *z){
          char ch = z[0];
          if( ch==0 ) return 0;
          assert( ch>='a' && ch<='z' );
          switch( cType[ch-'a'] ){
            case 0:  return 1;
            case 1:  return 0;
            default: return isConsonant(&z[1]);
          }
        }
        !           167: 
        /*
        ** Let any sequence of one or more vowels be represented by V and let
        ** C be a sequence of one or more consonants.  Then every word can be
        ** represented as:
        **
        **           [C] (VC){m} [V]
        **
        ** In prose:  A word is an optional consonant run followed by zero or
        ** more vowel-consonant pairs followed by an optional vowel run.  "m"
        ** is the number of vowel-consonant pairs.
        **
        ** Return true if the m-value for z is 1 or more.  In other words,
        ** return true if z contains at least one vowel that is followed
        ** by a consonant.
        **
        ** In this routine z[] is in reverse order.  So we are really looking
        ** for an instance of a consonant followed by a vowel.
        */
        static int m_gt_0(const char *z){
          for(; isVowel(z); z++){}        /* trailing [V] of the word */
          if( z[0]==0 ) return 0;
          for(; isConsonant(z); z++){}    /* the C of a VC pair */
          return z[0]!=0;                 /* anything left begins with its V */
        }
        !           193: 
        /*
        ** Like m_gt_0() above except that we are looking for a value of m
        ** which is exactly 1.  z[] is in reverse order.
        */
        static int m_eq_1(const char *z){
          for(; isVowel(z); z++){}        /* trailing [V] */
          if( z[0]==0 ) return 0;
          for(; isConsonant(z); z++){}    /* C of the last VC pair */
          if( z[0]==0 ) return 0;
          for(; isVowel(z); z++){}        /* V of the last VC pair: m is now 1 */
          if( z[0]==0 ) return 1;
          for(; isConsonant(z); z++){}    /* leading [C] */
          return z[0]==0;                 /* more text would mean m>1 */
        }
        !           207: 
        /*
        ** Like m_gt_0() above except that we are looking for a value of m>1
        ** instead of m>0.  z[] is in reverse order.
        */
        static int m_gt_1(const char *z){
          for(; isVowel(z); z++){}        /* trailing [V] */
          if( z[0]==0 ) return 0;
          for(; isConsonant(z); z++){}    /* C of the last VC pair */
          if( z[0]==0 ) return 0;
          for(; isVowel(z); z++){}        /* V of the last VC pair */
          if( z[0]==0 ) return 0;
          for(; isConsonant(z); z++){}    /* C of the pair before it, or leading [C] */
          return z[0]!=0;                 /* a further vowel completes a second pair */
        }
        !           221: 
        /*
        ** Return TRUE if there is a vowel anywhere within the
        ** zero-terminated string z[].
        */
        static int hasVowel(const char *z){
          for(; isConsonant(z); z++){}
          /* The scan stops at the first vowel or at the terminator */
          return z[0]!=0;
        }
        !           229: 
        /*
        ** Return TRUE if the word ends in a double consonant, that is, the
        ** same consonant twice in a row.
        **
        ** The text is reversed here. So we are really looking at
        ** the first two characters of z[].
        */
        static int doubleConsonant(const char *z){
          if( !isConsonant(z) ) return 0;
          if( z[1]!=z[0] ) return 0;
          return isConsonant(&z[1]);
        }
        !           239: 
        /*
        ** Return TRUE if the word ends with three letters which
        ** are consonant-vowel-consonant and where the final consonant
        ** is not 'w', 'x', or 'y'.
        **
        ** The word is reversed here.  So we are really checking the
        ** first three letters and the first one cannot be in [wxy].
        */
        static int star_oh(const char *z){
          if( z[0]==0 || !isConsonant(z) ) return 0;
          if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
          if( z[1]==0 || !isVowel(z+1) ) return 0;
          if( z[2]==0 ) return 0;
          return isConsonant(z+2);
        }
        !           255: 
        /*
        ** If the word ends with zFrom and xCond() is true for the stem
        ** of the word that precedes the zFrom ending, then change the
        ** ending to zTo.  A NULL xCond means the change is unconditional.
        **
        ** The input word *pz and zFrom are both in reverse order.  zTo
        ** is in normal order.  The new ending is written backwards into the
        ** bytes just before the stem and *pz is moved to the new start of
        ** the word.
        **
        ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
        ** match.  Note that TRUE is returned even if xCond() fails and
        ** no substitution occurs.
        */
        static int stem(
          char **pz,             /* The word being stemmed (Reversed) */
          const char *zFrom,     /* If the ending matches this... (Reversed) */
          const char *zTo,       /* ... change the ending to this (not reversed) */
          int (*xCond)(const char*)   /* Condition that must be true */
        ){
          char *zStem = *pz;
          const char *zSrc;

          /* Step over the ending.  Any mismatch means the rule does not apply. */
          for(; *zFrom!=0; zFrom++, zStem++){
            if( *zStem!=*zFrom ) return 0;
          }

          if( xCond==0 || xCond(zStem) ){
            for(zSrc=zTo; *zSrc!=0; zSrc++){
              *(--zStem) = *zSrc;
            }
            *pz = zStem;
          }
          return 1;
        }
        !           284: 
        /*
        ** This is the fallback stemmer used when the porter stemmer is
        ** inappropriate.  The input word is copied into the output with
        ** US-ASCII case folding.  If the input word is too long (more
        ** than 20 bytes if it contains no digits or more than 6 bytes if
        ** it contains digits) then the word is truncated to 20 or 6 bytes
        ** by taking 10 or 3 bytes from the beginning and from the end.
        **
        ** zOut is zero-terminated and the length of the result, not counting
        ** the terminator, is written into *pnOut.  All nIn bytes are written
        ** to zOut before any truncation, so zOut needs room for nIn bytes and
        ** for the terminator.
        */
        static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
          int nOut = 0;          /* Number of bytes in the result */
          int bDigit = 0;        /* True once a digit has been seen */
          int nKeep;             /* Bytes retained from each end of a long word */

          while( nOut<nIn ){
            int c = zIn[nOut];
            if( c>='0' && c<='9' ){
              bDigit = 1;
            }else if( c>='A' && c<='Z' ){
              c += 'a' - 'A';
            }
            zOut[nOut++] = c;
          }

          nKeep = bDigit ? 3 : 10;
          if( nIn>2*nKeep ){
            /* Slide the last nKeep bytes down so they follow the first nKeep */
            int k;
            for(k=0; k<nKeep; k++){
              zOut[nKeep+k] = zOut[nIn-nKeep+k];
            }
            nOut = 2*nKeep;
          }
          zOut[nOut] = 0;
          *pnOut = nOut;
        }
        !           315: 
        !           316: 
        !           317: /*
        !           318: ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
        !           319: ** zOut is at least big enough to hold nIn bytes.  Write the actual
        !           320: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
        !           321: **
        !           322: ** Any upper-case characters in the US-ASCII character set ([A-Z])
        !           323: ** are converted to lower case.  Upper-case UTF characters are
        !           324: ** unchanged.
        !           325: **
        !           326: ** Words that are longer than about 20 bytes are stemmed by retaining
        !           327: ** a few bytes from the beginning and the end of the word.  If the
        !           328: ** word contains digits, 3 bytes are taken from the beginning and
        !           329: ** 3 bytes from the end.  For long words without digits, 10 bytes
        !           330: ** are taken from each end.  US-ASCII case folding still applies.
        !           331: ** 
        !           332: ** If the input word contains not digits but does characters not 
        !           333: ** in [a-zA-Z] then no stemming is attempted and this routine just 
        !           334: ** copies the input into the input into the output with US-ASCII
        !           335: ** case folding.
        !           336: **
        !           337: ** Stemming never increases the length of the word.  So there is
        !           338: ** no chance of overflowing the zOut buffer.
        !           339: */
        !           340: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
        !           341:   int i, j, c;
        !           342:   char zReverse[28];
        !           343:   char *z, *z2;
        !           344:   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
        !           345:     /* The word is too big or too small for the porter stemmer.
        !           346:     ** Fallback to the copy stemmer */
        !           347:     copy_stemmer(zIn, nIn, zOut, pnOut);
        !           348:     return;
        !           349:   }
        !           350:   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
        !           351:     c = zIn[i];
        !           352:     if( c>='A' && c<='Z' ){
        !           353:       zReverse[j] = c + 'a' - 'A';
        !           354:     }else if( c>='a' && c<='z' ){
        !           355:       zReverse[j] = c;
        !           356:     }else{
        !           357:       /* The use of a character not in [a-zA-Z] means that we fallback
        !           358:       ** to the copy stemmer */
        !           359:       copy_stemmer(zIn, nIn, zOut, pnOut);
        !           360:       return;
        !           361:     }
        !           362:   }
        !           363:   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
        !           364:   z = &zReverse[j+1];
        !           365: 
        !           366: 
        !           367:   /* Step 1a */
        !           368:   if( z[0]=='s' ){
        !           369:     if(
        !           370:      !stem(&z, "sess", "ss", 0) &&
        !           371:      !stem(&z, "sei", "i", 0)  &&
        !           372:      !stem(&z, "ss", "ss", 0)
        !           373:     ){
        !           374:       z++;
        !           375:     }
        !           376:   }
        !           377: 
        !           378:   /* Step 1b */  
        !           379:   z2 = z;
        !           380:   if( stem(&z, "dee", "ee", m_gt_0) ){
        !           381:     /* Do nothing.  The work was all in the test */
        !           382:   }else if( 
        !           383:      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
        !           384:       && z!=z2
        !           385:   ){
        !           386:      if( stem(&z, "ta", "ate", 0) ||
        !           387:          stem(&z, "lb", "ble", 0) ||
        !           388:          stem(&z, "zi", "ize", 0) ){
        !           389:        /* Do nothing.  The work was all in the test */
        !           390:      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
        !           391:        z++;
        !           392:      }else if( m_eq_1(z) && star_oh(z) ){
        !           393:        *(--z) = 'e';
        !           394:      }
        !           395:   }
        !           396: 
        !           397:   /* Step 1c */
        !           398:   if( z[0]=='y' && hasVowel(z+1) ){
        !           399:     z[0] = 'i';
        !           400:   }
        !           401: 
        !           402:   /* Step 2 */
        !           403:   switch( z[1] ){
        !           404:    case 'a':
        !           405:      stem(&z, "lanoita", "ate", m_gt_0) ||
        !           406:      stem(&z, "lanoit", "tion", m_gt_0);
        !           407:      break;
        !           408:    case 'c':
        !           409:      stem(&z, "icne", "ence", m_gt_0) ||
        !           410:      stem(&z, "icna", "ance", m_gt_0);
        !           411:      break;
        !           412:    case 'e':
        !           413:      stem(&z, "rezi", "ize", m_gt_0);
        !           414:      break;
        !           415:    case 'g':
        !           416:      stem(&z, "igol", "log", m_gt_0);
        !           417:      break;
        !           418:    case 'l':
        !           419:      stem(&z, "ilb", "ble", m_gt_0) ||
        !           420:      stem(&z, "illa", "al", m_gt_0) ||
        !           421:      stem(&z, "iltne", "ent", m_gt_0) ||
        !           422:      stem(&z, "ile", "e", m_gt_0) ||
        !           423:      stem(&z, "ilsuo", "ous", m_gt_0);
        !           424:      break;
        !           425:    case 'o':
        !           426:      stem(&z, "noitazi", "ize", m_gt_0) ||
        !           427:      stem(&z, "noita", "ate", m_gt_0) ||
        !           428:      stem(&z, "rota", "ate", m_gt_0);
        !           429:      break;
        !           430:    case 's':
        !           431:      stem(&z, "msila", "al", m_gt_0) ||
        !           432:      stem(&z, "ssenevi", "ive", m_gt_0) ||
        !           433:      stem(&z, "ssenluf", "ful", m_gt_0) ||
        !           434:      stem(&z, "ssensuo", "ous", m_gt_0);
        !           435:      break;
        !           436:    case 't':
        !           437:      stem(&z, "itila", "al", m_gt_0) ||
        !           438:      stem(&z, "itivi", "ive", m_gt_0) ||
        !           439:      stem(&z, "itilib", "ble", m_gt_0);
        !           440:      break;
        !           441:   }
        !           442: 
        !           443:   /* Step 3 */
        !           444:   switch( z[0] ){
        !           445:    case 'e':
        !           446:      stem(&z, "etaci", "ic", m_gt_0) ||
        !           447:      stem(&z, "evita", "", m_gt_0)   ||
        !           448:      stem(&z, "ezila", "al", m_gt_0);
        !           449:      break;
        !           450:    case 'i':
        !           451:      stem(&z, "itici", "ic", m_gt_0);
        !           452:      break;
        !           453:    case 'l':
        !           454:      stem(&z, "laci", "ic", m_gt_0) ||
        !           455:      stem(&z, "luf", "", m_gt_0);
        !           456:      break;
        !           457:    case 's':
        !           458:      stem(&z, "ssen", "", m_gt_0);
        !           459:      break;
        !           460:   }
        !           461: 
        !           462:   /* Step 4 */
        !           463:   switch( z[1] ){
        !           464:    case 'a':
        !           465:      if( z[0]=='l' && m_gt_1(z+2) ){
        !           466:        z += 2;
        !           467:      }
        !           468:      break;
        !           469:    case 'c':
        !           470:      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
        !           471:        z += 4;
        !           472:      }
        !           473:      break;
        !           474:    case 'e':
        !           475:      if( z[0]=='r' && m_gt_1(z+2) ){
        !           476:        z += 2;
        !           477:      }
        !           478:      break;
        !           479:    case 'i':
        !           480:      if( z[0]=='c' && m_gt_1(z+2) ){
        !           481:        z += 2;
        !           482:      }
        !           483:      break;
        !           484:    case 'l':
        !           485:      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
        !           486:        z += 4;
        !           487:      }
        !           488:      break;
        !           489:    case 'n':
        !           490:      if( z[0]=='t' ){
        !           491:        if( z[2]=='a' ){
        !           492:          if( m_gt_1(z+3) ){
        !           493:            z += 3;
        !           494:          }
        !           495:        }else if( z[2]=='e' ){
        !           496:          stem(&z, "tneme", "", m_gt_1) ||
        !           497:          stem(&z, "tnem", "", m_gt_1) ||
        !           498:          stem(&z, "tne", "", m_gt_1);
        !           499:        }
        !           500:      }
        !           501:      break;
        !           502:    case 'o':
        !           503:      if( z[0]=='u' ){
        !           504:        if( m_gt_1(z+2) ){
        !           505:          z += 2;
        !           506:        }
        !           507:      }else if( z[3]=='s' || z[3]=='t' ){
        !           508:        stem(&z, "noi", "", m_gt_1);
        !           509:      }
        !           510:      break;
        !           511:    case 's':
        !           512:      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
        !           513:        z += 3;
        !           514:      }
        !           515:      break;
        !           516:    case 't':
        !           517:      stem(&z, "eta", "", m_gt_1) ||
        !           518:      stem(&z, "iti", "", m_gt_1);
        !           519:      break;
        !           520:    case 'u':
        !           521:      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
        !           522:        z += 3;
        !           523:      }
        !           524:      break;
        !           525:    case 'v':
        !           526:    case 'z':
        !           527:      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
        !           528:        z += 3;
        !           529:      }
        !           530:      break;
        !           531:   }
        !           532: 
        !           533:   /* Step 5a */
        !           534:   if( z[0]=='e' ){
        !           535:     if( m_gt_1(z+1) ){
        !           536:       z++;
        !           537:     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
        !           538:       z++;
        !           539:     }
        !           540:   }
        !           541: 
        !           542:   /* Step 5b */
        !           543:   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
        !           544:     z++;
        !           545:   }
        !           546: 
        !           547:   /* z[] is now the stemmed word in reverse order.  Flip it back
        !           548:   ** around into forward order and return.
        !           549:   */
        !           550:   *pnOut = i = strlen(z);
        !           551:   zOut[i] = 0;
        !           552:   while( *z ){
        !           553:     zOut[--i] = *(z++);
        !           554:   }
        !           555: }
        !           556: 
        /*
        ** Characters that can be part of a token.  We assume any character
        ** whose value is 0x80 or greater (any byte of a multi-byte UTF
        ** character) can be part of a token.  In other words, delimiters all
        ** must have values of 0x7f or lower.
        **
        ** isIdChar[] covers the characters 0x30 through 0x7f and is indexed
        ** by (character - 0x30).  Digits, letters and '_' are token
        ** characters.  Everything below 0x30 is a delimiter.
        **
        ** Both macros store their argument in a variable named "ch", which
        ** the code using them must declare as an integer.
        */
        static const char isIdChar[] = {
        /*  x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x:  0-9        */
            0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x:  A-O        */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x:  P-Z and _  */
            0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x:  a-o        */
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x:  p-z        */
        };
        #define idChar(C)  (((ch=(C))&0x80)!=0 || (ch>=0x30 && isIdChar[ch-0x30]!=0))
        #define isDelim(C) (((ch=(C))&0x80)==0 && (ch<=0x2f || isIdChar[ch-0x30]==0))
        !           573: 
        !           574: /*
        !           575: ** Extract the next token from a tokenization cursor.  The cursor must
        !           576: ** have been opened by a prior call to porterOpen().
        !           577: */
        !           578: static int porterNext(
        !           579:   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
        !           580:   const char **pzToken,               /* OUT: *pzToken is the token text */
        !           581:   int *pnBytes,                       /* OUT: Number of bytes in token */
        !           582:   int *piStartOffset,                 /* OUT: Starting offset of token */
        !           583:   int *piEndOffset,                   /* OUT: Ending offset of token */
        !           584:   int *piPosition                     /* OUT: Position integer of token */
        !           585: ){
        !           586:   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
        !           587:   const char *z = c->zInput;
        !           588: 
        !           589:   while( c->iOffset<c->nInput ){
        !           590:     int iStartOffset, ch;
        !           591: 
        !           592:     /* Scan past delimiter characters */
        !           593:     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
        !           594:       c->iOffset++;
        !           595:     }
        !           596: 
        !           597:     /* Count non-delimiter characters. */
        !           598:     iStartOffset = c->iOffset;
        !           599:     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
        !           600:       c->iOffset++;
        !           601:     }
        !           602: 
        !           603:     if( c->iOffset>iStartOffset ){
        !           604:       int n = c->iOffset-iStartOffset;
        !           605:       if( n>c->nAllocated ){
        !           606:         c->nAllocated = n+20;
        !           607:         c->zToken = realloc(c->zToken, c->nAllocated);
        !           608:         if( c->zToken==NULL ) return SQLITE_NOMEM;
        !           609:       }
        !           610:       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
        !           611:       *pzToken = c->zToken;
        !           612:       *piStartOffset = iStartOffset;
        !           613:       *piEndOffset = c->iOffset;
        !           614:       *piPosition = c->iToken++;
        !           615:       return SQLITE_OK;
        !           616:     }
        !           617:   }
        !           618:   return SQLITE_DONE;
        !           619: }
        !           620: 
        !           621: /*
        !           622: ** The set of routines that implement the porter-stemmer tokenizer
        !           623: */
        !           624: static const sqlite3_tokenizer_module porterTokenizerModule = {
        !           625:   0,
        !           626:   porterCreate,
        !           627:   porterDestroy,
        !           628:   porterOpen,
        !           629:   porterClose,
        !           630:   porterNext,
        !           631: };
        !           632: 
        !           633: /*
        !           634: ** Allocate a new porter tokenizer.  Return a pointer to the new
        !           635: ** tokenizer in *ppModule
        !           636: */
        !           637: void sqlite3Fts1PorterTokenizerModule(
        !           638:   sqlite3_tokenizer_module const**ppModule
        !           639: ){
        !           640:   *ppModule = &porterTokenizerModule;
        !           641: }
        !           642: 
        !           643: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>