Annotation of embedaddon/sqlite3/ext/fts1/fts1_porter.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2006 September 30
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: *************************************************************************
12: ** Implementation of the full-text-search tokenizer that implements
13: ** a Porter stemmer.
14: */
15:
16: /*
17: ** The code in this file is only compiled if:
18: **
19: ** * The FTS1 module is being built as an extension
20: ** (in which case SQLITE_CORE is not defined), or
21: **
22: ** * The FTS1 module is being built into the core of
23: ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
24: */
25: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
26:
27:
28: #include <assert.h>
29: #include <stdlib.h>
30: #include <stdio.h>
31: #include <string.h>
32: #include <ctype.h>
33:
34: #include "fts1_tokenizer.h"
35:
36: /*
37: ** Class derived from sqlite3_tokenizer
38: */
39: typedef struct porter_tokenizer {
40: sqlite3_tokenizer base; /* Base class */
41: } porter_tokenizer;
42:
43: /*
44: ** Class derived from sqlit3_tokenizer_cursor
45: */
46: typedef struct porter_tokenizer_cursor {
47: sqlite3_tokenizer_cursor base;
48: const char *zInput; /* input we are tokenizing */
49: int nInput; /* size of the input */
50: int iOffset; /* current position in zInput */
51: int iToken; /* index of next token to be returned */
52: char *zToken; /* storage for current token */
53: int nAllocated; /* space allocated to zToken buffer */
54: } porter_tokenizer_cursor;
55:
56:
57: /* Forward declaration */
58: static const sqlite3_tokenizer_module porterTokenizerModule;
59:
60:
61: /*
62: ** Create a new tokenizer instance.
63: */
64: static int porterCreate(
65: int argc, const char * const *argv,
66: sqlite3_tokenizer **ppTokenizer
67: ){
68: porter_tokenizer *t;
69: t = (porter_tokenizer *) calloc(sizeof(*t), 1);
70: if( t==NULL ) return SQLITE_NOMEM;
71:
72: *ppTokenizer = &t->base;
73: return SQLITE_OK;
74: }
75:
76: /*
77: ** Destroy a tokenizer
78: */
79: static int porterDestroy(sqlite3_tokenizer *pTokenizer){
80: free(pTokenizer);
81: return SQLITE_OK;
82: }
83:
84: /*
85: ** Prepare to begin tokenizing a particular string. The input
86: ** string to be tokenized is zInput[0..nInput-1]. A cursor
87: ** used to incrementally tokenize this string is returned in
88: ** *ppCursor.
89: */
90: static int porterOpen(
91: sqlite3_tokenizer *pTokenizer, /* The tokenizer */
92: const char *zInput, int nInput, /* String to be tokenized */
93: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
94: ){
95: porter_tokenizer_cursor *c;
96:
97: c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
98: if( c==NULL ) return SQLITE_NOMEM;
99:
100: c->zInput = zInput;
101: if( zInput==0 ){
102: c->nInput = 0;
103: }else if( nInput<0 ){
104: c->nInput = (int)strlen(zInput);
105: }else{
106: c->nInput = nInput;
107: }
108: c->iOffset = 0; /* start tokenizing at the beginning */
109: c->iToken = 0;
110: c->zToken = NULL; /* no space allocated, yet. */
111: c->nAllocated = 0;
112:
113: *ppCursor = &c->base;
114: return SQLITE_OK;
115: }
116:
117: /*
118: ** Close a tokenization cursor previously opened by a call to
119: ** porterOpen() above.
120: */
121: static int porterClose(sqlite3_tokenizer_cursor *pCursor){
122: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
123: free(c->zToken);
124: free(c);
125: return SQLITE_OK;
126: }
127: /*
128: ** Vowel or consonant
129: */
130: static const char cType[] = {
131: 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
132: 1, 1, 1, 2, 1
133: };
134:
135: /*
136: ** isConsonant() and isVowel() determine if their first character in
137: ** the string they point to is a consonant or a vowel, according
138: ** to Porter ruls.
139: **
140: ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
141: ** 'Y' is a consonant unless it follows another consonant,
142: ** in which case it is a vowel.
143: **
144: ** In these routine, the letters are in reverse order. So the 'y' rule
145: ** is that 'y' is a consonant unless it is followed by another
146: ** consonent.
147: */
148: static int isVowel(const char*);
149: static int isConsonant(const char *z){
150: int j;
151: char x = *z;
152: if( x==0 ) return 0;
153: assert( x>='a' && x<='z' );
154: j = cType[x-'a'];
155: if( j<2 ) return j;
156: return z[1]==0 || isVowel(z + 1);
157: }
158: static int isVowel(const char *z){
159: int j;
160: char x = *z;
161: if( x==0 ) return 0;
162: assert( x>='a' && x<='z' );
163: j = cType[x-'a'];
164: if( j<2 ) return 1-j;
165: return isConsonant(z + 1);
166: }
167:
168: /*
169: ** Let any sequence of one or more vowels be represented by V and let
170: ** C be sequence of one or more consonants. Then every word can be
171: ** represented as:
172: **
173: ** [C] (VC){m} [V]
174: **
175: ** In prose: A word is an optional consonant followed by zero or
176: ** vowel-consonant pairs followed by an optional vowel. "m" is the
177: ** number of vowel consonant pairs. This routine computes the value
178: ** of m for the first i bytes of a word.
179: **
180: ** Return true if the m-value for z is 1 or more. In other words,
181: ** return true if z contains at least one vowel that is followed
182: ** by a consonant.
183: **
184: ** In this routine z[] is in reverse order. So we are really looking
185: ** for an instance of of a consonant followed by a vowel.
186: */
187: static int m_gt_0(const char *z){
188: while( isVowel(z) ){ z++; }
189: if( *z==0 ) return 0;
190: while( isConsonant(z) ){ z++; }
191: return *z!=0;
192: }
193:
194: /* Like mgt0 above except we are looking for a value of m which is
195: ** exactly 1
196: */
197: static int m_eq_1(const char *z){
198: while( isVowel(z) ){ z++; }
199: if( *z==0 ) return 0;
200: while( isConsonant(z) ){ z++; }
201: if( *z==0 ) return 0;
202: while( isVowel(z) ){ z++; }
203: if( *z==0 ) return 1;
204: while( isConsonant(z) ){ z++; }
205: return *z==0;
206: }
207:
208: /* Like mgt0 above except we are looking for a value of m>1 instead
209: ** or m>0
210: */
211: static int m_gt_1(const char *z){
212: while( isVowel(z) ){ z++; }
213: if( *z==0 ) return 0;
214: while( isConsonant(z) ){ z++; }
215: if( *z==0 ) return 0;
216: while( isVowel(z) ){ z++; }
217: if( *z==0 ) return 0;
218: while( isConsonant(z) ){ z++; }
219: return *z!=0;
220: }
221:
222: /*
223: ** Return TRUE if there is a vowel anywhere within z[0..n-1]
224: */
225: static int hasVowel(const char *z){
226: while( isConsonant(z) ){ z++; }
227: return *z!=0;
228: }
229:
230: /*
231: ** Return TRUE if the word ends in a double consonant.
232: **
233: ** The text is reversed here. So we are really looking at
234: ** the first two characters of z[].
235: */
236: static int doubleConsonant(const char *z){
237: return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
238: }
239:
240: /*
241: ** Return TRUE if the word ends with three letters which
242: ** are consonant-vowel-consonent and where the final consonant
243: ** is not 'w', 'x', or 'y'.
244: **
245: ** The word is reversed here. So we are really checking the
246: ** first three letters and the first one cannot be in [wxy].
247: */
248: static int star_oh(const char *z){
249: return
250: z[0]!=0 && isConsonant(z) &&
251: z[0]!='w' && z[0]!='x' && z[0]!='y' &&
252: z[1]!=0 && isVowel(z+1) &&
253: z[2]!=0 && isConsonant(z+2);
254: }
255:
256: /*
257: ** If the word ends with zFrom and xCond() is true for the stem
258: ** of the word that preceeds the zFrom ending, then change the
259: ** ending to zTo.
260: **
261: ** The input word *pz and zFrom are both in reverse order. zTo
262: ** is in normal order.
263: **
264: ** Return TRUE if zFrom matches. Return FALSE if zFrom does not
265: ** match. Not that TRUE is returned even if xCond() fails and
266: ** no substitution occurs.
267: */
268: static int stem(
269: char **pz, /* The word being stemmed (Reversed) */
270: const char *zFrom, /* If the ending matches this... (Reversed) */
271: const char *zTo, /* ... change the ending to this (not reversed) */
272: int (*xCond)(const char*) /* Condition that must be true */
273: ){
274: char *z = *pz;
275: while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
276: if( *zFrom!=0 ) return 0;
277: if( xCond && !xCond(z) ) return 1;
278: while( *zTo ){
279: *(--z) = *(zTo++);
280: }
281: *pz = z;
282: return 1;
283: }
284:
285: /*
286: ** This is the fallback stemmer used when the porter stemmer is
287: ** inappropriate. The input word is copied into the output with
288: ** US-ASCII case folding. If the input word is too long (more
289: ** than 20 bytes if it contains no digits or more than 6 bytes if
290: ** it contains digits) then word is truncated to 20 or 6 bytes
291: ** by taking 10 or 3 bytes from the beginning and end.
292: */
293: static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
294: int i, mx, j;
295: int hasDigit = 0;
296: for(i=0; i<nIn; i++){
297: int c = zIn[i];
298: if( c>='A' && c<='Z' ){
299: zOut[i] = c - 'A' + 'a';
300: }else{
301: if( c>='0' && c<='9' ) hasDigit = 1;
302: zOut[i] = c;
303: }
304: }
305: mx = hasDigit ? 3 : 10;
306: if( nIn>mx*2 ){
307: for(j=mx, i=nIn-mx; i<nIn; i++, j++){
308: zOut[j] = zOut[i];
309: }
310: i = j;
311: }
312: zOut[i] = 0;
313: *pnOut = i;
314: }
315:
316:
317: /*
318: ** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
319: ** zOut is at least big enough to hold nIn bytes. Write the actual
320: ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
321: **
322: ** Any upper-case characters in the US-ASCII character set ([A-Z])
323: ** are converted to lower case. Upper-case UTF characters are
324: ** unchanged.
325: **
326: ** Words that are longer than about 20 bytes are stemmed by retaining
327: ** a few bytes from the beginning and the end of the word. If the
328: ** word contains digits, 3 bytes are taken from the beginning and
329: ** 3 bytes from the end. For long words without digits, 10 bytes
330: ** are taken from each end. US-ASCII case folding still applies.
331: **
332: ** If the input word contains not digits but does characters not
333: ** in [a-zA-Z] then no stemming is attempted and this routine just
334: ** copies the input into the input into the output with US-ASCII
335: ** case folding.
336: **
337: ** Stemming never increases the length of the word. So there is
338: ** no chance of overflowing the zOut buffer.
339: */
340: static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
341: int i, j, c;
342: char zReverse[28];
343: char *z, *z2;
344: if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
345: /* The word is too big or too small for the porter stemmer.
346: ** Fallback to the copy stemmer */
347: copy_stemmer(zIn, nIn, zOut, pnOut);
348: return;
349: }
350: for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
351: c = zIn[i];
352: if( c>='A' && c<='Z' ){
353: zReverse[j] = c + 'a' - 'A';
354: }else if( c>='a' && c<='z' ){
355: zReverse[j] = c;
356: }else{
357: /* The use of a character not in [a-zA-Z] means that we fallback
358: ** to the copy stemmer */
359: copy_stemmer(zIn, nIn, zOut, pnOut);
360: return;
361: }
362: }
363: memset(&zReverse[sizeof(zReverse)-5], 0, 5);
364: z = &zReverse[j+1];
365:
366:
367: /* Step 1a */
368: if( z[0]=='s' ){
369: if(
370: !stem(&z, "sess", "ss", 0) &&
371: !stem(&z, "sei", "i", 0) &&
372: !stem(&z, "ss", "ss", 0)
373: ){
374: z++;
375: }
376: }
377:
378: /* Step 1b */
379: z2 = z;
380: if( stem(&z, "dee", "ee", m_gt_0) ){
381: /* Do nothing. The work was all in the test */
382: }else if(
383: (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
384: && z!=z2
385: ){
386: if( stem(&z, "ta", "ate", 0) ||
387: stem(&z, "lb", "ble", 0) ||
388: stem(&z, "zi", "ize", 0) ){
389: /* Do nothing. The work was all in the test */
390: }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
391: z++;
392: }else if( m_eq_1(z) && star_oh(z) ){
393: *(--z) = 'e';
394: }
395: }
396:
397: /* Step 1c */
398: if( z[0]=='y' && hasVowel(z+1) ){
399: z[0] = 'i';
400: }
401:
402: /* Step 2 */
403: switch( z[1] ){
404: case 'a':
405: stem(&z, "lanoita", "ate", m_gt_0) ||
406: stem(&z, "lanoit", "tion", m_gt_0);
407: break;
408: case 'c':
409: stem(&z, "icne", "ence", m_gt_0) ||
410: stem(&z, "icna", "ance", m_gt_0);
411: break;
412: case 'e':
413: stem(&z, "rezi", "ize", m_gt_0);
414: break;
415: case 'g':
416: stem(&z, "igol", "log", m_gt_0);
417: break;
418: case 'l':
419: stem(&z, "ilb", "ble", m_gt_0) ||
420: stem(&z, "illa", "al", m_gt_0) ||
421: stem(&z, "iltne", "ent", m_gt_0) ||
422: stem(&z, "ile", "e", m_gt_0) ||
423: stem(&z, "ilsuo", "ous", m_gt_0);
424: break;
425: case 'o':
426: stem(&z, "noitazi", "ize", m_gt_0) ||
427: stem(&z, "noita", "ate", m_gt_0) ||
428: stem(&z, "rota", "ate", m_gt_0);
429: break;
430: case 's':
431: stem(&z, "msila", "al", m_gt_0) ||
432: stem(&z, "ssenevi", "ive", m_gt_0) ||
433: stem(&z, "ssenluf", "ful", m_gt_0) ||
434: stem(&z, "ssensuo", "ous", m_gt_0);
435: break;
436: case 't':
437: stem(&z, "itila", "al", m_gt_0) ||
438: stem(&z, "itivi", "ive", m_gt_0) ||
439: stem(&z, "itilib", "ble", m_gt_0);
440: break;
441: }
442:
443: /* Step 3 */
444: switch( z[0] ){
445: case 'e':
446: stem(&z, "etaci", "ic", m_gt_0) ||
447: stem(&z, "evita", "", m_gt_0) ||
448: stem(&z, "ezila", "al", m_gt_0);
449: break;
450: case 'i':
451: stem(&z, "itici", "ic", m_gt_0);
452: break;
453: case 'l':
454: stem(&z, "laci", "ic", m_gt_0) ||
455: stem(&z, "luf", "", m_gt_0);
456: break;
457: case 's':
458: stem(&z, "ssen", "", m_gt_0);
459: break;
460: }
461:
462: /* Step 4 */
463: switch( z[1] ){
464: case 'a':
465: if( z[0]=='l' && m_gt_1(z+2) ){
466: z += 2;
467: }
468: break;
469: case 'c':
470: if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
471: z += 4;
472: }
473: break;
474: case 'e':
475: if( z[0]=='r' && m_gt_1(z+2) ){
476: z += 2;
477: }
478: break;
479: case 'i':
480: if( z[0]=='c' && m_gt_1(z+2) ){
481: z += 2;
482: }
483: break;
484: case 'l':
485: if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
486: z += 4;
487: }
488: break;
489: case 'n':
490: if( z[0]=='t' ){
491: if( z[2]=='a' ){
492: if( m_gt_1(z+3) ){
493: z += 3;
494: }
495: }else if( z[2]=='e' ){
496: stem(&z, "tneme", "", m_gt_1) ||
497: stem(&z, "tnem", "", m_gt_1) ||
498: stem(&z, "tne", "", m_gt_1);
499: }
500: }
501: break;
502: case 'o':
503: if( z[0]=='u' ){
504: if( m_gt_1(z+2) ){
505: z += 2;
506: }
507: }else if( z[3]=='s' || z[3]=='t' ){
508: stem(&z, "noi", "", m_gt_1);
509: }
510: break;
511: case 's':
512: if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
513: z += 3;
514: }
515: break;
516: case 't':
517: stem(&z, "eta", "", m_gt_1) ||
518: stem(&z, "iti", "", m_gt_1);
519: break;
520: case 'u':
521: if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
522: z += 3;
523: }
524: break;
525: case 'v':
526: case 'z':
527: if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
528: z += 3;
529: }
530: break;
531: }
532:
533: /* Step 5a */
534: if( z[0]=='e' ){
535: if( m_gt_1(z+1) ){
536: z++;
537: }else if( m_eq_1(z+1) && !star_oh(z+1) ){
538: z++;
539: }
540: }
541:
542: /* Step 5b */
543: if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
544: z++;
545: }
546:
547: /* z[] is now the stemmed word in reverse order. Flip it back
548: ** around into forward order and return.
549: */
550: *pnOut = i = strlen(z);
551: zOut[i] = 0;
552: while( *z ){
553: zOut[--i] = *(z++);
554: }
555: }
556:
557: /*
558: ** Characters that can be part of a token. We assume any character
559: ** whose value is greater than 0x80 (any UTF character) can be
560: ** part of a token. In other words, delimiters all must have
561: ** values of 0x7f or lower.
562: */
563: static const char isIdChar[] = {
564: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
565: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
566: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
567: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
568: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
569: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
570: };
571: #define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
572: #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
573:
574: /*
575: ** Extract the next token from a tokenization cursor. The cursor must
576: ** have been opened by a prior call to porterOpen().
577: */
578: static int porterNext(
579: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
580: const char **pzToken, /* OUT: *pzToken is the token text */
581: int *pnBytes, /* OUT: Number of bytes in token */
582: int *piStartOffset, /* OUT: Starting offset of token */
583: int *piEndOffset, /* OUT: Ending offset of token */
584: int *piPosition /* OUT: Position integer of token */
585: ){
586: porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
587: const char *z = c->zInput;
588:
589: while( c->iOffset<c->nInput ){
590: int iStartOffset, ch;
591:
592: /* Scan past delimiter characters */
593: while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
594: c->iOffset++;
595: }
596:
597: /* Count non-delimiter characters. */
598: iStartOffset = c->iOffset;
599: while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
600: c->iOffset++;
601: }
602:
603: if( c->iOffset>iStartOffset ){
604: int n = c->iOffset-iStartOffset;
605: if( n>c->nAllocated ){
606: c->nAllocated = n+20;
607: c->zToken = realloc(c->zToken, c->nAllocated);
608: if( c->zToken==NULL ) return SQLITE_NOMEM;
609: }
610: porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
611: *pzToken = c->zToken;
612: *piStartOffset = iStartOffset;
613: *piEndOffset = c->iOffset;
614: *piPosition = c->iToken++;
615: return SQLITE_OK;
616: }
617: }
618: return SQLITE_DONE;
619: }
620:
621: /*
622: ** The set of routines that implement the porter-stemmer tokenizer
623: */
624: static const sqlite3_tokenizer_module porterTokenizerModule = {
625: 0,
626: porterCreate,
627: porterDestroy,
628: porterOpen,
629: porterClose,
630: porterNext,
631: };
632:
633: /*
634: ** Allocate a new porter tokenizer. Return a pointer to the new
635: ** tokenizer in *ppModule
636: */
637: void sqlite3Fts1PorterTokenizerModule(
638: sqlite3_tokenizer_module const**ppModule
639: ){
640: *ppModule = &porterTokenizerModule;
641: }
642:
643: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>