Annotation of embedaddon/sqlite3/ext/fts1/fts1.c, revision 1.1.1.1
1.1 misho 1: /* fts1 has a design flaw which can lead to database corruption (see
2: ** below). It is recommended not to use it any longer, instead use
3: ** fts3 (or higher). If you believe that your use of fts1 is safe,
4: ** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
5: */
6: #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
7: && !defined(SQLITE_ENABLE_BROKEN_FTS1)
8: #error fts1 has a design flaw and has been deprecated.
9: #endif
10: /* The flaw is that fts1 uses the content table's unaliased rowid as
11: ** the unique docid. fts1 embeds the rowid in the index it builds,
12: ** and expects the rowid to not change. The SQLite VACUUM operation
13: ** will renumber such rowids, thereby breaking fts1. If you are using
14: ** fts1 in a system which has disabled VACUUM, then you can continue
15: ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
16: ** VACUUM, though systems using auto_vacuum are unlikely to invoke
17: ** VACUUM.
18: **
19: ** fts1 should be safe even across VACUUM if you only insert documents
20: ** and never delete.
21: */
22:
23: /* The author disclaims copyright to this source code.
24: *
25: * This is an SQLite module implementing full-text search.
26: */
27:
28: /*
29: ** The code in this file is only compiled if:
30: **
31: ** * The FTS1 module is being built as an extension
32: ** (in which case SQLITE_CORE is not defined), or
33: **
34: ** * The FTS1 module is being built into the core of
35: ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
36: */
37: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
38:
39: #if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
40: # define SQLITE_CORE 1
41: #endif
42:
43: #include <assert.h>
44: #include <stdlib.h>
45: #include <stdio.h>
46: #include <string.h>
47: #include <ctype.h>
48:
49: #include "fts1.h"
50: #include "fts1_hash.h"
51: #include "fts1_tokenizer.h"
52: #include "sqlite3.h"
53: #include "sqlite3ext.h"
54: SQLITE_EXTENSION_INIT1
55:
56:
57: #if 0
58: # define TRACE(A) printf A; fflush(stdout)
59: #else
60: # define TRACE(A)
61: #endif
62:
63: /* utility functions */
64:
65: typedef struct StringBuffer {
66: int len; /* length, not including null terminator */
67: int alloced; /* Space allocated for s[] */
68: char *s; /* Content of the string */
69: } StringBuffer;
70:
71: static void initStringBuffer(StringBuffer *sb){
72: sb->len = 0;
73: sb->alloced = 100;
74: sb->s = malloc(100);
75: sb->s[0] = '\0';
76: }
77:
78: static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
79: if( sb->len + nFrom >= sb->alloced ){
80: sb->alloced = sb->len + nFrom + 100;
81: sb->s = realloc(sb->s, sb->alloced+1);
82: if( sb->s==0 ){
83: initStringBuffer(sb);
84: return;
85: }
86: }
87: memcpy(sb->s + sb->len, zFrom, nFrom);
88: sb->len += nFrom;
89: sb->s[sb->len] = 0;
90: }
91: static void append(StringBuffer *sb, const char *zFrom){
92: nappend(sb, zFrom, strlen(zFrom));
93: }
94:
95: /* We encode variable-length integers in little-endian order using seven bits
96: * per byte as follows:
97: **
98: ** KEY:
99: ** A = 0xxxxxxx 7 bits of data and one flag bit
100: ** B = 1xxxxxxx 7 bits of data and one flag bit
101: **
102: ** 7 bits - A
103: ** 14 bits - BA
104: ** 21 bits - BBA
105: ** and so on.
106: */
107:
108: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
109: #define VARINT_MAX 10
110:
111: /* Write a 64-bit variable-length integer to memory starting at p[0].
112: * The length of data written will be between 1 and VARINT_MAX bytes.
113: * The number of bytes written is returned. */
114: static int putVarint(char *p, sqlite_int64 v){
115: unsigned char *q = (unsigned char *) p;
116: sqlite_uint64 vu = v;
117: do{
118: *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
119: vu >>= 7;
120: }while( vu!=0 );
121: q[-1] &= 0x7f; /* turn off high bit in final byte */
122: assert( q - (unsigned char *)p <= VARINT_MAX );
123: return (int) (q - (unsigned char *)p);
124: }
125:
126: /* Read a 64-bit variable-length integer from memory starting at p[0].
127: * Return the number of bytes read, or 0 on error.
128: * The value is stored in *v. */
129: static int getVarint(const char *p, sqlite_int64 *v){
130: const unsigned char *q = (const unsigned char *) p;
131: sqlite_uint64 x = 0, y = 1;
132: while( (*q & 0x80) == 0x80 ){
133: x += y * (*q++ & 0x7f);
134: y <<= 7;
135: if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
136: assert( 0 );
137: return 0;
138: }
139: }
140: x += y * (*q++);
141: *v = (sqlite_int64) x;
142: return (int) (q - (unsigned char *)p);
143: }
144:
145: static int getVarint32(const char *p, int *pi){
146: sqlite_int64 i;
147: int ret = getVarint(p, &i);
148: *pi = (int) i;
149: assert( *pi==i );
150: return ret;
151: }
152:
153: /*** Document lists ***
154: *
155: * A document list holds a sorted list of varint-encoded document IDs.
156: *
157: * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
158: *
159: * array {
160: * varint docid;
161: * array {
162: * varint position; (delta from previous position plus POS_BASE)
163: * varint startOffset; (delta from previous startOffset)
164: * varint endOffset; (delta from startOffset)
165: * }
166: * }
167: *
168: * Here, array { X } means zero or more occurrences of X, adjacent in memory.
169: *
170: * A position list may hold positions for text in multiple columns. A position
171: * POS_COLUMN is followed by a varint containing the index of the column for
172: * following positions in the list. Any positions appearing before any
173: * occurrences of POS_COLUMN are for column 0.
174: *
175: * A doclist with type DL_POSITIONS is like the above, but holds only docids
176: * and positions without offset information.
177: *
178: * A doclist with type DL_DOCIDS is like the above, but holds only docids
179: * without positions or offset information.
180: *
181: * On disk, every document list has positions and offsets, so we don't bother
182: * to serialize a doclist's type.
183: *
184: * We don't yet delta-encode document IDs; doing so will probably be a
185: * modest win.
186: *
187: * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
188: * After the first offset, estimate the next offset by using the
189: * current token position and the previous token position and offset,
190: * offset to handle some variance. So the estimate would be
191: * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
192: * as normal. Offsets more than 64 chars from the estimate are
193: * encoded as the delta to the previous start offset + 128. An
194: * additional tiny increment can be gained by using the end offset of
195: * the previous token to make the estimate a tiny bit more precise.
196: */
197:
198: /* It is not safe to call isspace(), tolower(), or isalnum() on
199: ** hi-bit-set characters. This is the same solution used in the
200: ** tokenizer.
201: */
202: /* TODO(shess) The snippet-generation code should be using the
203: ** tokenizer-generated tokens rather than doing its own local
204: ** tokenization.
205: */
206: /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
207: static int safe_isspace(char c){
208: return (c&0x80)==0 ? isspace(c) : 0;
209: }
210: static int safe_tolower(char c){
211: return (c&0x80)==0 ? tolower(c) : c;
212: }
213: static int safe_isalnum(char c){
214: return (c&0x80)==0 ? isalnum(c) : 0;
215: }
216:
217: typedef enum DocListType {
218: DL_DOCIDS, /* docids only */
219: DL_POSITIONS, /* docids + positions */
220: DL_POSITIONS_OFFSETS /* docids + positions + offsets */
221: } DocListType;
222:
223: /*
224: ** By default, only positions and not offsets are stored in the doclists.
225: ** To change this so that offsets are stored too, compile with
226: **
227: ** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
228: **
229: */
230: #ifndef DL_DEFAULT
231: # define DL_DEFAULT DL_POSITIONS
232: #endif
233:
234: typedef struct DocList {
235: char *pData;
236: int nData;
237: DocListType iType;
238: int iLastColumn; /* the last column written */
239: int iLastPos; /* the last position written */
240: int iLastOffset; /* the last start offset written */
241: } DocList;
242:
243: enum {
244: POS_END = 0, /* end of this position list */
245: POS_COLUMN, /* followed by new column number */
246: POS_BASE
247: };
248:
249: /* Initialize a new DocList to hold the given data. */
250: static void docListInit(DocList *d, DocListType iType,
251: const char *pData, int nData){
252: d->nData = nData;
253: if( nData>0 ){
254: d->pData = malloc(nData);
255: memcpy(d->pData, pData, nData);
256: } else {
257: d->pData = NULL;
258: }
259: d->iType = iType;
260: d->iLastColumn = 0;
261: d->iLastPos = d->iLastOffset = 0;
262: }
263:
264: /* Create a new dynamically-allocated DocList. */
265: static DocList *docListNew(DocListType iType){
266: DocList *d = (DocList *) malloc(sizeof(DocList));
267: docListInit(d, iType, 0, 0);
268: return d;
269: }
270:
271: static void docListDestroy(DocList *d){
272: free(d->pData);
273: #ifndef NDEBUG
274: memset(d, 0x55, sizeof(*d));
275: #endif
276: }
277:
278: static void docListDelete(DocList *d){
279: docListDestroy(d);
280: free(d);
281: }
282:
283: static char *docListEnd(DocList *d){
284: return d->pData + d->nData;
285: }
286:
287: /* Append a varint to a DocList's data. */
288: static void appendVarint(DocList *d, sqlite_int64 i){
289: char c[VARINT_MAX];
290: int n = putVarint(c, i);
291: d->pData = realloc(d->pData, d->nData + n);
292: memcpy(d->pData + d->nData, c, n);
293: d->nData += n;
294: }
295:
296: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
297: appendVarint(d, iDocid);
298: if( d->iType>=DL_POSITIONS ){
299: appendVarint(d, POS_END); /* initially empty position list */
300: d->iLastColumn = 0;
301: d->iLastPos = d->iLastOffset = 0;
302: }
303: }
304:
305: /* helper function for docListAddPos and docListAddPosOffset */
306: static void addPos(DocList *d, int iColumn, int iPos){
307: assert( d->nData>0 );
308: --d->nData; /* remove previous terminator */
309: if( iColumn!=d->iLastColumn ){
310: assert( iColumn>d->iLastColumn );
311: appendVarint(d, POS_COLUMN);
312: appendVarint(d, iColumn);
313: d->iLastColumn = iColumn;
314: d->iLastPos = d->iLastOffset = 0;
315: }
316: assert( iPos>=d->iLastPos );
317: appendVarint(d, iPos-d->iLastPos+POS_BASE);
318: d->iLastPos = iPos;
319: }
320:
321: /* Add a position to the last position list in a doclist. */
322: static void docListAddPos(DocList *d, int iColumn, int iPos){
323: assert( d->iType==DL_POSITIONS );
324: addPos(d, iColumn, iPos);
325: appendVarint(d, POS_END); /* add new terminator */
326: }
327:
328: /*
329: ** Add a position and starting and ending offsets to a doclist.
330: **
331: ** If the doclist is setup to handle only positions, then insert
332: ** the position only and ignore the offsets.
333: */
334: static void docListAddPosOffset(
335: DocList *d, /* Doclist under construction */
336: int iColumn, /* Column the inserted term is part of */
337: int iPos, /* Position of the inserted term */
338: int iStartOffset, /* Starting offset of inserted term */
339: int iEndOffset /* Ending offset of inserted term */
340: ){
341: assert( d->iType>=DL_POSITIONS );
342: addPos(d, iColumn, iPos);
343: if( d->iType==DL_POSITIONS_OFFSETS ){
344: assert( iStartOffset>=d->iLastOffset );
345: appendVarint(d, iStartOffset-d->iLastOffset);
346: d->iLastOffset = iStartOffset;
347: assert( iEndOffset>=iStartOffset );
348: appendVarint(d, iEndOffset-iStartOffset);
349: }
350: appendVarint(d, POS_END); /* add new terminator */
351: }
352:
353: /*
354: ** A DocListReader object is a cursor into a doclist. Initialize
355: ** the cursor to the beginning of the doclist by calling readerInit().
356: ** Then use routines
357: **
358: ** peekDocid()
359: ** readDocid()
360: ** readPosition()
361: ** skipPositionList()
362: ** and so forth...
363: **
364: ** to read information out of the doclist. When we reach the end
365: ** of the doclist, atEnd() returns TRUE.
366: */
367: typedef struct DocListReader {
368: DocList *pDoclist; /* The document list we are stepping through */
369: char *p; /* Pointer to next unread byte in the doclist */
370: int iLastColumn;
371: int iLastPos; /* the last position read, or -1 when not in a position list */
372: } DocListReader;
373:
374: /*
375: ** Initialize the DocListReader r to point to the beginning of pDoclist.
376: */
377: static void readerInit(DocListReader *r, DocList *pDoclist){
378: r->pDoclist = pDoclist;
379: if( pDoclist!=NULL ){
380: r->p = pDoclist->pData;
381: }
382: r->iLastColumn = -1;
383: r->iLastPos = -1;
384: }
385:
386: /*
387: ** Return TRUE if we have reached then end of pReader and there is
388: ** nothing else left to read.
389: */
390: static int atEnd(DocListReader *pReader){
391: return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
392: }
393:
394: /* Peek at the next docid without advancing the read pointer.
395: */
396: static sqlite_int64 peekDocid(DocListReader *pReader){
397: sqlite_int64 ret;
398: assert( !atEnd(pReader) );
399: assert( pReader->iLastPos==-1 );
400: getVarint(pReader->p, &ret);
401: return ret;
402: }
403:
404: /* Read the next docid. See also nextDocid().
405: */
406: static sqlite_int64 readDocid(DocListReader *pReader){
407: sqlite_int64 ret;
408: assert( !atEnd(pReader) );
409: assert( pReader->iLastPos==-1 );
410: pReader->p += getVarint(pReader->p, &ret);
411: if( pReader->pDoclist->iType>=DL_POSITIONS ){
412: pReader->iLastColumn = 0;
413: pReader->iLastPos = 0;
414: }
415: return ret;
416: }
417:
418: /* Read the next position and column index from a position list.
419: * Returns the position, or -1 at the end of the list. */
420: static int readPosition(DocListReader *pReader, int *iColumn){
421: int i;
422: int iType = pReader->pDoclist->iType;
423:
424: if( pReader->iLastPos==-1 ){
425: return -1;
426: }
427: assert( !atEnd(pReader) );
428:
429: if( iType<DL_POSITIONS ){
430: return -1;
431: }
432: pReader->p += getVarint32(pReader->p, &i);
433: if( i==POS_END ){
434: pReader->iLastColumn = pReader->iLastPos = -1;
435: *iColumn = -1;
436: return -1;
437: }
438: if( i==POS_COLUMN ){
439: pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
440: pReader->iLastPos = 0;
441: pReader->p += getVarint32(pReader->p, &i);
442: assert( i>=POS_BASE );
443: }
444: pReader->iLastPos += ((int) i)-POS_BASE;
445: if( iType>=DL_POSITIONS_OFFSETS ){
446: /* Skip over offsets, ignoring them for now. */
447: int iStart, iEnd;
448: pReader->p += getVarint32(pReader->p, &iStart);
449: pReader->p += getVarint32(pReader->p, &iEnd);
450: }
451: *iColumn = pReader->iLastColumn;
452: return pReader->iLastPos;
453: }
454:
455: /* Skip past the end of a position list. */
456: static void skipPositionList(DocListReader *pReader){
457: DocList *p = pReader->pDoclist;
458: if( p && p->iType>=DL_POSITIONS ){
459: int iColumn;
460: while( readPosition(pReader, &iColumn)!=-1 ){}
461: }
462: }
463:
464: /* Skip over a docid, including its position list if the doclist has
465: * positions. */
466: static void skipDocument(DocListReader *pReader){
467: readDocid(pReader);
468: skipPositionList(pReader);
469: }
470:
471: /* Skip past all docids which are less than [iDocid]. Returns 1 if a docid
472: * matching [iDocid] was found. */
473: static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
474: sqlite_int64 d = 0;
475: while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
476: skipDocument(pReader);
477: }
478: return !atEnd(pReader) && d==iDocid;
479: }
480:
481: /* Return the first document in a document list.
482: */
483: static sqlite_int64 firstDocid(DocList *d){
484: DocListReader r;
485: readerInit(&r, d);
486: return readDocid(&r);
487: }
488:
489: #ifdef SQLITE_DEBUG
490: /*
491: ** This routine is used for debugging purpose only.
492: **
493: ** Write the content of a doclist to standard output.
494: */
495: static void printDoclist(DocList *p){
496: DocListReader r;
497: const char *zSep = "";
498:
499: readerInit(&r, p);
500: while( !atEnd(&r) ){
501: sqlite_int64 docid = readDocid(&r);
502: if( docid==0 ){
503: skipPositionList(&r);
504: continue;
505: }
506: printf("%s%lld", zSep, docid);
507: zSep = ",";
508: if( p->iType>=DL_POSITIONS ){
509: int iPos, iCol;
510: const char *zDiv = "";
511: printf("(");
512: while( (iPos = readPosition(&r, &iCol))>=0 ){
513: printf("%s%d:%d", zDiv, iCol, iPos);
514: zDiv = ":";
515: }
516: printf(")");
517: }
518: }
519: printf("\n");
520: fflush(stdout);
521: }
522: #endif /* SQLITE_DEBUG */
523:
524: /* Trim the given doclist to contain only positions in column
525: * [iRestrictColumn]. */
526: static void docListRestrictColumn(DocList *in, int iRestrictColumn){
527: DocListReader r;
528: DocList out;
529:
530: assert( in->iType>=DL_POSITIONS );
531: readerInit(&r, in);
532: docListInit(&out, DL_POSITIONS, NULL, 0);
533:
534: while( !atEnd(&r) ){
535: sqlite_int64 iDocid = readDocid(&r);
536: int iPos, iColumn;
537:
538: docListAddDocid(&out, iDocid);
539: while( (iPos = readPosition(&r, &iColumn)) != -1 ){
540: if( iColumn==iRestrictColumn ){
541: docListAddPos(&out, iColumn, iPos);
542: }
543: }
544: }
545:
546: docListDestroy(in);
547: *in = out;
548: }
549:
550: /* Trim the given doclist by discarding any docids without any remaining
551: * positions. */
552: static void docListDiscardEmpty(DocList *in) {
553: DocListReader r;
554: DocList out;
555:
556: /* TODO: It would be nice to implement this operation in place; that
557: * could save a significant amount of memory in queries with long doclists. */
558: assert( in->iType>=DL_POSITIONS );
559: readerInit(&r, in);
560: docListInit(&out, DL_POSITIONS, NULL, 0);
561:
562: while( !atEnd(&r) ){
563: sqlite_int64 iDocid = readDocid(&r);
564: int match = 0;
565: int iPos, iColumn;
566: while( (iPos = readPosition(&r, &iColumn)) != -1 ){
567: if( !match ){
568: docListAddDocid(&out, iDocid);
569: match = 1;
570: }
571: docListAddPos(&out, iColumn, iPos);
572: }
573: }
574:
575: docListDestroy(in);
576: *in = out;
577: }
578:
579: /* Helper function for docListUpdate() and docListAccumulate().
580: ** Splices a doclist element into the doclist represented by r,
581: ** leaving r pointing after the newly spliced element.
582: */
583: static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
584: const char *pSource, int nSource){
585: DocList *d = r->pDoclist;
586: char *pTarget;
587: int nTarget, found;
588:
589: found = skipToDocid(r, iDocid);
590:
591: /* Describe slice in d to place pSource/nSource. */
592: pTarget = r->p;
593: if( found ){
594: skipDocument(r);
595: nTarget = r->p-pTarget;
596: }else{
597: nTarget = 0;
598: }
599:
600: /* The sense of the following is that there are three possibilities.
601: ** If nTarget==nSource, we should not move any memory nor realloc.
602: ** If nTarget>nSource, trim target and realloc.
603: ** If nTarget<nSource, realloc then expand target.
604: */
605: if( nTarget>nSource ){
606: memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
607: }
608: if( nTarget!=nSource ){
609: int iDoclist = pTarget-d->pData;
610: d->pData = realloc(d->pData, d->nData+nSource-nTarget);
611: pTarget = d->pData+iDoclist;
612: }
613: if( nTarget<nSource ){
614: memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
615: }
616:
617: memcpy(pTarget, pSource, nSource);
618: d->nData += nSource-nTarget;
619: r->p = pTarget+nSource;
620: }
621:
622: /* Insert/update pUpdate into the doclist. */
623: static void docListUpdate(DocList *d, DocList *pUpdate){
624: DocListReader reader;
625:
626: assert( d!=NULL && pUpdate!=NULL );
627: assert( d->iType==pUpdate->iType);
628:
629: readerInit(&reader, d);
630: docListSpliceElement(&reader, firstDocid(pUpdate),
631: pUpdate->pData, pUpdate->nData);
632: }
633:
634: /* Propagate elements from pUpdate to pAcc, overwriting elements with
635: ** matching docids.
636: */
637: static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
638: DocListReader accReader, updateReader;
639:
640: /* Handle edge cases where one doclist is empty. */
641: assert( pAcc!=NULL );
642: if( pUpdate==NULL || pUpdate->nData==0 ) return;
643: if( pAcc->nData==0 ){
644: pAcc->pData = malloc(pUpdate->nData);
645: memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
646: pAcc->nData = pUpdate->nData;
647: return;
648: }
649:
650: readerInit(&accReader, pAcc);
651: readerInit(&updateReader, pUpdate);
652:
653: while( !atEnd(&updateReader) ){
654: char *pSource = updateReader.p;
655: sqlite_int64 iDocid = readDocid(&updateReader);
656: skipPositionList(&updateReader);
657: docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
658: }
659: }
660:
661: /*
662: ** Read the next docid off of pIn. Return 0 if we reach the end.
663: *
664: * TODO: This assumes that docids are never 0, but they may actually be 0 since
665: * users can choose docids when inserting into a full-text table. Fix this.
666: */
667: static sqlite_int64 nextDocid(DocListReader *pIn){
668: skipPositionList(pIn);
669: return atEnd(pIn) ? 0 : readDocid(pIn);
670: }
671:
672: /*
673: ** pLeft and pRight are two DocListReaders that are pointing to
674: ** positions lists of the same document: iDocid.
675: **
676: ** If there are no instances in pLeft or pRight where the position
677: ** of pLeft is one less than the position of pRight, then this
678: ** routine adds nothing to pOut.
679: **
680: ** If there are one or more instances where positions from pLeft
681: ** are exactly one less than positions from pRight, then add a new
682: ** document record to pOut. If pOut wants to hold positions, then
683: ** include the positions from pRight that are one more than a
684: ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
685: **
686: ** pLeft and pRight are left pointing at the next document record.
687: */
688: static void mergePosList(
689: DocListReader *pLeft, /* Left position list */
690: DocListReader *pRight, /* Right position list */
691: sqlite_int64 iDocid, /* The docid from pLeft and pRight */
692: DocList *pOut /* Write the merged document record here */
693: ){
694: int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
695: int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
696: int match = 0;
697:
698: /* Loop until we've reached the end of both position lists. */
699: while( iLeftPos!=-1 && iRightPos!=-1 ){
700: if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
701: if( !match ){
702: docListAddDocid(pOut, iDocid);
703: match = 1;
704: }
705: if( pOut->iType>=DL_POSITIONS ){
706: docListAddPos(pOut, iRightCol, iRightPos);
707: }
708: iLeftPos = readPosition(pLeft, &iLeftCol);
709: iRightPos = readPosition(pRight, &iRightCol);
710: }else if( iRightCol<iLeftCol ||
711: (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
712: iRightPos = readPosition(pRight, &iRightCol);
713: }else{
714: iLeftPos = readPosition(pLeft, &iLeftCol);
715: }
716: }
717: if( iLeftPos>=0 ) skipPositionList(pLeft);
718: if( iRightPos>=0 ) skipPositionList(pRight);
719: }
720:
721: /* We have two doclists: pLeft and pRight.
722: ** Write the phrase intersection of these two doclists into pOut.
723: **
724: ** A phrase intersection means that two documents only match
725: ** if pLeft.iPos+1==pRight.iPos.
726: **
727: ** The output pOut may or may not contain positions. If pOut
728: ** does contain positions, they are the positions of pRight.
729: */
730: static void docListPhraseMerge(
731: DocList *pLeft, /* Doclist resulting from the words on the left */
732: DocList *pRight, /* Doclist for the next word to the right */
733: DocList *pOut /* Write the combined doclist here */
734: ){
735: DocListReader left, right;
736: sqlite_int64 docidLeft, docidRight;
737:
738: readerInit(&left, pLeft);
739: readerInit(&right, pRight);
740: docidLeft = nextDocid(&left);
741: docidRight = nextDocid(&right);
742:
743: while( docidLeft>0 && docidRight>0 ){
744: if( docidLeft<docidRight ){
745: docidLeft = nextDocid(&left);
746: }else if( docidRight<docidLeft ){
747: docidRight = nextDocid(&right);
748: }else{
749: mergePosList(&left, &right, docidLeft, pOut);
750: docidLeft = nextDocid(&left);
751: docidRight = nextDocid(&right);
752: }
753: }
754: }
755:
756: /* We have two doclists: pLeft and pRight.
757: ** Write the intersection of these two doclists into pOut.
758: ** Only docids are matched. Position information is ignored.
759: **
760: ** The output pOut never holds positions.
761: */
762: static void docListAndMerge(
763: DocList *pLeft, /* Doclist resulting from the words on the left */
764: DocList *pRight, /* Doclist for the next word to the right */
765: DocList *pOut /* Write the combined doclist here */
766: ){
767: DocListReader left, right;
768: sqlite_int64 docidLeft, docidRight;
769:
770: assert( pOut->iType<DL_POSITIONS );
771:
772: readerInit(&left, pLeft);
773: readerInit(&right, pRight);
774: docidLeft = nextDocid(&left);
775: docidRight = nextDocid(&right);
776:
777: while( docidLeft>0 && docidRight>0 ){
778: if( docidLeft<docidRight ){
779: docidLeft = nextDocid(&left);
780: }else if( docidRight<docidLeft ){
781: docidRight = nextDocid(&right);
782: }else{
783: docListAddDocid(pOut, docidLeft);
784: docidLeft = nextDocid(&left);
785: docidRight = nextDocid(&right);
786: }
787: }
788: }
789:
790: /* We have two doclists: pLeft and pRight.
791: ** Write the union of these two doclists into pOut.
792: ** Only docids are matched. Position information is ignored.
793: **
794: ** The output pOut never holds positions.
795: */
796: static void docListOrMerge(
797: DocList *pLeft, /* Doclist resulting from the words on the left */
798: DocList *pRight, /* Doclist for the next word to the right */
799: DocList *pOut /* Write the combined doclist here */
800: ){
801: DocListReader left, right;
802: sqlite_int64 docidLeft, docidRight, priorLeft;
803:
804: readerInit(&left, pLeft);
805: readerInit(&right, pRight);
806: docidLeft = nextDocid(&left);
807: docidRight = nextDocid(&right);
808:
809: while( docidLeft>0 && docidRight>0 ){
810: if( docidLeft<=docidRight ){
811: docListAddDocid(pOut, docidLeft);
812: }else{
813: docListAddDocid(pOut, docidRight);
814: }
815: priorLeft = docidLeft;
816: if( docidLeft<=docidRight ){
817: docidLeft = nextDocid(&left);
818: }
819: if( docidRight>0 && docidRight<=priorLeft ){
820: docidRight = nextDocid(&right);
821: }
822: }
823: while( docidLeft>0 ){
824: docListAddDocid(pOut, docidLeft);
825: docidLeft = nextDocid(&left);
826: }
827: while( docidRight>0 ){
828: docListAddDocid(pOut, docidRight);
829: docidRight = nextDocid(&right);
830: }
831: }
832:
833: /* We have two doclists: pLeft and pRight.
834: ** Write into pOut all documents that occur in pLeft but not
835: ** in pRight.
836: **
837: ** Only docids are matched. Position information is ignored.
838: **
839: ** The output pOut never holds positions.
840: */
841: static void docListExceptMerge(
842: DocList *pLeft, /* Doclist resulting from the words on the left */
843: DocList *pRight, /* Doclist for the next word to the right */
844: DocList *pOut /* Write the combined doclist here */
845: ){
846: DocListReader left, right;
847: sqlite_int64 docidLeft, docidRight, priorLeft;
848:
849: readerInit(&left, pLeft);
850: readerInit(&right, pRight);
851: docidLeft = nextDocid(&left);
852: docidRight = nextDocid(&right);
853:
854: while( docidLeft>0 && docidRight>0 ){
855: priorLeft = docidLeft;
856: if( docidLeft<docidRight ){
857: docListAddDocid(pOut, docidLeft);
858: }
859: if( docidLeft<=docidRight ){
860: docidLeft = nextDocid(&left);
861: }
862: if( docidRight>0 && docidRight<=priorLeft ){
863: docidRight = nextDocid(&right);
864: }
865: }
866: while( docidLeft>0 ){
867: docListAddDocid(pOut, docidLeft);
868: docidLeft = nextDocid(&left);
869: }
870: }
871:
872: static char *string_dup_n(const char *s, int n){
873: char *str = malloc(n + 1);
874: memcpy(str, s, n);
875: str[n] = '\0';
876: return str;
877: }
878:
879: /* Duplicate a string; the caller must free() the returned string.
880: * (We don't use strdup() since it is not part of the standard C library and
881: * may not be available everywhere.) */
882: static char *string_dup(const char *s){
883: return string_dup_n(s, strlen(s));
884: }
885:
886: /* Format a string, replacing each occurrence of the % character with
887: * zDb.zName. This may be more convenient than sqlite_mprintf()
888: * when one string is used repeatedly in a format string.
889: * The caller must free() the returned string. */
890: static char *string_format(const char *zFormat,
891: const char *zDb, const char *zName){
892: const char *p;
893: size_t len = 0;
894: size_t nDb = strlen(zDb);
895: size_t nName = strlen(zName);
896: size_t nFullTableName = nDb+1+nName;
897: char *result;
898: char *r;
899:
900: /* first compute length needed */
901: for(p = zFormat ; *p ; ++p){
902: len += (*p=='%' ? nFullTableName : 1);
903: }
904: len += 1; /* for null terminator */
905:
906: r = result = malloc(len);
907: for(p = zFormat; *p; ++p){
908: if( *p=='%' ){
909: memcpy(r, zDb, nDb);
910: r += nDb;
911: *r++ = '.';
912: memcpy(r, zName, nName);
913: r += nName;
914: } else {
915: *r++ = *p;
916: }
917: }
918: *r++ = '\0';
919: assert( r == result + len );
920: return result;
921: }
922:
923: static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
924: const char *zFormat){
925: char *zCommand = string_format(zFormat, zDb, zName);
926: int rc;
927: TRACE(("FTS1 sql: %s\n", zCommand));
928: rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
929: free(zCommand);
930: return rc;
931: }
932:
933: static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
934: sqlite3_stmt **ppStmt, const char *zFormat){
935: char *zCommand = string_format(zFormat, zDb, zName);
936: int rc;
937: TRACE(("FTS1 prepare: %s\n", zCommand));
938: rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
939: free(zCommand);
940: return rc;
941: }
942:
943: /* end utility functions */
944:
945: /* Forward reference */
946: typedef struct fulltext_vtab fulltext_vtab;
947:
948: /* A single term in a query is represented by an instances of
949: ** the following structure.
950: */
951: typedef struct QueryTerm {
952: short int nPhrase; /* How many following terms are part of the same phrase */
953: short int iPhrase; /* This is the i-th term of a phrase. */
954: short int iColumn; /* Column of the index that must match this term */
955: signed char isOr; /* this term is preceded by "OR" */
956: signed char isNot; /* this term is preceded by "-" */
957: char *pTerm; /* text of the term. '\000' terminated. malloced */
958: int nTerm; /* Number of bytes in pTerm[] */
959: } QueryTerm;
960:
961:
962: /* A query string is parsed into a Query structure.
963: *
964: * We could, in theory, allow query strings to be complicated
965: * nested expressions with precedence determined by parentheses.
966: * But none of the major search engines do this. (Perhaps the
967: * feeling is that an parenthesized expression is two complex of
968: * an idea for the average user to grasp.) Taking our lead from
969: * the major search engines, we will allow queries to be a list
970: * of terms (with an implied AND operator) or phrases in double-quotes,
971: * with a single optional "-" before each non-phrase term to designate
972: * negation and an optional OR connector.
973: *
974: * OR binds more tightly than the implied AND, which is what the
975: * major search engines seem to do. So, for example:
976: *
977: * [one two OR three] ==> one AND (two OR three)
978: * [one OR two three] ==> (one OR two) AND three
979: *
980: * A "-" before a term matches all entries that lack that term.
981: * The "-" must occur immediately before the term with in intervening
982: * space. This is how the search engines do it.
983: *
984: * A NOT term cannot be the right-hand operand of an OR. If this
985: * occurs in the query string, the NOT is ignored:
986: *
987: * [one OR -two] ==> one OR two
988: *
989: */
990: typedef struct Query {
991: fulltext_vtab *pFts; /* The full text index */
992: int nTerms; /* Number of terms in the query */
993: QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
994: int nextIsOr; /* Set the isOr flag on the next inserted term */
995: int nextColumn; /* Next word parsed must be in this column */
996: int dfltColumn; /* The default column */
997: } Query;
998:
999:
1000: /*
1001: ** An instance of the following structure keeps track of generated
1002: ** matching-word offset information and snippets.
1003: */
1004: typedef struct Snippet {
1005: int nMatch; /* Total number of matches */
1006: int nAlloc; /* Space allocated for aMatch[] */
1007: struct snippetMatch { /* One entry for each matching term */
1008: char snStatus; /* Status flag for use while constructing snippets */
1009: short int iCol; /* The column that contains the match */
1010: short int iTerm; /* The index in Query.pTerms[] of the matching term */
1011: short int nByte; /* Number of bytes in the term */
1012: int iStart; /* The offset to the first character of the term */
1013: } *aMatch; /* Points to space obtained from malloc */
1014: char *zOffset; /* Text rendering of aMatch[] */
1015: int nOffset; /* strlen(zOffset) */
1016: char *zSnippet; /* Snippet text */
1017: int nSnippet; /* strlen(zSnippet) */
1018: } Snippet;
1019:
1020:
1021: typedef enum QueryType {
1022: QUERY_GENERIC, /* table scan */
1023: QUERY_ROWID, /* lookup by rowid */
1024: QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
1025: } QueryType;
1026:
1027: /* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
1028: ** before we start aggregating into larger segments. Lower CHUNK_MAX
1029: ** means that for a given input we have more individual segments per
1030: ** term, which means more rows in the table and a bigger index (due to
1031: ** both more rows and bigger rowids). But it also reduces the average
1032: ** cost of adding new elements to the segment 0 doclist, and it seems
1033: ** to reduce the number of pages read and written during inserts. 256
1034: ** was chosen by measuring insertion times for a certain input (first
1035: ** 10k documents of Enron corpus), though including query performance
1036: ** in the decision may argue for a larger value.
1037: */
1038: #define CHUNK_MAX 256
1039:
1040: typedef enum fulltext_statement {
1041: CONTENT_INSERT_STMT,
1042: CONTENT_SELECT_STMT,
1043: CONTENT_UPDATE_STMT,
1044: CONTENT_DELETE_STMT,
1045:
1046: TERM_SELECT_STMT,
1047: TERM_SELECT_ALL_STMT,
1048: TERM_INSERT_STMT,
1049: TERM_UPDATE_STMT,
1050: TERM_DELETE_STMT,
1051:
1052: MAX_STMT /* Always at end! */
1053: } fulltext_statement;
1054:
1055: /* These must exactly match the enum above. */
1056: /* TODO(adam): Is there some risk that a statement (in particular,
1057: ** pTermSelectStmt) will be used in two cursors at once, e.g. if a
1058: ** query joins a virtual table to itself? If so perhaps we should
1059: ** move some of these to the cursor object.
1060: */
1061: static const char *const fulltext_zStatement[MAX_STMT] = {
1062: /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
1063: /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
1064: /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
1065: /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
1066:
1067: /* TERM_SELECT */
1068: "select rowid, doclist from %_term where term = ? and segment = ?",
1069: /* TERM_SELECT_ALL */
1070: "select doclist from %_term where term = ? order by segment",
1071: /* TERM_INSERT */
1072: "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
1073: /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
1074: /* TERM_DELETE */ "delete from %_term where rowid = ?",
1075: };
1076:
1077: /*
1078: ** A connection to a fulltext index is an instance of the following
1079: ** structure. The xCreate and xConnect methods create an instance
1080: ** of this structure and xDestroy and xDisconnect free that instance.
1081: ** All other methods receive a pointer to the structure as one of their
1082: ** arguments.
1083: */
1084: struct fulltext_vtab {
1085: sqlite3_vtab base; /* Base class used by SQLite core */
1086: sqlite3 *db; /* The database connection */
1087: const char *zDb; /* logical database name */
1088: const char *zName; /* virtual table name */
1089: int nColumn; /* number of columns in virtual table */
1090: char **azColumn; /* column names. malloced */
1091: char **azContentColumn; /* column names in content table; malloced */
1092: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
1093:
1094: /* Precompiled statements which we keep as long as the table is
1095: ** open.
1096: */
1097: sqlite3_stmt *pFulltextStatements[MAX_STMT];
1098: };
1099:
1100: /*
1101: ** When the core wants to do a query, it create a cursor using a
1102: ** call to xOpen. This structure is an instance of a cursor. It
1103: ** is destroyed by xClose.
1104: */
1105: typedef struct fulltext_cursor {
1106: sqlite3_vtab_cursor base; /* Base class used by SQLite core */
1107: QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
1108: sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
1109: int eof; /* True if at End Of Results */
1110: Query q; /* Parsed query string */
1111: Snippet snippet; /* Cached snippet for the current row */
1112: int iColumn; /* Column being searched */
1113: DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
1114: } fulltext_cursor;
1115:
1116: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
1117: return (fulltext_vtab *) c->base.pVtab;
1118: }
1119:
1120: static const sqlite3_module fulltextModule; /* forward declaration */
1121:
1122: /* Append a list of strings separated by commas to a StringBuffer. */
1123: static void appendList(StringBuffer *sb, int nString, char **azString){
1124: int i;
1125: for(i=0; i<nString; ++i){
1126: if( i>0 ) append(sb, ", ");
1127: append(sb, azString[i]);
1128: }
1129: }
1130:
1131: /* Return a dynamically generated statement of the form
1132: * insert into %_content (rowid, ...) values (?, ...)
1133: */
1134: static const char *contentInsertStatement(fulltext_vtab *v){
1135: StringBuffer sb;
1136: int i;
1137:
1138: initStringBuffer(&sb);
1139: append(&sb, "insert into %_content (rowid, ");
1140: appendList(&sb, v->nColumn, v->azContentColumn);
1141: append(&sb, ") values (?");
1142: for(i=0; i<v->nColumn; ++i)
1143: append(&sb, ", ?");
1144: append(&sb, ")");
1145: return sb.s;
1146: }
1147:
1148: /* Return a dynamically generated statement of the form
1149: * update %_content set [col_0] = ?, [col_1] = ?, ...
1150: * where rowid = ?
1151: */
1152: static const char *contentUpdateStatement(fulltext_vtab *v){
1153: StringBuffer sb;
1154: int i;
1155:
1156: initStringBuffer(&sb);
1157: append(&sb, "update %_content set ");
1158: for(i=0; i<v->nColumn; ++i) {
1159: if( i>0 ){
1160: append(&sb, ", ");
1161: }
1162: append(&sb, v->azContentColumn[i]);
1163: append(&sb, " = ?");
1164: }
1165: append(&sb, " where rowid = ?");
1166: return sb.s;
1167: }
1168:
1169: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
1170: ** If the indicated statement has never been prepared, it is prepared
1171: ** and cached, otherwise the cached version is reset.
1172: */
1173: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
1174: sqlite3_stmt **ppStmt){
1175: assert( iStmt<MAX_STMT );
1176: if( v->pFulltextStatements[iStmt]==NULL ){
1177: const char *zStmt;
1178: int rc;
1179: switch( iStmt ){
1180: case CONTENT_INSERT_STMT:
1181: zStmt = contentInsertStatement(v); break;
1182: case CONTENT_UPDATE_STMT:
1183: zStmt = contentUpdateStatement(v); break;
1184: default:
1185: zStmt = fulltext_zStatement[iStmt];
1186: }
1187: rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
1188: zStmt);
1189: if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
1190: if( rc!=SQLITE_OK ) return rc;
1191: } else {
1192: int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
1193: if( rc!=SQLITE_OK ) return rc;
1194: }
1195:
1196: *ppStmt = v->pFulltextStatements[iStmt];
1197: return SQLITE_OK;
1198: }
1199:
1200: /* Step the indicated statement, handling errors SQLITE_BUSY (by
1201: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
1202: ** bindings to the new statement).
1203: ** TODO(adam): We should extend this function so that it can work with
1204: ** statements declared locally, not only globally cached statements.
1205: */
1206: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
1207: sqlite3_stmt **ppStmt){
1208: int rc;
1209: sqlite3_stmt *s = *ppStmt;
1210: assert( iStmt<MAX_STMT );
1211: assert( s==v->pFulltextStatements[iStmt] );
1212:
1213: while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
1214: if( rc==SQLITE_BUSY ) continue;
1215: if( rc!=SQLITE_ERROR ) return rc;
1216:
1217: /* If an SQLITE_SCHEMA error has occurred, then finalizing this
1218: * statement is going to delete the fulltext_vtab structure. If
1219: * the statement just executed is in the pFulltextStatements[]
1220: * array, it will be finalized twice. So remove it before
1221: * calling sqlite3_finalize().
1222: */
1223: v->pFulltextStatements[iStmt] = NULL;
1224: rc = sqlite3_finalize(s);
1225: break;
1226: }
1227: return rc;
1228:
1229: err:
1230: sqlite3_finalize(s);
1231: return rc;
1232: }
1233:
1234: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
1235: ** Useful for statements like UPDATE, where we expect no results.
1236: */
1237: static int sql_single_step_statement(fulltext_vtab *v,
1238: fulltext_statement iStmt,
1239: sqlite3_stmt **ppStmt){
1240: int rc = sql_step_statement(v, iStmt, ppStmt);
1241: return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
1242: }
1243:
1244: /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
1245: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
1246: sqlite3_value **pValues){
1247: sqlite3_stmt *s;
1248: int i;
1249: int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
1250: if( rc!=SQLITE_OK ) return rc;
1251:
1252: rc = sqlite3_bind_value(s, 1, rowid);
1253: if( rc!=SQLITE_OK ) return rc;
1254:
1255: for(i=0; i<v->nColumn; ++i){
1256: rc = sqlite3_bind_value(s, 2+i, pValues[i]);
1257: if( rc!=SQLITE_OK ) return rc;
1258: }
1259:
1260: return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
1261: }
1262:
1263: /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
1264: * where rowid = [iRowid] */
1265: static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
1266: sqlite_int64 iRowid){
1267: sqlite3_stmt *s;
1268: int i;
1269: int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
1270: if( rc!=SQLITE_OK ) return rc;
1271:
1272: for(i=0; i<v->nColumn; ++i){
1273: rc = sqlite3_bind_value(s, 1+i, pValues[i]);
1274: if( rc!=SQLITE_OK ) return rc;
1275: }
1276:
1277: rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
1278: if( rc!=SQLITE_OK ) return rc;
1279:
1280: return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
1281: }
1282:
1283: static void freeStringArray(int nString, const char **pString){
1284: int i;
1285:
1286: for (i=0 ; i < nString ; ++i) {
1287: if( pString[i]!=NULL ) free((void *) pString[i]);
1288: }
1289: free((void *) pString);
1290: }
1291:
1292: /* select * from %_content where rowid = [iRow]
1293: * The caller must delete the returned array and all strings in it.
1294: * null fields will be NULL in the returned array.
1295: *
1296: * TODO: Perhaps we should return pointer/length strings here for consistency
1297: * with other code which uses pointer/length. */
1298: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
1299: const char ***pValues){
1300: sqlite3_stmt *s;
1301: const char **values;
1302: int i;
1303: int rc;
1304:
1305: *pValues = NULL;
1306:
1307: rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
1308: if( rc!=SQLITE_OK ) return rc;
1309:
1310: rc = sqlite3_bind_int64(s, 1, iRow);
1311: if( rc!=SQLITE_OK ) return rc;
1312:
1313: rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
1314: if( rc!=SQLITE_ROW ) return rc;
1315:
1316: values = (const char **) malloc(v->nColumn * sizeof(const char *));
1317: for(i=0; i<v->nColumn; ++i){
1318: if( sqlite3_column_type(s, i)==SQLITE_NULL ){
1319: values[i] = NULL;
1320: }else{
1321: values[i] = string_dup((char*)sqlite3_column_text(s, i));
1322: }
1323: }
1324:
1325: /* We expect only one row. We must execute another sqlite3_step()
1326: * to complete the iteration; otherwise the table will remain locked. */
1327: rc = sqlite3_step(s);
1328: if( rc==SQLITE_DONE ){
1329: *pValues = values;
1330: return SQLITE_OK;
1331: }
1332:
1333: freeStringArray(v->nColumn, values);
1334: return rc;
1335: }
1336:
1337: /* delete from %_content where rowid = [iRow ] */
1338: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
1339: sqlite3_stmt *s;
1340: int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
1341: if( rc!=SQLITE_OK ) return rc;
1342:
1343: rc = sqlite3_bind_int64(s, 1, iRow);
1344: if( rc!=SQLITE_OK ) return rc;
1345:
1346: return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
1347: }
1348:
1349: /* select rowid, doclist from %_term
1350: * where term = [pTerm] and segment = [iSegment]
1351: * If found, returns SQLITE_ROW; the caller must free the
1352: * returned doclist. If no rows found, returns SQLITE_DONE. */
1353: static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
1354: int iSegment,
1355: sqlite_int64 *rowid, DocList *out){
1356: sqlite3_stmt *s;
1357: int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
1358: if( rc!=SQLITE_OK ) return rc;
1359:
1360: rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
1361: if( rc!=SQLITE_OK ) return rc;
1362:
1363: rc = sqlite3_bind_int(s, 2, iSegment);
1364: if( rc!=SQLITE_OK ) return rc;
1365:
1366: rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
1367: if( rc!=SQLITE_ROW ) return rc;
1368:
1369: *rowid = sqlite3_column_int64(s, 0);
1370: docListInit(out, DL_DEFAULT,
1371: sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
1372:
1373: /* We expect only one row. We must execute another sqlite3_step()
1374: * to complete the iteration; otherwise the table will remain locked. */
1375: rc = sqlite3_step(s);
1376: return rc==SQLITE_DONE ? SQLITE_ROW : rc;
1377: }
1378:
1379: /* Load the segment doclists for term pTerm and merge them in
1380: ** appropriate order into out. Returns SQLITE_OK if successful. If
1381: ** there are no segments for pTerm, successfully returns an empty
1382: ** doclist in out.
1383: **
1384: ** Each document consists of 1 or more "columns". The number of
1385: ** columns is v->nColumn. If iColumn==v->nColumn, then return
1386: ** position information about all columns. If iColumn<v->nColumn,
1387: ** then only return position information about the iColumn-th column
1388: ** (where the first column is 0).
1389: */
1390: static int term_select_all(
1391: fulltext_vtab *v, /* The fulltext index we are querying against */
1392: int iColumn, /* If <nColumn, only look at the iColumn-th column */
1393: const char *pTerm, /* The term whose posting lists we want */
1394: int nTerm, /* Number of bytes in pTerm */
1395: DocList *out /* Write the resulting doclist here */
1396: ){
1397: DocList doclist;
1398: sqlite3_stmt *s;
1399: int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
1400: if( rc!=SQLITE_OK ) return rc;
1401:
1402: rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
1403: if( rc!=SQLITE_OK ) return rc;
1404:
1405: docListInit(&doclist, DL_DEFAULT, 0, 0);
1406:
1407: /* TODO(shess) Handle schema and busy errors. */
1408: while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
1409: DocList old;
1410:
1411: /* TODO(shess) If we processed doclists from oldest to newest, we
1412: ** could skip the malloc() involved with the following call. For
1413: ** now, I'd rather keep this logic similar to index_insert_term().
1414: ** We could additionally drop elements when we see deletes, but
1415: ** that would require a distinct version of docListAccumulate().
1416: */
1417: docListInit(&old, DL_DEFAULT,
1418: sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
1419:
1420: if( iColumn<v->nColumn ){ /* querying a single column */
1421: docListRestrictColumn(&old, iColumn);
1422: }
1423:
1424: /* doclist contains the newer data, so write it over old. Then
1425: ** steal accumulated result for doclist.
1426: */
1427: docListAccumulate(&old, &doclist);
1428: docListDestroy(&doclist);
1429: doclist = old;
1430: }
1431: if( rc!=SQLITE_DONE ){
1432: docListDestroy(&doclist);
1433: return rc;
1434: }
1435:
1436: docListDiscardEmpty(&doclist);
1437: *out = doclist;
1438: return SQLITE_OK;
1439: }
1440:
1441: /* insert into %_term (rowid, term, segment, doclist)
1442: values ([piRowid], [pTerm], [iSegment], [doclist])
1443: ** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
1444: **
1445: ** NOTE(shess) piRowid is IN, with values of "space of int64" plus
1446: ** null, it is not used to pass data back to the caller.
1447: */
1448: static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
1449: const char *pTerm, int nTerm,
1450: int iSegment, DocList *doclist){
1451: sqlite3_stmt *s;
1452: int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
1453: if( rc!=SQLITE_OK ) return rc;
1454:
1455: if( piRowid==NULL ){
1456: rc = sqlite3_bind_null(s, 1);
1457: }else{
1458: rc = sqlite3_bind_int64(s, 1, *piRowid);
1459: }
1460: if( rc!=SQLITE_OK ) return rc;
1461:
1462: rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
1463: if( rc!=SQLITE_OK ) return rc;
1464:
1465: rc = sqlite3_bind_int(s, 3, iSegment);
1466: if( rc!=SQLITE_OK ) return rc;
1467:
1468: rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
1469: if( rc!=SQLITE_OK ) return rc;
1470:
1471: return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
1472: }
1473:
1474: /* update %_term set doclist = [doclist] where rowid = [rowid] */
1475: static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
1476: DocList *doclist){
1477: sqlite3_stmt *s;
1478: int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
1479: if( rc!=SQLITE_OK ) return rc;
1480:
1481: rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
1482: if( rc!=SQLITE_OK ) return rc;
1483:
1484: rc = sqlite3_bind_int64(s, 2, rowid);
1485: if( rc!=SQLITE_OK ) return rc;
1486:
1487: return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
1488: }
1489:
1490: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
1491: sqlite3_stmt *s;
1492: int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
1493: if( rc!=SQLITE_OK ) return rc;
1494:
1495: rc = sqlite3_bind_int64(s, 1, rowid);
1496: if( rc!=SQLITE_OK ) return rc;
1497:
1498: return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
1499: }
1500:
1501: /*
1502: ** Free the memory used to contain a fulltext_vtab structure.
1503: */
1504: static void fulltext_vtab_destroy(fulltext_vtab *v){
1505: int iStmt, i;
1506:
1507: TRACE(("FTS1 Destroy %p\n", v));
1508: for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
1509: if( v->pFulltextStatements[iStmt]!=NULL ){
1510: sqlite3_finalize(v->pFulltextStatements[iStmt]);
1511: v->pFulltextStatements[iStmt] = NULL;
1512: }
1513: }
1514:
1515: if( v->pTokenizer!=NULL ){
1516: v->pTokenizer->pModule->xDestroy(v->pTokenizer);
1517: v->pTokenizer = NULL;
1518: }
1519:
1520: free(v->azColumn);
1521: for(i = 0; i < v->nColumn; ++i) {
1522: sqlite3_free(v->azContentColumn[i]);
1523: }
1524: free(v->azContentColumn);
1525: free(v);
1526: }
1527:
1528: /*
1529: ** Token types for parsing the arguments to xConnect or xCreate.
1530: */
1531: #define TOKEN_EOF 0 /* End of file */
1532: #define TOKEN_SPACE 1 /* Any kind of whitespace */
1533: #define TOKEN_ID 2 /* An identifier */
1534: #define TOKEN_STRING 3 /* A string literal */
1535: #define TOKEN_PUNCT 4 /* A single punctuation character */
1536:
1537: /*
1538: ** If X is a character that can be used in an identifier then
1539: ** IdChar(X) will be true. Otherwise it is false.
1540: **
1541: ** For ASCII, any character with the high-order bit set is
1542: ** allowed in an identifier. For 7-bit characters,
1543: ** sqlite3IsIdChar[X] must be 1.
1544: **
1545: ** Ticket #1066. the SQL standard does not allow '$' in the
1546: ** middle of identfiers. But many SQL implementations do.
1547: ** SQLite will allow '$' in identifiers for compatibility.
1548: ** But the feature is undocumented.
1549: */
1550: static const char isIdChar[] = {
1551: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
1552: 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
1553: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
1554: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1555: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
1556: 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1557: 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
1558: };
1559: #define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
1560:
1561:
1562: /*
1563: ** Return the length of the token that begins at z[0].
1564: ** Store the token type in *tokenType before returning.
1565: */
1566: static int getToken(const char *z, int *tokenType){
1567: int i, c;
1568: switch( *z ){
1569: case 0: {
1570: *tokenType = TOKEN_EOF;
1571: return 0;
1572: }
1573: case ' ': case '\t': case '\n': case '\f': case '\r': {
1574: for(i=1; safe_isspace(z[i]); i++){}
1575: *tokenType = TOKEN_SPACE;
1576: return i;
1577: }
1578: case '`':
1579: case '\'':
1580: case '"': {
1581: int delim = z[0];
1582: for(i=1; (c=z[i])!=0; i++){
1583: if( c==delim ){
1584: if( z[i+1]==delim ){
1585: i++;
1586: }else{
1587: break;
1588: }
1589: }
1590: }
1591: *tokenType = TOKEN_STRING;
1592: return i + (c!=0);
1593: }
1594: case '[': {
1595: for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
1596: *tokenType = TOKEN_ID;
1597: return i;
1598: }
1599: default: {
1600: if( !IdChar(*z) ){
1601: break;
1602: }
1603: for(i=1; IdChar(z[i]); i++){}
1604: *tokenType = TOKEN_ID;
1605: return i;
1606: }
1607: }
1608: *tokenType = TOKEN_PUNCT;
1609: return 1;
1610: }
1611:
1612: /*
1613: ** A token extracted from a string is an instance of the following
1614: ** structure.
1615: */
1616: typedef struct Token {
1617: const char *z; /* Pointer to token text. Not '\000' terminated */
1618: short int n; /* Length of the token text in bytes. */
1619: } Token;
1620:
1621: /*
1622: ** Given a input string (which is really one of the argv[] parameters
1623: ** passed into xConnect or xCreate) split the string up into tokens.
1624: ** Return an array of pointers to '\000' terminated strings, one string
1625: ** for each non-whitespace token.
1626: **
1627: ** The returned array is terminated by a single NULL pointer.
1628: **
1629: ** Space to hold the returned array is obtained from a single
1630: ** malloc and should be freed by passing the return value to free().
1631: ** The individual strings within the token list are all a part of
1632: ** the single memory allocation and will all be freed at once.
1633: */
1634: static char **tokenizeString(const char *z, int *pnToken){
1635: int nToken = 0;
1636: Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
1637: int n = 1;
1638: int e, i;
1639: int totalSize = 0;
1640: char **azToken;
1641: char *zCopy;
1642: while( n>0 ){
1643: n = getToken(z, &e);
1644: if( e!=TOKEN_SPACE ){
1645: aToken[nToken].z = z;
1646: aToken[nToken].n = n;
1647: nToken++;
1648: totalSize += n+1;
1649: }
1650: z += n;
1651: }
1652: azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
1653: zCopy = (char*)&azToken[nToken];
1654: nToken--;
1655: for(i=0; i<nToken; i++){
1656: azToken[i] = zCopy;
1657: n = aToken[i].n;
1658: memcpy(zCopy, aToken[i].z, n);
1659: zCopy[n] = 0;
1660: zCopy += n+1;
1661: }
1662: azToken[nToken] = 0;
1663: free(aToken);
1664: *pnToken = nToken;
1665: return azToken;
1666: }
1667:
1668: /*
1669: ** Convert an SQL-style quoted string into a normal string by removing
1670: ** the quote characters. The conversion is done in-place. If the
1671: ** input does not begin with a quote character, then this routine
1672: ** is a no-op.
1673: **
1674: ** Examples:
1675: **
1676: ** "abc" becomes abc
1677: ** 'xyz' becomes xyz
1678: ** [pqr] becomes pqr
1679: ** `mno` becomes mno
1680: */
1681: static void dequoteString(char *z){
1682: int quote;
1683: int i, j;
1684: if( z==0 ) return;
1685: quote = z[0];
1686: switch( quote ){
1687: case '\'': break;
1688: case '"': break;
1689: case '`': break; /* For MySQL compatibility */
1690: case '[': quote = ']'; break; /* For MS SqlServer compatibility */
1691: default: return;
1692: }
1693: for(i=1, j=0; z[i]; i++){
1694: if( z[i]==quote ){
1695: if( z[i+1]==quote ){
1696: z[j++] = quote;
1697: i++;
1698: }else{
1699: z[j++] = 0;
1700: break;
1701: }
1702: }else{
1703: z[j++] = z[i];
1704: }
1705: }
1706: }
1707:
1708: /*
1709: ** The input azIn is a NULL-terminated list of tokens. Remove the first
1710: ** token and all punctuation tokens. Remove the quotes from
1711: ** around string literal tokens.
1712: **
1713: ** Example:
1714: **
1715: ** input: tokenize chinese ( 'simplifed' , 'mixed' )
1716: ** output: chinese simplifed mixed
1717: **
1718: ** Another example:
1719: **
1720: ** input: delimiters ( '[' , ']' , '...' )
1721: ** output: [ ] ...
1722: */
1723: static void tokenListToIdList(char **azIn){
1724: int i, j;
1725: if( azIn ){
1726: for(i=0, j=-1; azIn[i]; i++){
1727: if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
1728: dequoteString(azIn[i]);
1729: if( j>=0 ){
1730: azIn[j] = azIn[i];
1731: }
1732: j++;
1733: }
1734: }
1735: azIn[j] = 0;
1736: }
1737: }
1738:
1739:
1740: /*
1741: ** Find the first alphanumeric token in the string zIn. Null-terminate
1742: ** this token. Remove any quotation marks. And return a pointer to
1743: ** the result.
1744: */
1745: static char *firstToken(char *zIn, char **pzTail){
1746: int n, ttype;
1747: while(1){
1748: n = getToken(zIn, &ttype);
1749: if( ttype==TOKEN_SPACE ){
1750: zIn += n;
1751: }else if( ttype==TOKEN_EOF ){
1752: *pzTail = zIn;
1753: return 0;
1754: }else{
1755: zIn[n] = 0;
1756: *pzTail = &zIn[1];
1757: dequoteString(zIn);
1758: return zIn;
1759: }
1760: }
1761: /*NOTREACHED*/
1762: }
1763:
1764: /* Return true if...
1765: **
1766: ** * s begins with the string t, ignoring case
1767: ** * s is longer than t
1768: ** * The first character of s beyond t is not a alphanumeric
1769: **
1770: ** Ignore leading space in *s.
1771: **
1772: ** To put it another way, return true if the first token of
1773: ** s[] is t[].
1774: */
1775: static int startsWith(const char *s, const char *t){
1776: while( safe_isspace(*s) ){ s++; }
1777: while( *t ){
1778: if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
1779: }
1780: return *s!='_' && !safe_isalnum(*s);
1781: }
1782:
1783: /*
1784: ** An instance of this structure defines the "spec" of a
1785: ** full text index. This structure is populated by parseSpec
1786: ** and use by fulltextConnect and fulltextCreate.
1787: */
1788: typedef struct TableSpec {
1789: const char *zDb; /* Logical database name */
1790: const char *zName; /* Name of the full-text index */
1791: int nColumn; /* Number of columns to be indexed */
1792: char **azColumn; /* Original names of columns to be indexed */
1793: char **azContentColumn; /* Column names for %_content */
1794: char **azTokenizer; /* Name of tokenizer and its arguments */
1795: } TableSpec;
1796:
1797: /*
1798: ** Reclaim all of the memory used by a TableSpec
1799: */
1800: static void clearTableSpec(TableSpec *p) {
1801: free(p->azColumn);
1802: free(p->azContentColumn);
1803: free(p->azTokenizer);
1804: }
1805:
1806: /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
1807: *
1808: * CREATE VIRTUAL TABLE email
1809: * USING fts1(subject, body, tokenize mytokenizer(myarg))
1810: *
1811: * We return parsed information in a TableSpec structure.
1812: *
1813: */
1814: static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
1815: char**pzErr){
1816: int i, n;
1817: char *z, *zDummy;
1818: char **azArg;
1819: const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
1820:
1821: assert( argc>=3 );
1822: /* Current interface:
1823: ** argv[0] - module name
1824: ** argv[1] - database name
1825: ** argv[2] - table name
1826: ** argv[3..] - columns, optionally followed by tokenizer specification
1827: ** and snippet delimiters specification.
1828: */
1829:
1830: /* Make a copy of the complete argv[][] array in a single allocation.
1831: ** The argv[][] array is read-only and transient. We can write to the
1832: ** copy in order to modify things and the copy is persistent.
1833: */
1834: memset(pSpec, 0, sizeof(*pSpec));
1835: for(i=n=0; i<argc; i++){
1836: n += strlen(argv[i]) + 1;
1837: }
1838: azArg = malloc( sizeof(char*)*argc + n );
1839: if( azArg==0 ){
1840: return SQLITE_NOMEM;
1841: }
1842: z = (char*)&azArg[argc];
1843: for(i=0; i<argc; i++){
1844: azArg[i] = z;
1845: strcpy(z, argv[i]);
1846: z += strlen(z)+1;
1847: }
1848:
1849: /* Identify the column names and the tokenizer and delimiter arguments
1850: ** in the argv[][] array.
1851: */
1852: pSpec->zDb = azArg[1];
1853: pSpec->zName = azArg[2];
1854: pSpec->nColumn = 0;
1855: pSpec->azColumn = azArg;
1856: zTokenizer = "tokenize simple";
1857: for(i=3; i<argc; ++i){
1858: if( startsWith(azArg[i],"tokenize") ){
1859: zTokenizer = azArg[i];
1860: }else{
1861: z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
1862: pSpec->nColumn++;
1863: }
1864: }
1865: if( pSpec->nColumn==0 ){
1866: azArg[0] = "content";
1867: pSpec->nColumn = 1;
1868: }
1869:
1870: /*
1871: ** Construct the list of content column names.
1872: **
1873: ** Each content column name will be of the form cNNAAAA
1874: ** where NN is the column number and AAAA is the sanitized
1875: ** column name. "sanitized" means that special characters are
1876: ** converted to "_". The cNN prefix guarantees that all column
1877: ** names are unique.
1878: **
1879: ** The AAAA suffix is not strictly necessary. It is included
1880: ** for the convenience of people who might examine the generated
1881: ** %_content table and wonder what the columns are used for.
1882: */
1883: pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
1884: if( pSpec->azContentColumn==0 ){
1885: clearTableSpec(pSpec);
1886: return SQLITE_NOMEM;
1887: }
1888: for(i=0; i<pSpec->nColumn; i++){
1889: char *p;
1890: pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
1891: for (p = pSpec->azContentColumn[i]; *p ; ++p) {
1892: if( !safe_isalnum(*p) ) *p = '_';
1893: }
1894: }
1895:
1896: /*
1897: ** Parse the tokenizer specification string.
1898: */
1899: pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
1900: tokenListToIdList(pSpec->azTokenizer);
1901:
1902: return SQLITE_OK;
1903: }
1904:
1905: /*
1906: ** Generate a CREATE TABLE statement that describes the schema of
1907: ** the virtual table. Return a pointer to this schema string.
1908: **
1909: ** Space is obtained from sqlite3_mprintf() and should be freed
1910: ** using sqlite3_free().
1911: */
1912: static char *fulltextSchema(
1913: int nColumn, /* Number of columns */
1914: const char *const* azColumn, /* List of columns */
1915: const char *zTableName /* Name of the table */
1916: ){
1917: int i;
1918: char *zSchema, *zNext;
1919: const char *zSep = "(";
1920: zSchema = sqlite3_mprintf("CREATE TABLE x");
1921: for(i=0; i<nColumn; i++){
1922: zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
1923: sqlite3_free(zSchema);
1924: zSchema = zNext;
1925: zSep = ",";
1926: }
1927: zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
1928: sqlite3_free(zSchema);
1929: return zNext;
1930: }
1931:
1932: /*
1933: ** Build a new sqlite3_vtab structure that will describe the
1934: ** fulltext index defined by spec.
1935: */
1936: static int constructVtab(
1937: sqlite3 *db, /* The SQLite database connection */
1938: TableSpec *spec, /* Parsed spec information from parseSpec() */
1939: sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
1940: char **pzErr /* Write any error message here */
1941: ){
1942: int rc;
1943: int n;
1944: fulltext_vtab *v = 0;
1945: const sqlite3_tokenizer_module *m = NULL;
1946: char *schema;
1947:
1948: v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
1949: if( v==0 ) return SQLITE_NOMEM;
1950: memset(v, 0, sizeof(*v));
1951: /* sqlite will initialize v->base */
1952: v->db = db;
1953: v->zDb = spec->zDb; /* Freed when azColumn is freed */
1954: v->zName = spec->zName; /* Freed when azColumn is freed */
1955: v->nColumn = spec->nColumn;
1956: v->azContentColumn = spec->azContentColumn;
1957: spec->azContentColumn = 0;
1958: v->azColumn = spec->azColumn;
1959: spec->azColumn = 0;
1960:
1961: if( spec->azTokenizer==0 ){
1962: return SQLITE_NOMEM;
1963: }
1964: /* TODO(shess) For now, add new tokenizers as else if clauses. */
1965: if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
1966: sqlite3Fts1SimpleTokenizerModule(&m);
1967: }else if( startsWith(spec->azTokenizer[0], "porter") ){
1968: sqlite3Fts1PorterTokenizerModule(&m);
1969: }else{
1970: *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
1971: rc = SQLITE_ERROR;
1972: goto err;
1973: }
1974: for(n=0; spec->azTokenizer[n]; n++){}
1975: if( n ){
1976: rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
1977: &v->pTokenizer);
1978: }else{
1979: rc = m->xCreate(0, 0, &v->pTokenizer);
1980: }
1981: if( rc!=SQLITE_OK ) goto err;
1982: v->pTokenizer->pModule = m;
1983:
1984: /* TODO: verify the existence of backing tables foo_content, foo_term */
1985:
1986: schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
1987: spec->zName);
1988: rc = sqlite3_declare_vtab(db, schema);
1989: sqlite3_free(schema);
1990: if( rc!=SQLITE_OK ) goto err;
1991:
1992: memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
1993:
1994: *ppVTab = &v->base;
1995: TRACE(("FTS1 Connect %p\n", v));
1996:
1997: return rc;
1998:
1999: err:
2000: fulltext_vtab_destroy(v);
2001: return rc;
2002: }
2003:
2004: static int fulltextConnect(
2005: sqlite3 *db,
2006: void *pAux,
2007: int argc, const char *const*argv,
2008: sqlite3_vtab **ppVTab,
2009: char **pzErr
2010: ){
2011: TableSpec spec;
2012: int rc = parseSpec(&spec, argc, argv, pzErr);
2013: if( rc!=SQLITE_OK ) return rc;
2014:
2015: rc = constructVtab(db, &spec, ppVTab, pzErr);
2016: clearTableSpec(&spec);
2017: return rc;
2018: }
2019:
2020: /* The %_content table holds the text of each document, with
2021: ** the rowid used as the docid.
2022: **
2023: ** The %_term table maps each term to a document list blob
2024: ** containing elements sorted by ascending docid, each element
2025: ** encoded as:
2026: **
2027: ** docid varint-encoded
2028: ** token elements:
2029: ** position+1 varint-encoded as delta from previous position
2030: ** start offset varint-encoded as delta from previous start offset
2031: ** end offset varint-encoded as delta from start offset
2032: **
2033: ** The sentinel position of 0 indicates the end of the token list.
2034: **
2035: ** Additionally, doclist blobs are chunked into multiple segments,
2036: ** using segment to order the segments. New elements are added to
2037: ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
2038: ** segment 0 is deleted, and the doclist is inserted at segment 1.
2039: ** If there is already a doclist at segment 1, the segment 0 doclist
2040: ** is merged with it, the segment 1 doclist is deleted, and the
2041: ** merged doclist is inserted at segment 2, repeating those
2042: ** operations until an insert succeeds.
2043: **
2044: ** Since this structure doesn't allow us to update elements in place
2045: ** in case of deletion or update, these are simply written to
2046: ** segment 0 (with an empty token list in case of deletion), with
2047: ** docListAccumulate() taking care to retain lower-segment
2048: ** information in preference to higher-segment information.
2049: */
2050: /* TODO(shess) Provide a VACUUM type operation which both removes
2051: ** deleted elements which are no longer necessary, and duplicated
2052: ** elements. I suspect this will probably not be necessary in
2053: ** practice, though.
2054: */
2055: static int fulltextCreate(sqlite3 *db, void *pAux,
2056: int argc, const char * const *argv,
2057: sqlite3_vtab **ppVTab, char **pzErr){
2058: int rc;
2059: TableSpec spec;
2060: StringBuffer schema;
2061: TRACE(("FTS1 Create\n"));
2062:
2063: rc = parseSpec(&spec, argc, argv, pzErr);
2064: if( rc!=SQLITE_OK ) return rc;
2065:
2066: initStringBuffer(&schema);
2067: append(&schema, "CREATE TABLE %_content(");
2068: appendList(&schema, spec.nColumn, spec.azContentColumn);
2069: append(&schema, ")");
2070: rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
2071: free(schema.s);
2072: if( rc!=SQLITE_OK ) goto out;
2073:
2074: rc = sql_exec(db, spec.zDb, spec.zName,
2075: "create table %_term(term text, segment integer, doclist blob, "
2076: "primary key(term, segment));");
2077: if( rc!=SQLITE_OK ) goto out;
2078:
2079: rc = constructVtab(db, &spec, ppVTab, pzErr);
2080:
2081: out:
2082: clearTableSpec(&spec);
2083: return rc;
2084: }
2085:
2086: /* Decide how to handle an SQL query. */
2087: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
2088: int i;
2089: TRACE(("FTS1 BestIndex\n"));
2090:
2091: for(i=0; i<pInfo->nConstraint; ++i){
2092: const struct sqlite3_index_constraint *pConstraint;
2093: pConstraint = &pInfo->aConstraint[i];
2094: if( pConstraint->usable ) {
2095: if( pConstraint->iColumn==-1 &&
2096: pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
2097: pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
2098: TRACE(("FTS1 QUERY_ROWID\n"));
2099: } else if( pConstraint->iColumn>=0 &&
2100: pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
2101: /* full-text search */
2102: pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
2103: TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
2104: } else continue;
2105:
2106: pInfo->aConstraintUsage[i].argvIndex = 1;
2107: pInfo->aConstraintUsage[i].omit = 1;
2108:
2109: /* An arbitrary value for now.
2110: * TODO: Perhaps rowid matches should be considered cheaper than
2111: * full-text searches. */
2112: pInfo->estimatedCost = 1.0;
2113:
2114: return SQLITE_OK;
2115: }
2116: }
2117: pInfo->idxNum = QUERY_GENERIC;
2118: return SQLITE_OK;
2119: }
2120:
2121: static int fulltextDisconnect(sqlite3_vtab *pVTab){
2122: TRACE(("FTS1 Disconnect %p\n", pVTab));
2123: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2124: return SQLITE_OK;
2125: }
2126:
2127: static int fulltextDestroy(sqlite3_vtab *pVTab){
2128: fulltext_vtab *v = (fulltext_vtab *)pVTab;
2129: int rc;
2130:
2131: TRACE(("FTS1 Destroy %p\n", pVTab));
2132: rc = sql_exec(v->db, v->zDb, v->zName,
2133: "drop table if exists %_content;"
2134: "drop table if exists %_term;"
2135: );
2136: if( rc!=SQLITE_OK ) return rc;
2137:
2138: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
2139: return SQLITE_OK;
2140: }
2141:
2142: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2143: fulltext_cursor *c;
2144:
2145: c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
2146: /* sqlite will initialize c->base */
2147: *ppCursor = &c->base;
2148: TRACE(("FTS1 Open %p: %p\n", pVTab, c));
2149:
2150: return SQLITE_OK;
2151: }
2152:
2153:
2154: /* Free all of the dynamically allocated memory held by *q
2155: */
2156: static void queryClear(Query *q){
2157: int i;
2158: for(i = 0; i < q->nTerms; ++i){
2159: free(q->pTerms[i].pTerm);
2160: }
2161: free(q->pTerms);
2162: memset(q, 0, sizeof(*q));
2163: }
2164:
2165: /* Free all of the dynamically allocated memory held by the
2166: ** Snippet
2167: */
2168: static void snippetClear(Snippet *p){
2169: free(p->aMatch);
2170: free(p->zOffset);
2171: free(p->zSnippet);
2172: memset(p, 0, sizeof(*p));
2173: }
2174: /*
2175: ** Append a single entry to the p->aMatch[] log.
2176: */
2177: static void snippetAppendMatch(
2178: Snippet *p, /* Append the entry to this snippet */
2179: int iCol, int iTerm, /* The column and query term */
2180: int iStart, int nByte /* Offset and size of the match */
2181: ){
2182: int i;
2183: struct snippetMatch *pMatch;
2184: if( p->nMatch+1>=p->nAlloc ){
2185: p->nAlloc = p->nAlloc*2 + 10;
2186: p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
2187: if( p->aMatch==0 ){
2188: p->nMatch = 0;
2189: p->nAlloc = 0;
2190: return;
2191: }
2192: }
2193: i = p->nMatch++;
2194: pMatch = &p->aMatch[i];
2195: pMatch->iCol = iCol;
2196: pMatch->iTerm = iTerm;
2197: pMatch->iStart = iStart;
2198: pMatch->nByte = nByte;
2199: }
2200:
2201: /*
2202: ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
2203: */
2204: #define FTS1_ROTOR_SZ (32)
2205: #define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
2206:
2207: /*
2208: ** Add entries to pSnippet->aMatch[] for every match that occurs against
2209: ** document zDoc[0..nDoc-1] which is stored in column iColumn.
2210: */
2211: static void snippetOffsetsOfColumn(
2212: Query *pQuery,
2213: Snippet *pSnippet,
2214: int iColumn,
2215: const char *zDoc,
2216: int nDoc
2217: ){
2218: const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
2219: sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
2220: sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
2221: fulltext_vtab *pVtab; /* The full text index */
2222: int nColumn; /* Number of columns in the index */
2223: const QueryTerm *aTerm; /* Query string terms */
2224: int nTerm; /* Number of query string terms */
2225: int i, j; /* Loop counters */
2226: int rc; /* Return code */
2227: unsigned int match, prevMatch; /* Phrase search bitmasks */
2228: const char *zToken; /* Next token from the tokenizer */
2229: int nToken; /* Size of zToken */
2230: int iBegin, iEnd, iPos; /* Offsets of beginning and end */
2231:
2232: /* The following variables keep a circular buffer of the last
2233: ** few tokens */
2234: unsigned int iRotor = 0; /* Index of current token */
2235: int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
2236: int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
2237:
2238: pVtab = pQuery->pFts;
2239: nColumn = pVtab->nColumn;
2240: pTokenizer = pVtab->pTokenizer;
2241: pTModule = pTokenizer->pModule;
2242: rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
2243: if( rc ) return;
2244: pTCursor->pTokenizer = pTokenizer;
2245: aTerm = pQuery->pTerms;
2246: nTerm = pQuery->nTerms;
2247: if( nTerm>=FTS1_ROTOR_SZ ){
2248: nTerm = FTS1_ROTOR_SZ - 1;
2249: }
2250: prevMatch = 0;
2251: while(1){
2252: rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
2253: if( rc ) break;
2254: iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
2255: iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
2256: match = 0;
2257: for(i=0; i<nTerm; i++){
2258: int iCol;
2259: iCol = aTerm[i].iColumn;
2260: if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
2261: if( aTerm[i].nTerm!=nToken ) continue;
2262: if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
2263: if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
2264: match |= 1<<i;
2265: if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
2266: for(j=aTerm[i].iPhrase-1; j>=0; j--){
2267: int k = (iRotor-j) & FTS1_ROTOR_MASK;
2268: snippetAppendMatch(pSnippet, iColumn, i-j,
2269: iRotorBegin[k], iRotorLen[k]);
2270: }
2271: }
2272: }
2273: prevMatch = match<<1;
2274: iRotor++;
2275: }
2276: pTModule->xClose(pTCursor);
2277: }
2278:
2279:
2280: /*
2281: ** Compute all offsets for the current row of the query.
2282: ** If the offsets have already been computed, this routine is a no-op.
2283: */
2284: static void snippetAllOffsets(fulltext_cursor *p){
2285: int nColumn;
2286: int iColumn, i;
2287: int iFirst, iLast;
2288: fulltext_vtab *pFts;
2289:
2290: if( p->snippet.nMatch ) return;
2291: if( p->q.nTerms==0 ) return;
2292: pFts = p->q.pFts;
2293: nColumn = pFts->nColumn;
2294: iColumn = p->iCursorType - QUERY_FULLTEXT;
2295: if( iColumn<0 || iColumn>=nColumn ){
2296: iFirst = 0;
2297: iLast = nColumn-1;
2298: }else{
2299: iFirst = iColumn;
2300: iLast = iColumn;
2301: }
2302: for(i=iFirst; i<=iLast; i++){
2303: const char *zDoc;
2304: int nDoc;
2305: zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
2306: nDoc = sqlite3_column_bytes(p->pStmt, i+1);
2307: snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
2308: }
2309: }
2310:
2311: /*
2312: ** Convert the information in the aMatch[] array of the snippet
2313: ** into the string zOffset[0..nOffset-1].
2314: */
2315: static void snippetOffsetText(Snippet *p){
2316: int i;
2317: int cnt = 0;
2318: StringBuffer sb;
2319: char zBuf[200];
2320: if( p->zOffset ) return;
2321: initStringBuffer(&sb);
2322: for(i=0; i<p->nMatch; i++){
2323: struct snippetMatch *pMatch = &p->aMatch[i];
2324: zBuf[0] = ' ';
2325: sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
2326: pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
2327: append(&sb, zBuf);
2328: cnt++;
2329: }
2330: p->zOffset = sb.s;
2331: p->nOffset = sb.len;
2332: }
2333:
2334: /*
2335: ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
2336: ** of matching words some of which might be in zDoc. zDoc is column
2337: ** number iCol.
2338: **
2339: ** iBreak is suggested spot in zDoc where we could begin or end an
2340: ** excerpt. Return a value similar to iBreak but possibly adjusted
2341: ** to be a little left or right so that the break point is better.
2342: */
2343: static int wordBoundary(
2344: int iBreak, /* The suggested break point */
2345: const char *zDoc, /* Document text */
2346: int nDoc, /* Number of bytes in zDoc[] */
2347: struct snippetMatch *aMatch, /* Matching words */
2348: int nMatch, /* Number of entries in aMatch[] */
2349: int iCol /* The column number for zDoc[] */
2350: ){
2351: int i;
2352: if( iBreak<=10 ){
2353: return 0;
2354: }
2355: if( iBreak>=nDoc-10 ){
2356: return nDoc;
2357: }
2358: for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
2359: while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
2360: if( i<nMatch ){
2361: if( aMatch[i].iStart<iBreak+10 ){
2362: return aMatch[i].iStart;
2363: }
2364: if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
2365: return aMatch[i-1].iStart;
2366: }
2367: }
2368: for(i=1; i<=10; i++){
2369: if( safe_isspace(zDoc[iBreak-i]) ){
2370: return iBreak - i + 1;
2371: }
2372: if( safe_isspace(zDoc[iBreak+i]) ){
2373: return iBreak + i + 1;
2374: }
2375: }
2376: return iBreak;
2377: }
2378:
2379: /*
2380: ** If the StringBuffer does not end in white space, add a single
2381: ** space character to the end.
2382: */
2383: static void appendWhiteSpace(StringBuffer *p){
2384: if( p->len==0 ) return;
2385: if( safe_isspace(p->s[p->len-1]) ) return;
2386: append(p, " ");
2387: }
2388:
2389: /*
2390: ** Remove white space from teh end of the StringBuffer
2391: */
2392: static void trimWhiteSpace(StringBuffer *p){
2393: while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
2394: p->len--;
2395: }
2396: }
2397:
2398:
2399:
2400: /*
2401: ** Allowed values for Snippet.aMatch[].snStatus
2402: */
2403: #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
2404: #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
2405:
2406: /*
2407: ** Generate the text of a snippet.
2408: */
2409: static void snippetText(
2410: fulltext_cursor *pCursor, /* The cursor we need the snippet for */
2411: const char *zStartMark, /* Markup to appear before each match */
2412: const char *zEndMark, /* Markup to appear after each match */
2413: const char *zEllipsis /* Ellipsis mark */
2414: ){
2415: int i, j;
2416: struct snippetMatch *aMatch;
2417: int nMatch;
2418: int nDesired;
2419: StringBuffer sb;
2420: int tailCol;
2421: int tailOffset;
2422: int iCol;
2423: int nDoc;
2424: const char *zDoc;
2425: int iStart, iEnd;
2426: int tailEllipsis = 0;
2427: int iMatch;
2428:
2429:
2430: free(pCursor->snippet.zSnippet);
2431: pCursor->snippet.zSnippet = 0;
2432: aMatch = pCursor->snippet.aMatch;
2433: nMatch = pCursor->snippet.nMatch;
2434: initStringBuffer(&sb);
2435:
2436: for(i=0; i<nMatch; i++){
2437: aMatch[i].snStatus = SNIPPET_IGNORE;
2438: }
2439: nDesired = 0;
2440: for(i=0; i<pCursor->q.nTerms; i++){
2441: for(j=0; j<nMatch; j++){
2442: if( aMatch[j].iTerm==i ){
2443: aMatch[j].snStatus = SNIPPET_DESIRED;
2444: nDesired++;
2445: break;
2446: }
2447: }
2448: }
2449:
2450: iMatch = 0;
2451: tailCol = -1;
2452: tailOffset = 0;
2453: for(i=0; i<nMatch && nDesired>0; i++){
2454: if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
2455: nDesired--;
2456: iCol = aMatch[i].iCol;
2457: zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
2458: nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
2459: iStart = aMatch[i].iStart - 40;
2460: iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
2461: if( iStart<=10 ){
2462: iStart = 0;
2463: }
2464: if( iCol==tailCol && iStart<=tailOffset+20 ){
2465: iStart = tailOffset;
2466: }
2467: if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
2468: trimWhiteSpace(&sb);
2469: appendWhiteSpace(&sb);
2470: append(&sb, zEllipsis);
2471: appendWhiteSpace(&sb);
2472: }
2473: iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
2474: iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
2475: if( iEnd>=nDoc-10 ){
2476: iEnd = nDoc;
2477: tailEllipsis = 0;
2478: }else{
2479: tailEllipsis = 1;
2480: }
2481: while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
2482: while( iStart<iEnd ){
2483: while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
2484: && aMatch[iMatch].iCol<=iCol ){
2485: iMatch++;
2486: }
2487: if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
2488: && aMatch[iMatch].iCol==iCol ){
2489: nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
2490: iStart = aMatch[iMatch].iStart;
2491: append(&sb, zStartMark);
2492: nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
2493: append(&sb, zEndMark);
2494: iStart += aMatch[iMatch].nByte;
2495: for(j=iMatch+1; j<nMatch; j++){
2496: if( aMatch[j].iTerm==aMatch[iMatch].iTerm
2497: && aMatch[j].snStatus==SNIPPET_DESIRED ){
2498: nDesired--;
2499: aMatch[j].snStatus = SNIPPET_IGNORE;
2500: }
2501: }
2502: }else{
2503: nappend(&sb, &zDoc[iStart], iEnd - iStart);
2504: iStart = iEnd;
2505: }
2506: }
2507: tailCol = iCol;
2508: tailOffset = iEnd;
2509: }
2510: trimWhiteSpace(&sb);
2511: if( tailEllipsis ){
2512: appendWhiteSpace(&sb);
2513: append(&sb, zEllipsis);
2514: }
2515: pCursor->snippet.zSnippet = sb.s;
2516: pCursor->snippet.nSnippet = sb.len;
2517: }
2518:
2519:
2520: /*
2521: ** Close the cursor. For additional information see the documentation
2522: ** on the xClose method of the virtual table interface.
2523: */
2524: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
2525: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2526: TRACE(("FTS1 Close %p\n", c));
2527: sqlite3_finalize(c->pStmt);
2528: queryClear(&c->q);
2529: snippetClear(&c->snippet);
2530: if( c->result.pDoclist!=NULL ){
2531: docListDelete(c->result.pDoclist);
2532: }
2533: free(c);
2534: return SQLITE_OK;
2535: }
2536:
2537: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
2538: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2539: sqlite_int64 iDocid;
2540: int rc;
2541:
2542: TRACE(("FTS1 Next %p\n", pCursor));
2543: snippetClear(&c->snippet);
2544: if( c->iCursorType < QUERY_FULLTEXT ){
2545: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
2546: rc = sqlite3_step(c->pStmt);
2547: switch( rc ){
2548: case SQLITE_ROW:
2549: c->eof = 0;
2550: return SQLITE_OK;
2551: case SQLITE_DONE:
2552: c->eof = 1;
2553: return SQLITE_OK;
2554: default:
2555: c->eof = 1;
2556: return rc;
2557: }
2558: } else { /* full-text query */
2559: rc = sqlite3_reset(c->pStmt);
2560: if( rc!=SQLITE_OK ) return rc;
2561:
2562: iDocid = nextDocid(&c->result);
2563: if( iDocid==0 ){
2564: c->eof = 1;
2565: return SQLITE_OK;
2566: }
2567: rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
2568: if( rc!=SQLITE_OK ) return rc;
2569: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
2570: rc = sqlite3_step(c->pStmt);
2571: if( rc==SQLITE_ROW ){ /* the case we expect */
2572: c->eof = 0;
2573: return SQLITE_OK;
2574: }
2575: /* an error occurred; abort */
2576: return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
2577: }
2578: }
2579:
2580:
2581: /* Return a DocList corresponding to the query term *pTerm. If *pTerm
2582: ** is the first term of a phrase query, go ahead and evaluate the phrase
2583: ** query and return the doclist for the entire phrase query.
2584: **
2585: ** The result is stored in pTerm->doclist.
2586: */
2587: static int docListOfTerm(
2588: fulltext_vtab *v, /* The full text index */
2589: int iColumn, /* column to restrict to. No restrition if >=nColumn */
2590: QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
2591: DocList **ppResult /* Write the result here */
2592: ){
2593: DocList *pLeft, *pRight, *pNew;
2594: int i, rc;
2595:
2596: pLeft = docListNew(DL_POSITIONS);
2597: rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
2598: if( rc ){
2599: docListDelete(pLeft);
2600: return rc;
2601: }
2602: for(i=1; i<=pQTerm->nPhrase; i++){
2603: pRight = docListNew(DL_POSITIONS);
2604: rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
2605: if( rc ){
2606: docListDelete(pLeft);
2607: return rc;
2608: }
2609: pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
2610: docListPhraseMerge(pLeft, pRight, pNew);
2611: docListDelete(pLeft);
2612: docListDelete(pRight);
2613: pLeft = pNew;
2614: }
2615: *ppResult = pLeft;
2616: return SQLITE_OK;
2617: }
2618:
2619: /* Add a new term pTerm[0..nTerm-1] to the query *q.
2620: */
2621: static void queryAdd(Query *q, const char *pTerm, int nTerm){
2622: QueryTerm *t;
2623: ++q->nTerms;
2624: q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
2625: if( q->pTerms==0 ){
2626: q->nTerms = 0;
2627: return;
2628: }
2629: t = &q->pTerms[q->nTerms - 1];
2630: memset(t, 0, sizeof(*t));
2631: t->pTerm = malloc(nTerm+1);
2632: memcpy(t->pTerm, pTerm, nTerm);
2633: t->pTerm[nTerm] = 0;
2634: t->nTerm = nTerm;
2635: t->isOr = q->nextIsOr;
2636: q->nextIsOr = 0;
2637: t->iColumn = q->nextColumn;
2638: q->nextColumn = q->dfltColumn;
2639: }
2640:
2641: /*
2642: ** Check to see if the string zToken[0...nToken-1] matches any
2643: ** column name in the virtual table. If it does,
2644: ** return the zero-indexed column number. If not, return -1.
2645: */
2646: static int checkColumnSpecifier(
2647: fulltext_vtab *pVtab, /* The virtual table */
2648: const char *zToken, /* Text of the token */
2649: int nToken /* Number of characters in the token */
2650: ){
2651: int i;
2652: for(i=0; i<pVtab->nColumn; i++){
2653: if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
2654: && pVtab->azColumn[i][nToken]==0 ){
2655: return i;
2656: }
2657: }
2658: return -1;
2659: }
2660:
2661: /*
2662: ** Parse the text at pSegment[0..nSegment-1]. Add additional terms
2663: ** to the query being assemblied in pQuery.
2664: **
2665: ** inPhrase is true if pSegment[0..nSegement-1] is contained within
2666: ** double-quotes. If inPhrase is true, then the first term
2667: ** is marked with the number of terms in the phrase less one and
2668: ** OR and "-" syntax is ignored. If inPhrase is false, then every
2669: ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
2670: */
2671: static int tokenizeSegment(
2672: sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
2673: const char *pSegment, int nSegment, /* Query expression being parsed */
2674: int inPhrase, /* True if within "..." */
2675: Query *pQuery /* Append results here */
2676: ){
2677: const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
2678: sqlite3_tokenizer_cursor *pCursor;
2679: int firstIndex = pQuery->nTerms;
2680: int iCol;
2681: int nTerm = 1;
2682:
2683: int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
2684: if( rc!=SQLITE_OK ) return rc;
2685: pCursor->pTokenizer = pTokenizer;
2686:
2687: while( 1 ){
2688: const char *pToken;
2689: int nToken, iBegin, iEnd, iPos;
2690:
2691: rc = pModule->xNext(pCursor,
2692: &pToken, &nToken,
2693: &iBegin, &iEnd, &iPos);
2694: if( rc!=SQLITE_OK ) break;
2695: if( !inPhrase &&
2696: pSegment[iEnd]==':' &&
2697: (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
2698: pQuery->nextColumn = iCol;
2699: continue;
2700: }
2701: if( !inPhrase && pQuery->nTerms>0 && nToken==2
2702: && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
2703: pQuery->nextIsOr = 1;
2704: continue;
2705: }
2706: queryAdd(pQuery, pToken, nToken);
2707: if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
2708: pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
2709: }
2710: pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
2711: if( inPhrase ){
2712: nTerm++;
2713: }
2714: }
2715:
2716: if( inPhrase && pQuery->nTerms>firstIndex ){
2717: pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
2718: }
2719:
2720: return pModule->xClose(pCursor);
2721: }
2722:
2723: /* Parse a query string, yielding a Query object pQuery.
2724: **
2725: ** The calling function will need to queryClear() to clean up
2726: ** the dynamically allocated memory held by pQuery.
2727: */
2728: static int parseQuery(
2729: fulltext_vtab *v, /* The fulltext index */
2730: const char *zInput, /* Input text of the query string */
2731: int nInput, /* Size of the input text */
2732: int dfltColumn, /* Default column of the index to match against */
2733: Query *pQuery /* Write the parse results here. */
2734: ){
2735: int iInput, inPhrase = 0;
2736:
2737: if( zInput==0 ) nInput = 0;
2738: if( nInput<0 ) nInput = strlen(zInput);
2739: pQuery->nTerms = 0;
2740: pQuery->pTerms = NULL;
2741: pQuery->nextIsOr = 0;
2742: pQuery->nextColumn = dfltColumn;
2743: pQuery->dfltColumn = dfltColumn;
2744: pQuery->pFts = v;
2745:
2746: for(iInput=0; iInput<nInput; ++iInput){
2747: int i;
2748: for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
2749: if( i>iInput ){
2750: tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
2751: pQuery);
2752: }
2753: iInput = i;
2754: if( i<nInput ){
2755: assert( zInput[i]=='"' );
2756: inPhrase = !inPhrase;
2757: }
2758: }
2759:
2760: if( inPhrase ){
2761: /* unmatched quote */
2762: queryClear(pQuery);
2763: return SQLITE_ERROR;
2764: }
2765: return SQLITE_OK;
2766: }
2767:
2768: /* Perform a full-text query using the search expression in
2769: ** zInput[0..nInput-1]. Return a list of matching documents
2770: ** in pResult.
2771: **
2772: ** Queries must match column iColumn. Or if iColumn>=nColumn
2773: ** they are allowed to match against any column.
2774: */
2775: static int fulltextQuery(
2776: fulltext_vtab *v, /* The full text index */
2777: int iColumn, /* Match against this column by default */
2778: const char *zInput, /* The query string */
2779: int nInput, /* Number of bytes in zInput[] */
2780: DocList **pResult, /* Write the result doclist here */
2781: Query *pQuery /* Put parsed query string here */
2782: ){
2783: int i, iNext, rc;
2784: DocList *pLeft = NULL;
2785: DocList *pRight, *pNew, *pOr;
2786: int nNot = 0;
2787: QueryTerm *aTerm;
2788:
2789: rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
2790: if( rc!=SQLITE_OK ) return rc;
2791:
2792: /* Merge AND terms. */
2793: aTerm = pQuery->pTerms;
2794: for(i = 0; i<pQuery->nTerms; i=iNext){
2795: if( aTerm[i].isNot ){
2796: /* Handle all NOT terms in a separate pass */
2797: nNot++;
2798: iNext = i + aTerm[i].nPhrase+1;
2799: continue;
2800: }
2801: iNext = i + aTerm[i].nPhrase + 1;
2802: rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
2803: if( rc ){
2804: queryClear(pQuery);
2805: return rc;
2806: }
2807: while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
2808: rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
2809: iNext += aTerm[iNext].nPhrase + 1;
2810: if( rc ){
2811: queryClear(pQuery);
2812: return rc;
2813: }
2814: pNew = docListNew(DL_DOCIDS);
2815: docListOrMerge(pRight, pOr, pNew);
2816: docListDelete(pRight);
2817: docListDelete(pOr);
2818: pRight = pNew;
2819: }
2820: if( pLeft==0 ){
2821: pLeft = pRight;
2822: }else{
2823: pNew = docListNew(DL_DOCIDS);
2824: docListAndMerge(pLeft, pRight, pNew);
2825: docListDelete(pRight);
2826: docListDelete(pLeft);
2827: pLeft = pNew;
2828: }
2829: }
2830:
2831: if( nNot && pLeft==0 ){
2832: /* We do not yet know how to handle a query of only NOT terms */
2833: return SQLITE_ERROR;
2834: }
2835:
2836: /* Do the EXCEPT terms */
2837: for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
2838: if( !aTerm[i].isNot ) continue;
2839: rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
2840: if( rc ){
2841: queryClear(pQuery);
2842: docListDelete(pLeft);
2843: return rc;
2844: }
2845: pNew = docListNew(DL_DOCIDS);
2846: docListExceptMerge(pLeft, pRight, pNew);
2847: docListDelete(pRight);
2848: docListDelete(pLeft);
2849: pLeft = pNew;
2850: }
2851:
2852: *pResult = pLeft;
2853: return rc;
2854: }
2855:
2856: /*
2857: ** This is the xFilter interface for the virtual table. See
2858: ** the virtual table xFilter method documentation for additional
2859: ** information.
2860: **
2861: ** If idxNum==QUERY_GENERIC then do a full table scan against
2862: ** the %_content table.
2863: **
2864: ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
2865: ** in the %_content table.
2866: **
2867: ** If idxNum>=QUERY_FULLTEXT then use the full text index. The
2868: ** column on the left-hand side of the MATCH operator is column
2869: ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
2870: ** side of the MATCH operator.
2871: */
2872: /* TODO(shess) Upgrade the cursor initialization and destruction to
2873: ** account for fulltextFilter() being called multiple times on the
2874: ** same cursor. The current solution is very fragile. Apply fix to
2875: ** fts2 as appropriate.
2876: */
2877: static int fulltextFilter(
2878: sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
2879: int idxNum, const char *idxStr, /* Which indexing scheme to use */
2880: int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
2881: ){
2882: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2883: fulltext_vtab *v = cursor_vtab(c);
2884: int rc;
2885: char *zSql;
2886:
2887: TRACE(("FTS1 Filter %p\n",pCursor));
2888:
2889: zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
2890: idxNum==QUERY_GENERIC ? "" : "where rowid=?");
2891: sqlite3_finalize(c->pStmt);
2892: rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
2893: sqlite3_free(zSql);
2894: if( rc!=SQLITE_OK ) return rc;
2895:
2896: c->iCursorType = idxNum;
2897: switch( idxNum ){
2898: case QUERY_GENERIC:
2899: break;
2900:
2901: case QUERY_ROWID:
2902: rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
2903: if( rc!=SQLITE_OK ) return rc;
2904: break;
2905:
2906: default: /* full-text search */
2907: {
2908: const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
2909: DocList *pResult;
2910: assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
2911: assert( argc==1 );
2912: queryClear(&c->q);
2913: rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
2914: if( rc!=SQLITE_OK ) return rc;
2915: if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
2916: readerInit(&c->result, pResult);
2917: break;
2918: }
2919: }
2920:
2921: return fulltextNext(pCursor);
2922: }
2923:
2924: /* This is the xEof method of the virtual table. The SQLite core
2925: ** calls this routine to find out if it has reached the end of
2926: ** a query's results set.
2927: */
2928: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
2929: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2930: return c->eof;
2931: }
2932:
2933: /* This is the xColumn method of the virtual table. The SQLite
2934: ** core calls this method during a query when it needs the value
2935: ** of a column from the virtual table. This method needs to use
2936: ** one of the sqlite3_result_*() routines to store the requested
2937: ** value back in the pContext.
2938: */
2939: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
2940: sqlite3_context *pContext, int idxCol){
2941: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2942: fulltext_vtab *v = cursor_vtab(c);
2943:
2944: if( idxCol<v->nColumn ){
2945: sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
2946: sqlite3_result_value(pContext, pVal);
2947: }else if( idxCol==v->nColumn ){
2948: /* The extra column whose name is the same as the table.
2949: ** Return a blob which is a pointer to the cursor
2950: */
2951: sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
2952: }
2953: return SQLITE_OK;
2954: }
2955:
2956: /* This is the xRowid method. The SQLite core calls this routine to
2957: ** retrive the rowid for the current row of the result set. The
2958: ** rowid should be written to *pRowid.
2959: */
2960: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
2961: fulltext_cursor *c = (fulltext_cursor *) pCursor;
2962:
2963: *pRowid = sqlite3_column_int64(c->pStmt, 0);
2964: return SQLITE_OK;
2965: }
2966:
2967: /* Add all terms in [zText] to the given hash table. If [iColumn] > 0,
2968: * we also store positions and offsets in the hash table using the given
2969: * column number. */
2970: static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
2971: const char *zText, int iColumn){
2972: sqlite3_tokenizer *pTokenizer = v->pTokenizer;
2973: sqlite3_tokenizer_cursor *pCursor;
2974: const char *pToken;
2975: int nTokenBytes;
2976: int iStartOffset, iEndOffset, iPosition;
2977: int rc;
2978:
2979: rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
2980: if( rc!=SQLITE_OK ) return rc;
2981:
2982: pCursor->pTokenizer = pTokenizer;
2983: while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
2984: &pToken, &nTokenBytes,
2985: &iStartOffset, &iEndOffset,
2986: &iPosition) ){
2987: DocList *p;
2988:
2989: /* Positions can't be negative; we use -1 as a terminator internally. */
2990: if( iPosition<0 ){
2991: pTokenizer->pModule->xClose(pCursor);
2992: return SQLITE_ERROR;
2993: }
2994:
2995: p = fts1HashFind(terms, pToken, nTokenBytes);
2996: if( p==NULL ){
2997: p = docListNew(DL_DEFAULT);
2998: docListAddDocid(p, iDocid);
2999: fts1HashInsert(terms, pToken, nTokenBytes, p);
3000: }
3001: if( iColumn>=0 ){
3002: docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
3003: }
3004: }
3005:
3006: /* TODO(shess) Check return? Should this be able to cause errors at
3007: ** this point? Actually, same question about sqlite3_finalize(),
3008: ** though one could argue that failure there means that the data is
3009: ** not durable. *ponder*
3010: */
3011: pTokenizer->pModule->xClose(pCursor);
3012: return rc;
3013: }
3014:
3015: /* Update the %_terms table to map the term [pTerm] to the given rowid. */
3016: static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
3017: DocList *d){
3018: sqlite_int64 iIndexRow;
3019: DocList doclist;
3020: int iSegment = 0, rc;
3021:
3022: rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
3023: if( rc==SQLITE_DONE ){
3024: docListInit(&doclist, DL_DEFAULT, 0, 0);
3025: docListUpdate(&doclist, d);
3026: /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
3027: rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
3028: goto err;
3029: }
3030: if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
3031:
3032: docListUpdate(&doclist, d);
3033: if( doclist.nData<=CHUNK_MAX ){
3034: rc = term_update(v, iIndexRow, &doclist);
3035: goto err;
3036: }
3037:
3038: /* Doclist doesn't fit, delete what's there, and accumulate
3039: ** forward.
3040: */
3041: rc = term_delete(v, iIndexRow);
3042: if( rc!=SQLITE_OK ) goto err;
3043:
3044: /* Try to insert the doclist into a higher segment bucket. On
3045: ** failure, accumulate existing doclist with the doclist from that
3046: ** bucket, and put results in the next bucket.
3047: */
3048: iSegment++;
3049: while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
3050: &doclist))!=SQLITE_OK ){
3051: sqlite_int64 iSegmentRow;
3052: DocList old;
3053: int rc2;
3054:
3055: /* Retain old error in case the term_insert() error was really an
3056: ** error rather than a bounced insert.
3057: */
3058: rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
3059: if( rc2!=SQLITE_ROW ) goto err;
3060:
3061: rc = term_delete(v, iSegmentRow);
3062: if( rc!=SQLITE_OK ) goto err;
3063:
3064: /* Reusing lowest-number deleted row keeps the index smaller. */
3065: if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
3066:
3067: /* doclist contains the newer data, so accumulate it over old.
3068: ** Then steal accumulated data for doclist.
3069: */
3070: docListAccumulate(&old, &doclist);
3071: docListDestroy(&doclist);
3072: doclist = old;
3073:
3074: iSegment++;
3075: }
3076:
3077: err:
3078: docListDestroy(&doclist);
3079: return rc;
3080: }
3081:
3082: /* Add doclists for all terms in [pValues] to the hash table [terms]. */
3083: static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
3084: sqlite3_value **pValues){
3085: int i;
3086: for(i = 0; i < v->nColumn ; ++i){
3087: char *zText = (char*)sqlite3_value_text(pValues[i]);
3088: int rc = buildTerms(v, terms, iRowid, zText, i);
3089: if( rc!=SQLITE_OK ) return rc;
3090: }
3091: return SQLITE_OK;
3092: }
3093:
3094: /* Add empty doclists for all terms in the given row's content to the hash
3095: * table [pTerms]. */
3096: static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
3097: const char **pValues;
3098: int i;
3099:
3100: int rc = content_select(v, iRowid, &pValues);
3101: if( rc!=SQLITE_OK ) return rc;
3102:
3103: for(i = 0 ; i < v->nColumn; ++i) {
3104: rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
3105: if( rc!=SQLITE_OK ) break;
3106: }
3107:
3108: freeStringArray(v->nColumn, pValues);
3109: return SQLITE_OK;
3110: }
3111:
3112: /* Insert a row into the %_content table; set *piRowid to be the ID of the
3113: * new row. Fill [pTerms] with new doclists for the %_term table. */
3114: static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
3115: sqlite3_value **pValues,
3116: sqlite_int64 *piRowid, fts1Hash *pTerms){
3117: int rc;
3118:
3119: rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
3120: if( rc!=SQLITE_OK ) return rc;
3121: *piRowid = sqlite3_last_insert_rowid(v->db);
3122: return insertTerms(v, pTerms, *piRowid, pValues);
3123: }
3124:
3125: /* Delete a row from the %_content table; fill [pTerms] with empty doclists
3126: * to be written to the %_term table. */
3127: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
3128: int rc = deleteTerms(v, pTerms, iRow);
3129: if( rc!=SQLITE_OK ) return rc;
3130: return content_delete(v, iRow); /* execute an SQL DELETE */
3131: }
3132:
3133: /* Update a row in the %_content table; fill [pTerms] with new doclists for the
3134: * %_term table. */
3135: static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
3136: sqlite3_value **pValues, fts1Hash *pTerms){
3137: /* Generate an empty doclist for each term that previously appeared in this
3138: * row. */
3139: int rc = deleteTerms(v, pTerms, iRow);
3140: if( rc!=SQLITE_OK ) return rc;
3141:
3142: rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
3143: if( rc!=SQLITE_OK ) return rc;
3144:
3145: /* Now add positions for terms which appear in the updated row. */
3146: return insertTerms(v, pTerms, iRow, pValues);
3147: }
3148:
3149: /* This function implements the xUpdate callback; it is the top-level entry
3150: * point for inserting, deleting or updating a row in a full-text table. */
3151: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
3152: sqlite_int64 *pRowid){
3153: fulltext_vtab *v = (fulltext_vtab *) pVtab;
3154: fts1Hash terms; /* maps term string -> PosList */
3155: int rc;
3156: fts1HashElem *e;
3157:
3158: TRACE(("FTS1 Update %p\n", pVtab));
3159:
3160: fts1HashInit(&terms, FTS1_HASH_STRING, 1);
3161:
3162: if( nArg<2 ){
3163: rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
3164: } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
3165: /* An update:
3166: * ppArg[0] = old rowid
3167: * ppArg[1] = new rowid
3168: * ppArg[2..2+v->nColumn-1] = values
3169: * ppArg[2+v->nColumn] = value for magic column (we ignore this)
3170: */
3171: sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
3172: if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
3173: sqlite3_value_int64(ppArg[1]) != rowid ){
3174: rc = SQLITE_ERROR; /* we don't allow changing the rowid */
3175: } else {
3176: assert( nArg==2+v->nColumn+1);
3177: rc = index_update(v, rowid, &ppArg[2], &terms);
3178: }
3179: } else {
3180: /* An insert:
3181: * ppArg[1] = requested rowid
3182: * ppArg[2..2+v->nColumn-1] = values
3183: * ppArg[2+v->nColumn] = value for magic column (we ignore this)
3184: */
3185: assert( nArg==2+v->nColumn+1);
3186: rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
3187: }
3188:
3189: if( rc==SQLITE_OK ){
3190: /* Write updated doclists to disk. */
3191: for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
3192: DocList *p = fts1HashData(e);
3193: rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
3194: if( rc!=SQLITE_OK ) break;
3195: }
3196: }
3197:
3198: /* clean up */
3199: for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
3200: DocList *p = fts1HashData(e);
3201: docListDelete(p);
3202: }
3203: fts1HashClear(&terms);
3204:
3205: return rc;
3206: }
3207:
3208: /*
3209: ** Implementation of the snippet() function for FTS1
3210: */
3211: static void snippetFunc(
3212: sqlite3_context *pContext,
3213: int argc,
3214: sqlite3_value **argv
3215: ){
3216: fulltext_cursor *pCursor;
3217: if( argc<1 ) return;
3218: if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
3219: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
3220: sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
3221: }else{
3222: const char *zStart = "<b>";
3223: const char *zEnd = "</b>";
3224: const char *zEllipsis = "<b>...</b>";
3225: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
3226: if( argc>=2 ){
3227: zStart = (const char*)sqlite3_value_text(argv[1]);
3228: if( argc>=3 ){
3229: zEnd = (const char*)sqlite3_value_text(argv[2]);
3230: if( argc>=4 ){
3231: zEllipsis = (const char*)sqlite3_value_text(argv[3]);
3232: }
3233: }
3234: }
3235: snippetAllOffsets(pCursor);
3236: snippetText(pCursor, zStart, zEnd, zEllipsis);
3237: sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
3238: pCursor->snippet.nSnippet, SQLITE_STATIC);
3239: }
3240: }
3241:
3242: /*
3243: ** Implementation of the offsets() function for FTS1
3244: */
3245: static void snippetOffsetsFunc(
3246: sqlite3_context *pContext,
3247: int argc,
3248: sqlite3_value **argv
3249: ){
3250: fulltext_cursor *pCursor;
3251: if( argc<1 ) return;
3252: if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
3253: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
3254: sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
3255: }else{
3256: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
3257: snippetAllOffsets(pCursor);
3258: snippetOffsetText(&pCursor->snippet);
3259: sqlite3_result_text(pContext,
3260: pCursor->snippet.zOffset, pCursor->snippet.nOffset,
3261: SQLITE_STATIC);
3262: }
3263: }
3264:
3265: /*
3266: ** This routine implements the xFindFunction method for the FTS1
3267: ** virtual table.
3268: */
3269: static int fulltextFindFunction(
3270: sqlite3_vtab *pVtab,
3271: int nArg,
3272: const char *zName,
3273: void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
3274: void **ppArg
3275: ){
3276: if( strcmp(zName,"snippet")==0 ){
3277: *pxFunc = snippetFunc;
3278: return 1;
3279: }else if( strcmp(zName,"offsets")==0 ){
3280: *pxFunc = snippetOffsetsFunc;
3281: return 1;
3282: }
3283: return 0;
3284: }
3285:
3286: /*
3287: ** Rename an fts1 table.
3288: */
3289: static int fulltextRename(
3290: sqlite3_vtab *pVtab,
3291: const char *zName
3292: ){
3293: fulltext_vtab *p = (fulltext_vtab *)pVtab;
3294: int rc = SQLITE_NOMEM;
3295: char *zSql = sqlite3_mprintf(
3296: "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
3297: "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
3298: , p->zDb, p->zName, zName
3299: , p->zDb, p->zName, zName
3300: );
3301: if( zSql ){
3302: rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
3303: sqlite3_free(zSql);
3304: }
3305: return rc;
3306: }
3307:
3308: static const sqlite3_module fulltextModule = {
3309: /* iVersion */ 0,
3310: /* xCreate */ fulltextCreate,
3311: /* xConnect */ fulltextConnect,
3312: /* xBestIndex */ fulltextBestIndex,
3313: /* xDisconnect */ fulltextDisconnect,
3314: /* xDestroy */ fulltextDestroy,
3315: /* xOpen */ fulltextOpen,
3316: /* xClose */ fulltextClose,
3317: /* xFilter */ fulltextFilter,
3318: /* xNext */ fulltextNext,
3319: /* xEof */ fulltextEof,
3320: /* xColumn */ fulltextColumn,
3321: /* xRowid */ fulltextRowid,
3322: /* xUpdate */ fulltextUpdate,
3323: /* xBegin */ 0,
3324: /* xSync */ 0,
3325: /* xCommit */ 0,
3326: /* xRollback */ 0,
3327: /* xFindFunction */ fulltextFindFunction,
3328: /* xRename */ fulltextRename,
3329: };
3330:
3331: int sqlite3Fts1Init(sqlite3 *db){
3332: sqlite3_overload_function(db, "snippet", -1);
3333: sqlite3_overload_function(db, "offsets", -1);
3334: return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
3335: }
3336:
3337: #if !SQLITE_CORE
3338: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
3339: const sqlite3_api_routines *pApi){
3340: SQLITE_EXTENSION_INIT2(pApi)
3341: return sqlite3Fts1Init(db);
3342: }
3343: #endif
3344:
3345: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>