1: /* The author disclaims copyright to this source code.
2: *
3: * This is an SQLite module implementing full-text search.
4: */
5:
6: #include <assert.h>
7: #if !defined(__APPLE__)
8: #include <malloc.h>
9: #else
10: #include <stdlib.h>
11: #endif
12: #include <stdio.h>
13: #include <string.h>
14: #include <ctype.h>
15:
16: #include "fulltext.h"
17: #include "ft_hash.h"
18: #include "tokenizer.h"
19: #include "sqlite3.h"
20: #include "sqlite3ext.h"
21: SQLITE_EXTENSION_INIT1
22:
23: /* utility functions */
24:
25: /* We encode variable-length integers in little-endian order using seven bits
26: * per byte as follows:
27: **
28: ** KEY:
29: ** A = 0xxxxxxx 7 bits of data and one flag bit
30: ** B = 1xxxxxxx 7 bits of data and one flag bit
31: **
32: ** 7 bits - A
33: ** 14 bits - BA
34: ** 21 bits - BBA
35: ** and so on.
36: */
37:
38: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
39: #define VARINT_MAX 10
40:
41: /* Write a 64-bit variable-length integer to memory starting at p[0].
42: * The length of data written will be between 1 and VARINT_MAX bytes.
43: * The number of bytes written is returned. */
44: static int putVarint(char *p, sqlite_int64 v){
45: unsigned char *q = (unsigned char *) p;
46: sqlite_uint64 vu = v;
47: do{
48: *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
49: vu >>= 7;
50: }while( vu!=0 );
51: q[-1] &= 0x7f; /* turn off high bit in final byte */
52: assert( q - (unsigned char *)p <= VARINT_MAX );
53: return (int) (q - (unsigned char *)p);
54: }
55:
56: /* Read a 64-bit variable-length integer from memory starting at p[0].
57: * Return the number of bytes read, or 0 on error.
58: * The value is stored in *v. */
59: static int getVarint(const char *p, sqlite_int64 *v){
60: const unsigned char *q = (const unsigned char *) p;
61: sqlite_uint64 x = 0, y = 1;
62: while( (*q & 0x80) == 0x80 ){
63: x += y * (*q++ & 0x7f);
64: y <<= 7;
65: if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
66: assert( 0 );
67: return 0;
68: }
69: }
70: x += y * (*q++);
71: *v = (sqlite_int64) x;
72: return (int) (q - (unsigned char *)p);
73: }
74:
75: static int getVarint32(const char *p, int *pi){
76: sqlite_int64 i;
77: int ret = getVarint(p, &i);
78: *pi = (int) i;
79: assert( *pi==i );
80: return ret;
81: }
82:
83: /*** Document lists ***
84: *
85: * A document list holds a sorted list of varint-encoded document IDs.
86: *
87: * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
88: *
89: * array {
90: * varint docid;
91: * array {
92: * varint position; (delta from previous position plus 1, or 0 for end)
93: * varint startOffset; (delta from previous startOffset)
94: * varint endOffset; (delta from startOffset)
95: * }
96: * }
97: *
98: * Here, array { X } means zero or more occurrences of X, adjacent in memory.
99: *
100: * A doclist with type DL_POSITIONS is like the above, but holds only docids
101: * and positions without offset information.
102: *
103: * A doclist with type DL_DOCIDS is like the above, but holds only docids
104: * without positions or offset information.
105: *
106: * On disk, every document list has positions and offsets, so we don't bother
107: * to serialize a doclist's type.
108: *
109: * We don't yet delta-encode document IDs; doing so will probably be a
110: * modest win.
111: *
112: * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
113: * After the first offset, estimate the next offset by using the
114: * current token position and the previous token position and offset,
115: * offset to handle some variance. So the estimate would be
116: * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
117: * as normal. Offsets more than 64 chars from the estimate are
118: * encoded as the delta to the previous start offset + 128. An
119: * additional tiny increment can be gained by using the end offset of
120: * the previous token to make the estimate a tiny bit more precise.
121: */
122:
123: typedef enum DocListType {
124: DL_DOCIDS, /* docids only */
125: DL_POSITIONS, /* docids + positions */
126: DL_POSITIONS_OFFSETS /* docids + positions + offsets */
127: } DocListType;
128:
129: typedef struct DocList {
130: char *pData;
131: int nData;
132: DocListType iType;
133: int iLastPos; /* the last position written */
134: int iLastOffset; /* the last start offset written */
135: } DocList;
136:
137: /* Initialize a new DocList to hold the given data. */
138: static void docListInit(DocList *d, DocListType iType,
139: const char *pData, int nData){
140: d->nData = nData;
141: if( nData>0 ){
142: d->pData = malloc(nData);
143: memcpy(d->pData, pData, nData);
144: } else {
145: d->pData = NULL;
146: }
147: d->iType = iType;
148: d->iLastPos = 0;
149: d->iLastOffset = 0;
150: }
151:
152: /* Create a new dynamically-allocated DocList. */
153: static DocList *docListNew(DocListType iType){
154: DocList *d = (DocList *) malloc(sizeof(DocList));
155: docListInit(d, iType, 0, 0);
156: return d;
157: }
158:
159: static void docListDestroy(DocList *d){
160: free(d->pData);
161: #ifndef NDEBUG
162: memset(d, 0x55, sizeof(*d));
163: #endif
164: }
165:
166: static void docListDelete(DocList *d){
167: docListDestroy(d);
168: free(d);
169: }
170:
171: static char *docListEnd(DocList *d){
172: return d->pData + d->nData;
173: }
174:
175: /* Append a varint to a DocList's data. */
176: static void appendVarint(DocList *d, sqlite_int64 i){
177: char c[VARINT_MAX];
178: int n = putVarint(c, i);
179: d->pData = realloc(d->pData, d->nData + n);
180: memcpy(d->pData + d->nData, c, n);
181: d->nData += n;
182: }
183:
184: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
185: appendVarint(d, iDocid);
186: d->iLastPos = 0;
187: }
188:
189: /* Add a position to the last position list in a doclist. */
190: static void docListAddPos(DocList *d, int iPos){
191: assert( d->iType>=DL_POSITIONS );
192: appendVarint(d, iPos-d->iLastPos+1);
193: d->iLastPos = iPos;
194: }
195:
196: static void docListAddPosOffset(DocList *d, int iPos,
197: int iStartOffset, int iEndOffset){
198: assert( d->iType==DL_POSITIONS_OFFSETS );
199: docListAddPos(d, iPos);
200: appendVarint(d, iStartOffset-d->iLastOffset);
201: d->iLastOffset = iStartOffset;
202: appendVarint(d, iEndOffset-iStartOffset);
203: }
204:
205: /* Terminate the last position list in the given doclist. */
206: static void docListAddEndPos(DocList *d){
207: appendVarint(d, 0);
208: }
209:
210: typedef struct DocListReader {
211: DocList *pDoclist;
212: char *p;
213: int iLastPos; /* the last position read */
214: } DocListReader;
215:
216: static void readerInit(DocListReader *r, DocList *pDoclist){
217: r->pDoclist = pDoclist;
218: if( pDoclist!=NULL ){
219: r->p = pDoclist->pData;
220: }
221: r->iLastPos = 0;
222: }
223:
224: static int readerAtEnd(DocListReader *pReader){
225: return pReader->p >= docListEnd(pReader->pDoclist);
226: }
227:
228: /* Peek at the next docid without advancing the read pointer. */
229: static sqlite_int64 peekDocid(DocListReader *pReader){
230: sqlite_int64 ret;
231: assert( !readerAtEnd(pReader) );
232: getVarint(pReader->p, &ret);
233: return ret;
234: }
235:
236: /* Read the next docid. */
237: static sqlite_int64 readDocid(DocListReader *pReader){
238: sqlite_int64 ret;
239: assert( !readerAtEnd(pReader) );
240: pReader->p += getVarint(pReader->p, &ret);
241: pReader->iLastPos = 0;
242: return ret;
243: }
244:
245: /* Read the next position from a position list.
246: * Returns the position, or -1 at the end of the list. */
247: static int readPosition(DocListReader *pReader){
248: int i;
249: int iType = pReader->pDoclist->iType;
250: assert( iType>=DL_POSITIONS );
251: assert( !readerAtEnd(pReader) );
252:
253: pReader->p += getVarint32(pReader->p, &i);
254: if( i==0 ){
255: pReader->iLastPos = -1;
256: return -1;
257: }
258: pReader->iLastPos += ((int) i)-1;
259: if( iType>=DL_POSITIONS_OFFSETS ){
260: /* Skip over offsets, ignoring them for now. */
261: int iStart, iEnd;
262: pReader->p += getVarint32(pReader->p, &iStart);
263: pReader->p += getVarint32(pReader->p, &iEnd);
264: }
265: return pReader->iLastPos;
266: }
267:
268: /* Skip past the end of a position list. */
269: static void skipPositionList(DocListReader *pReader){
270: while( readPosition(pReader)!=-1 )
271: ;
272: }
273:
274: /* Skip over a docid, including its position list if the doclist has
275: * positions. */
276: static void skipDocument(DocListReader *pReader){
277: readDocid(pReader);
278: if( pReader->pDoclist->iType >= DL_POSITIONS ){
279: skipPositionList(pReader);
280: }
281: }
282:
283: static sqlite_int64 firstDocid(DocList *d){
284: DocListReader r;
285: readerInit(&r, d);
286: return readDocid(&r);
287: }
288:
289: /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid;
290: * otherwise pUpdate, which must contain only the single docid [iDocid], is
291: * inserted (if not present) or updated (if already present). */
292: static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){
293: int modified = 0;
294: DocListReader reader;
295: char *p;
296:
297: if( pUpdate!=NULL ){
298: assert( d->iType==pUpdate->iType);
299: assert( iDocid==firstDocid(pUpdate) );
300: }
301:
302: readerInit(&reader, d);
303: while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){
304: skipDocument(&reader);
305: }
306:
307: p = reader.p;
308: /* Delete if there is a matching element. */
309: if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){
310: skipDocument(&reader);
311: memmove(p, reader.p, docListEnd(d) - reader.p);
312: d->nData -= (reader.p - p);
313: modified = 1;
314: }
315:
316: /* Insert if indicated. */
317: if( pUpdate!=NULL ){
318: int iDoclist = p-d->pData;
319: docListAddEndPos(pUpdate);
320:
321: d->pData = realloc(d->pData, d->nData+pUpdate->nData);
322: p = d->pData + iDoclist;
323:
324: memmove(p+pUpdate->nData, p, docListEnd(d) - p);
325: memcpy(p, pUpdate->pData, pUpdate->nData);
326: d->nData += pUpdate->nData;
327: modified = 1;
328: }
329:
330: return modified;
331: }
332:
333: /* Split the second half of doclist d into a separate doclist d2. Returns 1
334: * if successful, or 0 if d contains a single document and hence can't be
335: * split. */
336: static int docListSplit(DocList *d, DocList *d2){
337: const char *pSplitPoint = d->pData + d->nData / 2;
338: DocListReader reader;
339:
340: readerInit(&reader, d);
341: while( reader.p<pSplitPoint ){
342: skipDocument(&reader);
343: }
344: if( readerAtEnd(&reader) ) return 0;
345: docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);
346: d->nData = reader.p - d->pData;
347: d->pData = realloc(d->pData, d->nData);
348: return 1;
349: }
350:
351: /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
352: * on-disk doclist, resulting in another in-memory DocList [out]. [in]
353: * and [out] may or may not store position information according to the
354: * caller's wishes. The on-disk doclist always comes with positions.
355: *
356: * The caller must read each chunk of the on-disk doclist in succession and
357: * pass it to mergeBlock().
358: *
359: * If [in] has positions, then the merge output contains only documents with
360: * matching positions in the two input doclists. If [in] does not have
361: * positions, then the merge output contains all documents common to the two
362: * input doclists.
363: *
364: * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
365: *
366: * A merge is performed using an integer [iOffset] provided by the caller.
367: * [iOffset] is subtracted from each position in the on-disk doclist for the
368: * purpose of position comparison; this is helpful in implementing phrase
369: * searches.
370: *
371: * A DocListMerge is not yet able to propagate offsets through query
372: * processing; we should add that capability soon.
373: */
374: typedef struct DocListMerge {
375: DocListReader in;
376: DocList *pOut;
377: int iOffset;
378: } DocListMerge;
379:
380: static void mergeInit(DocListMerge *m,
381: DocList *pIn, int iOffset, DocList *pOut){
382: readerInit(&m->in, pIn);
383: m->pOut = pOut;
384: m->iOffset = iOffset;
385:
386: /* can't handle offsets yet */
387: assert( pIn==NULL || pIn->iType <= DL_POSITIONS );
388: assert( pOut->iType <= DL_POSITIONS );
389: }
390:
391: /* A helper function for mergeBlock(), below. Merge the position lists
392: * pointed to by m->in and pBlockReader.
393: * If the merge matches, write [iDocid] to m->pOut; if m->pOut
394: * has positions then write all matching positions as well. */
395: static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
396: DocListReader *pBlockReader){
397: int block_pos = readPosition(pBlockReader);
398: int in_pos = readPosition(&m->in);
399: int match = 0;
400: while( block_pos!=-1 || in_pos!=-1 ){
401: if( block_pos-m->iOffset==in_pos ){
402: if( !match ){
403: docListAddDocid(m->pOut, iDocid);
404: match = 1;
405: }
406: if( m->pOut->iType >= DL_POSITIONS ){
407: docListAddPos(m->pOut, in_pos);
408: }
409: block_pos = readPosition(pBlockReader);
410: in_pos = readPosition(&m->in);
411: } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){
412: block_pos = readPosition(pBlockReader);
413: } else {
414: in_pos = readPosition(&m->in);
415: }
416: }
417: if( m->pOut->iType >= DL_POSITIONS && match ){
418: docListAddEndPos(m->pOut);
419: }
420: }
421:
422: /* Merge one block of an on-disk doclist into a DocListMerge. */
423: static void mergeBlock(DocListMerge *m, DocList *pBlock){
424: DocListReader blockReader;
425: assert( pBlock->iType >= DL_POSITIONS );
426: readerInit(&blockReader, pBlock);
427: while( !readerAtEnd(&blockReader) ){
428: sqlite_int64 iDocid = readDocid(&blockReader);
429: if( m->in.pDoclist!=NULL ){
430: while( 1 ){
431: if( readerAtEnd(&m->in) ) return; /* nothing more to merge */
432: if( peekDocid(&m->in)>=iDocid ) break;
433: skipDocument(&m->in);
434: }
435: if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */
436: skipPositionList(&blockReader); /* skip this docid in the block */
437: continue;
438: }
439: readDocid(&m->in);
440: }
441: /* We have a document match. */
442: if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){
443: /* We don't need to do a poslist merge. */
444: docListAddDocid(m->pOut, iDocid);
445: if( m->pOut->iType >= DL_POSITIONS ){
446: /* Copy all positions to the output doclist. */
447: while( 1 ){
448: int pos = readPosition(&blockReader);
449: if( pos==-1 ) break;
450: docListAddPos(m->pOut, pos);
451: }
452: docListAddEndPos(m->pOut);
453: } else skipPositionList(&blockReader);
454: continue;
455: }
456: mergePosList(m, iDocid, &blockReader);
457: }
458: }
459:
460: static char *string_dup_n(const char *s, int n){
461: char *str = malloc(n + 1);
462: memcpy(str, s, n);
463: str[n] = '\0';
464: return str;
465: }
466:
467: /* Duplicate a string; the caller must free() the returned string.
468: * (We don't use strdup() since it's not part of the standard C library and
469: * may not be available everywhere.) */
470: static char *string_dup(const char *s){
471: return string_dup_n(s, strlen(s));
472: }
473:
474: /* Format a string, replacing each occurrence of the % character with
475: * zName. This may be more convenient than sqlite_mprintf()
476: * when one string is used repeatedly in a format string.
477: * The caller must free() the returned string. */
478: static char *string_format(const char *zFormat, const char *zName){
479: const char *p;
480: size_t len = 0;
481: size_t nName = strlen(zName);
482: char *result;
483: char *r;
484:
485: /* first compute length needed */
486: for(p = zFormat ; *p ; ++p){
487: len += (*p=='%' ? nName : 1);
488: }
489: len += 1; /* for null terminator */
490:
491: r = result = malloc(len);
492: for(p = zFormat; *p; ++p){
493: if( *p=='%' ){
494: memcpy(r, zName, nName);
495: r += nName;
496: } else {
497: *r++ = *p;
498: }
499: }
500: *r++ = '\0';
501: assert( r == result + len );
502: return result;
503: }
504:
505: static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){
506: char *zCommand = string_format(zFormat, zName);
507: int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
508: free(zCommand);
509: return rc;
510: }
511:
512: static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt,
513: const char *zFormat){
514: char *zCommand = string_format(zFormat, zName);
515: int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
516: free(zCommand);
517: return rc;
518: }
519:
520: /* end utility functions */
521:
522: #define QUERY_GENERIC 0
523: #define QUERY_FULLTEXT 1
524:
525: #define CHUNK_MAX 1024
526:
527: typedef enum fulltext_statement {
528: CONTENT_INSERT_STMT,
529: CONTENT_SELECT_STMT,
530: CONTENT_DELETE_STMT,
531:
532: TERM_SELECT_STMT,
533: TERM_CHUNK_SELECT_STMT,
534: TERM_INSERT_STMT,
535: TERM_UPDATE_STMT,
536: TERM_DELETE_STMT,
537:
538: MAX_STMT /* Always at end! */
539: } fulltext_statement;
540:
541: /* These must exactly match the enum above. */
542: /* TODO(adam): Is there some risk that a statement (in particular,
543: ** pTermSelectStmt) will be used in two cursors at once, e.g. if a
544: ** query joins a virtual table to itself? If so perhaps we should
545: ** move some of these to the cursor object.
546: */
547: static const char *fulltext_zStatement[MAX_STMT] = {
548: /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)",
549: /* CONTENT_SELECT */ "select content from %_content where rowid = ?",
550: /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
551:
552: /* TERM_SELECT */
553: "select rowid, doclist from %_term where term = ? and first = ?",
554: /* TERM_CHUNK_SELECT */
555: "select max(first) from %_term where term = ? and first <= ?",
556: /* TERM_INSERT */
557: "insert into %_term (term, first, doclist) values (?, ?, ?)",
558: /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
559: /* TERM_DELETE */ "delete from %_term where rowid = ?",
560: };
561:
562: typedef struct fulltext_vtab {
563: sqlite3_vtab base;
564: sqlite3 *db;
565: const char *zName; /* virtual table name */
566: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
567:
568: /* Precompiled statements which we keep as long as the table is
569: ** open.
570: */
571: sqlite3_stmt *pFulltextStatements[MAX_STMT];
572: } fulltext_vtab;
573:
574: typedef struct fulltext_cursor {
575: sqlite3_vtab_cursor base;
576: int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */
577:
578: sqlite3_stmt *pStmt;
579:
580: int eof;
581:
582: /* The following is used only when iCursorType == QUERY_FULLTEXT. */
583: DocListReader result;
584: } fulltext_cursor;
585:
586: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
587: return (fulltext_vtab *) c->base.pVtab;
588: }
589:
590: static sqlite3_module fulltextModule; /* forward declaration */
591:
592: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
593: ** If the indicated statement has never been prepared, it is prepared
594: ** and cached, otherwise the cached version is reset.
595: */
596: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
597: sqlite3_stmt **ppStmt){
598: assert( iStmt<MAX_STMT );
599: if( v->pFulltextStatements[iStmt]==NULL ){
600: int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
601: fulltext_zStatement[iStmt]);
602: if( rc!=SQLITE_OK ) return rc;
603: } else {
604: int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
605: if( rc!=SQLITE_OK ) return rc;
606: }
607:
608: *ppStmt = v->pFulltextStatements[iStmt];
609: return SQLITE_OK;
610: }
611:
612: /* Step the indicated statement, handling errors SQLITE_BUSY (by
613: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
614: ** bindings to the new statement).
615: ** TODO(adam): We should extend this function so that it can work with
616: ** statements declared locally, not only globally cached statements.
617: */
618: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
619: sqlite3_stmt **ppStmt){
620: int rc;
621: sqlite3_stmt *s = *ppStmt;
622: assert( iStmt<MAX_STMT );
623: assert( s==v->pFulltextStatements[iStmt] );
624:
625: while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
626: sqlite3_stmt *pNewStmt;
627:
628: if( rc==SQLITE_BUSY ) continue;
629: if( rc!=SQLITE_ERROR ) return rc;
630:
631: rc = sqlite3_reset(s);
632: if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
633:
634: v->pFulltextStatements[iStmt] = NULL; /* Still in s */
635: rc = sql_get_statement(v, iStmt, &pNewStmt);
636: if( rc!=SQLITE_OK ) goto err;
637: *ppStmt = pNewStmt;
638:
639: rc = sqlite3_transfer_bindings(s, pNewStmt);
640: if( rc!=SQLITE_OK ) goto err;
641:
642: rc = sqlite3_finalize(s);
643: if( rc!=SQLITE_OK ) return rc;
644: s = pNewStmt;
645: }
646: return rc;
647:
648: err:
649: sqlite3_finalize(s);
650: return rc;
651: }
652:
653: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
654: ** Useful for statements like UPDATE, where we expect no results.
655: */
656: static int sql_single_step_statement(fulltext_vtab *v,
657: fulltext_statement iStmt,
658: sqlite3_stmt **ppStmt){
659: int rc = sql_step_statement(v, iStmt, ppStmt);
660: return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
661: }
662:
663: /* insert into %_content (rowid, content) values ([rowid], [zContent]) */
664: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
665: const char *zContent, int nContent){
666: sqlite3_stmt *s;
667: int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
668: if( rc!=SQLITE_OK ) return rc;
669:
670: rc = sqlite3_bind_value(s, 1, rowid);
671: if( rc!=SQLITE_OK ) return rc;
672:
673: rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC);
674: if( rc!=SQLITE_OK ) return rc;
675:
676: return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
677: }
678:
679: /* select content from %_content where rowid = [iRow]
680: * The caller must delete the returned string. */
681: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
682: char **pzContent){
683: sqlite3_stmt *s;
684: int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
685: if( rc!=SQLITE_OK ) return rc;
686:
687: rc = sqlite3_bind_int64(s, 1, iRow);
688: if( rc!=SQLITE_OK ) return rc;
689:
690: rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
691: if( rc!=SQLITE_ROW ) return rc;
692:
693: *pzContent = string_dup((const char *)sqlite3_column_text(s, 0));
694:
695: /* We expect only one row. We must execute another sqlite3_step()
696: * to complete the iteration; otherwise the table will remain locked. */
697: rc = sqlite3_step(s);
698: if( rc==SQLITE_DONE ) return SQLITE_OK;
699:
700: free(*pzContent);
701: return rc;
702: }
703:
704: /* delete from %_content where rowid = [iRow ] */
705: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
706: sqlite3_stmt *s;
707: int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
708: if( rc!=SQLITE_OK ) return rc;
709:
710: rc = sqlite3_bind_int64(s, 1, iRow);
711: if( rc!=SQLITE_OK ) return rc;
712:
713: return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
714: }
715:
716: /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst]
717: * If found, returns SQLITE_OK; the caller must free the returned doclist.
718: * If no rows found, returns SQLITE_ERROR. */
719: static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm,
720: sqlite_int64 iFirst,
721: sqlite_int64 *rowid,
722: DocList *out){
723: sqlite3_stmt *s;
724: int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
725: if( rc!=SQLITE_OK ) return rc;
726:
727: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT);
728: if( rc!=SQLITE_OK ) return rc;
729:
730: rc = sqlite3_bind_int64(s, 2, iFirst);
731: if( rc!=SQLITE_OK ) return rc;
732:
733: rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
734: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
735:
736: *rowid = sqlite3_column_int64(s, 0);
737: docListInit(out, DL_POSITIONS_OFFSETS,
738: sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
739:
740: /* We expect only one row. We must execute another sqlite3_step()
741: * to complete the iteration; otherwise the table will remain locked. */
742: rc = sqlite3_step(s);
743: return rc==SQLITE_DONE ? SQLITE_OK : rc;
744: }
745:
746: /* select max(first) from %_term where term = [zTerm] and first <= [iFirst]
747: * If found, returns SQLITE_ROW and result in *piResult; if the query returns
748: * NULL (meaning no row found) returns SQLITE_DONE.
749: */
750: static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm,
751: sqlite_int64 iFirst, sqlite_int64 *piResult){
752: sqlite3_stmt *s;
753: int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s);
754: if( rc!=SQLITE_OK ) return rc;
755:
756: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
757: if( rc!=SQLITE_OK ) return rc;
758:
759: rc = sqlite3_bind_int64(s, 2, iFirst);
760: if( rc!=SQLITE_OK ) return rc;
761:
762: rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s);
763: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
764:
765: switch( sqlite3_column_type(s, 0) ){
766: case SQLITE_NULL:
767: rc = SQLITE_DONE;
768: break;
769: case SQLITE_INTEGER:
770: *piResult = sqlite3_column_int64(s, 0);
771: break;
772: default:
773: return SQLITE_ERROR;
774: }
775: /* We expect only one row. We must execute another sqlite3_step()
776: * to complete the iteration; otherwise the table will remain locked. */
777: if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR;
778: return rc;
779: }
780:
781: /* insert into %_term (term, first, doclist)
782: values ([zTerm], [iFirst], [doclist]) */
783: static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm,
784: sqlite_int64 iFirst, DocList *doclist){
785: sqlite3_stmt *s;
786: int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
787: if( rc!=SQLITE_OK ) return rc;
788:
789: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
790: if( rc!=SQLITE_OK ) return rc;
791:
792: rc = sqlite3_bind_int64(s, 2, iFirst);
793: if( rc!=SQLITE_OK ) return rc;
794:
795: rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC);
796: if( rc!=SQLITE_OK ) return rc;
797:
798: return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
799: }
800:
801: /* update %_term set doclist = [doclist] where rowid = [rowid] */
802: static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
803: DocList *doclist){
804: sqlite3_stmt *s;
805: int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
806: if( rc!=SQLITE_OK ) return rc;
807:
808: rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData,
809: SQLITE_STATIC);
810: if( rc!=SQLITE_OK ) return rc;
811:
812: rc = sqlite3_bind_int64(s, 2, rowid);
813: if( rc!=SQLITE_OK ) return rc;
814:
815: return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
816: }
817:
818: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
819: sqlite3_stmt *s;
820: int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
821: if( rc!=SQLITE_OK ) return rc;
822:
823: rc = sqlite3_bind_int64(s, 1, rowid);
824: if( rc!=SQLITE_OK ) return rc;
825:
826: return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
827: }
828:
829: static void fulltext_vtab_destroy(fulltext_vtab *v){
830: int iStmt;
831:
832: for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
833: if( v->pFulltextStatements[iStmt]!=NULL ){
834: sqlite3_finalize(v->pFulltextStatements[iStmt]);
835: v->pFulltextStatements[iStmt] = NULL;
836: }
837: }
838:
839: if( v->pTokenizer!=NULL ){
840: v->pTokenizer->pModule->xDestroy(v->pTokenizer);
841: v->pTokenizer = NULL;
842: }
843:
844: free((void *) v->zName);
845: free(v);
846: }
847:
848: /* Current interface:
849: ** argv[0] - module name
850: ** argv[1] - database name
851: ** argv[2] - table name
852: ** argv[3] - tokenizer name (optional, a sensible default is provided)
853: ** argv[4..] - passed to tokenizer (optional based on tokenizer)
854: **/
855: static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv,
856: sqlite3_vtab **ppVTab){
857: int rc;
858: fulltext_vtab *v;
859: sqlite3_tokenizer_module *m = NULL;
860:
861: assert( argc>=3 );
862: v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
863: /* sqlite will initialize v->base */
864: v->db = db;
865: v->zName = string_dup(argv[2]);
866: v->pTokenizer = NULL;
867:
868: if( argc==3 ){
869: get_simple_tokenizer_module(&m);
870: } else {
871: /* TODO(shess) For now, add new tokenizers as else if clauses. */
872: if( !strcmp(argv[3], "simple") ){
873: get_simple_tokenizer_module(&m);
874: } else {
875: assert( "unrecognized tokenizer"==NULL );
876: }
877: }
878:
879: /* TODO(shess) Since tokenization impacts the index, the parameters
880: ** to the tokenizer need to be identical when a persistent virtual
881: ** table is re-created. One solution would be a meta-table to track
882: ** such information in the database. Then we could verify that the
883: ** information is identical on subsequent creates.
884: */
885: /* TODO(shess) Why isn't argv already (const char **)? */
886: rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
887: if( rc!=SQLITE_OK ) return rc;
888: v->pTokenizer->pModule = m;
889:
890: /* TODO: verify the existence of backing tables foo_content, foo_term */
891:
892: rc = sqlite3_declare_vtab(db, "create table x(content text)");
893: if( rc!=SQLITE_OK ) return rc;
894:
895: memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
896:
897: *ppVTab = &v->base;
898: return SQLITE_OK;
899: }
900:
901: static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv,
902: sqlite3_vtab **ppVTab){
903: int rc;
904: assert( argc>=3 );
905:
906: /* The %_content table holds the text of each full-text item, with
907: ** the rowid used as the docid.
908: **
909: ** The %_term table maps each term to a document list blob
910: ** containing elements sorted by ascending docid, each element
911: ** encoded as:
912: **
913: ** docid varint-encoded
914: ** token count varint-encoded
915: ** "count" token elements (poslist):
916: ** position varint-encoded as delta from previous position
917: ** start offset varint-encoded as delta from previous start offset
918: ** end offset varint-encoded as delta from start offset
919: **
920: ** Additionally, doclist blobs can be chunked into multiple rows,
921: ** using "first" to order the blobs. "first" is simply the first
922: ** docid in the blob.
923: */
924: /*
925: ** NOTE(shess) That last sentence is incorrect in the face of
926: ** deletion, which can leave a doclist that doesn't contain the
927: ** first from that row. I _believe_ this does not matter to the
928: ** operation of the system, but it might be reasonable to update
929: ** appropriately in case this assumption becomes more important.
930: */
931: rc = sql_exec(db, argv[2],
932: "create table %_content(content text);"
933: "create table %_term(term text, first integer, doclist blob);"
934: "create index %_index on %_term(term, first)");
935: if( rc!=SQLITE_OK ) return rc;
936:
937: return fulltextConnect(db, pAux, argc, argv, ppVTab);
938: }
939:
940: /* Decide how to handle an SQL query.
941: * At the moment, MATCH queries can include implicit boolean ANDs; we
942: * haven't implemented phrase searches or OR yet. */
943: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
944: int i;
945:
946: for(i=0; i<pInfo->nConstraint; ++i){
947: const struct sqlite3_index_constraint *pConstraint;
948: pConstraint = &pInfo->aConstraint[i];
949: if( pConstraint->iColumn==0 &&
950: pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH &&
951: pConstraint->usable ){ /* a full-text search */
952: pInfo->aConstraintUsage[i].argvIndex = 1;
953: pInfo->aConstraintUsage[i].omit = 1;
954: pInfo->idxNum = QUERY_FULLTEXT;
955: pInfo->estimatedCost = 1.0; /* an arbitrary value for now */
956: return SQLITE_OK;
957: }
958: }
959: pInfo->idxNum = QUERY_GENERIC;
960: return SQLITE_OK;
961: }
962:
963: static int fulltextDisconnect(sqlite3_vtab *pVTab){
964: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
965: return SQLITE_OK;
966: }
967:
968: static int fulltextDestroy(sqlite3_vtab *pVTab){
969: fulltext_vtab *v = (fulltext_vtab *)pVTab;
970:
971: int rc = sql_exec(v->db, v->zName,
972: "drop table %_content; drop table %_term");
973: if( rc!=SQLITE_OK ) return rc;
974:
975: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
976: return SQLITE_OK;
977: }
978:
979: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
980: fulltext_cursor *c;
981:
982: c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
983: /* sqlite will initialize c->base */
984: *ppCursor = &c->base;
985:
986: return SQLITE_OK;
987: }
988:
989: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
990: fulltext_cursor *c = (fulltext_cursor *) pCursor;
991: sqlite3_finalize(c->pStmt);
992: if( c->result.pDoclist!=NULL ){
993: docListDelete(c->result.pDoclist);
994: }
995: free(c);
996: return SQLITE_OK;
997: }
998:
999: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
1000: fulltext_cursor *c = (fulltext_cursor *) pCursor;
1001: sqlite_int64 iDocid;
1002: int rc;
1003:
1004: switch( c->iCursorType ){
1005: case QUERY_GENERIC:
1006: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
1007: rc = sqlite3_step(c->pStmt);
1008: switch( rc ){
1009: case SQLITE_ROW:
1010: c->eof = 0;
1011: return SQLITE_OK;
1012: case SQLITE_DONE:
1013: c->eof = 1;
1014: return SQLITE_OK;
1015: default:
1016: c->eof = 1;
1017: return rc;
1018: }
1019: case QUERY_FULLTEXT:
1020: rc = sqlite3_reset(c->pStmt);
1021: if( rc!=SQLITE_OK ) return rc;
1022:
1023: if( readerAtEnd(&c->result)){
1024: c->eof = 1;
1025: return SQLITE_OK;
1026: }
1027: iDocid = readDocid(&c->result);
1028: rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
1029: if( rc!=SQLITE_OK ) return rc;
1030: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
1031: rc = sqlite3_step(c->pStmt);
1032: if( rc==SQLITE_ROW ){ /* the case we expect */
1033: c->eof = 0;
1034: return SQLITE_OK;
1035: }
1036: /* an error occurred; abort */
1037: return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
1038: default:
1039: assert( 0 );
1040: return SQLITE_ERROR; /* not reached */
1041: }
1042: }
1043:
1044: static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm,
1045: sqlite3_stmt **ppStmt){
1046: int rc;
1047: if( *ppStmt ){
1048: rc = sqlite3_reset(*ppStmt);
1049: } else {
1050: rc = sql_prepare(v->db, v->zName, ppStmt,
1051: "select doclist from %_term where term = ? order by first");
1052: }
1053: if( rc!=SQLITE_OK ) return rc;
1054:
1055: rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT);
1056: if( rc!=SQLITE_OK ) return rc;
1057:
1058: return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */
1059: }
1060:
1061: /* Read the posting list for [zTerm]; AND it with the doclist [in] to
1062: * produce the doclist [out], using the given offset [iOffset] for phrase
1063: * matching.
1064: * (*pSelect) is used to hold an SQLite statement used inside this function;
1065: * the caller should initialize *pSelect to NULL before the first call.
1066: */
1067: static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect,
1068: const char *zTerm,
1069: DocList *pIn, int iOffset, DocList *out){
1070: int rc;
1071: DocListMerge merge;
1072:
1073: if( pIn!=NULL && !pIn->nData ){
1074: /* If [pIn] is already empty, there's no point in reading the
1075: * posting list to AND it in; return immediately. */
1076: return SQLITE_OK;
1077: }
1078:
1079: rc = term_select_doclist(v, zTerm, -1, pSelect);
1080: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
1081:
1082: mergeInit(&merge, pIn, iOffset, out);
1083: while( rc==SQLITE_ROW ){
1084: DocList block;
1085: docListInit(&block, DL_POSITIONS_OFFSETS,
1086: sqlite3_column_blob(*pSelect, 0),
1087: sqlite3_column_bytes(*pSelect, 0));
1088: mergeBlock(&merge, &block);
1089: docListDestroy(&block);
1090:
1091: rc = sqlite3_step(*pSelect);
1092: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){
1093: return rc;
1094: }
1095: }
1096:
1097: return SQLITE_OK;
1098: }
1099:
1100: typedef struct QueryTerm {
1101: int is_phrase; /* true if this term begins a new phrase */
1102: const char *zTerm;
1103: } QueryTerm;
1104:
1105: /* A parsed query.
1106: *
1107: * As an example, parsing the query ["four score" years "new nation"] will
1108: * yield a Query with 5 terms:
1109: * "four", is_phrase = 1
1110: * "score", is_phrase = 0
1111: * "years", is_phrase = 1
1112: * "new", is_phrase = 1
1113: * "nation", is_phrase = 0
1114: */
1115: typedef struct Query {
1116: int nTerms;
1117: QueryTerm *pTerm;
1118: } Query;
1119:
1120: static void query_add(Query *q, int is_phrase, const char *zTerm){
1121: QueryTerm *t;
1122: ++q->nTerms;
1123: q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0]));
1124: t = &q->pTerm[q->nTerms - 1];
1125: t->is_phrase = is_phrase;
1126: t->zTerm = zTerm;
1127: }
1128:
1129: static void query_free(Query *q){
1130: int i;
1131: for(i = 0; i < q->nTerms; ++i){
1132: free((void *) q->pTerm[i].zTerm);
1133: }
1134: free(q->pTerm);
1135: }
1136:
1137: static int tokenize_segment(sqlite3_tokenizer *pTokenizer,
1138: const char *zQuery, int in_phrase,
1139: Query *pQuery){
1140: sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
1141: sqlite3_tokenizer_cursor *pCursor;
1142: int is_first = 1;
1143:
1144: int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor);
1145: if( rc!=SQLITE_OK ) return rc;
1146: pCursor->pTokenizer = pTokenizer;
1147:
1148: while( 1 ){
1149: const char *zToken;
1150: int nToken, iStartOffset, iEndOffset, dummy_pos;
1151:
1152: rc = pModule->xNext(pCursor,
1153: &zToken, &nToken,
1154: &iStartOffset, &iEndOffset,
1155: &dummy_pos);
1156: if( rc!=SQLITE_OK ) break;
1157: query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken));
1158: is_first = 0;
1159: }
1160:
1161: return pModule->xClose(pCursor);
1162: }
1163:
1164: /* Parse a query string, yielding a Query object. */
1165: static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){
1166: char *zQuery1 = string_dup(zQuery);
1167: int in_phrase = 0;
1168: char *s = zQuery1;
1169: pQuery->nTerms = 0;
1170: pQuery->pTerm = NULL;
1171:
1172: while( *s ){
1173: char *t = s;
1174: while( *t ){
1175: if( *t=='"' ){
1176: *t++ = '\0';
1177: break;
1178: }
1179: ++t;
1180: }
1181: if( *s ){
1182: tokenize_segment(v->pTokenizer, s, in_phrase, pQuery);
1183: }
1184: s = t;
1185: in_phrase = !in_phrase;
1186: }
1187:
1188: free(zQuery1);
1189: return SQLITE_OK;
1190: }
1191:
1192: /* Perform a full-text query; return a list of documents in [pResult]. */
1193: static int fulltext_query(fulltext_vtab *v, const char *zQuery,
1194: DocList **pResult){
1195: Query q;
1196: int phrase_start = -1;
1197: int i;
1198: sqlite3_stmt *pSelect = NULL;
1199: DocList *d = NULL;
1200:
1201: int rc = parse_query(v, zQuery, &q);
1202: if( rc!=SQLITE_OK ) return rc;
1203:
1204: /* Merge terms. */
1205: for(i = 0 ; i < q.nTerms ; ++i){
1206: /* In each merge step, we need to generate positions whenever we're
1207: * processing a phrase which hasn't ended yet. */
1208: int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase;
1209: DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS);
1210: if( q.pTerm[i].is_phrase ){
1211: phrase_start = i;
1212: }
1213: rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next);
1214: if( rc!=SQLITE_OK ) break;
1215: if( d!=NULL ){
1216: docListDelete(d);
1217: }
1218: d = next;
1219: }
1220:
1221: sqlite3_finalize(pSelect);
1222: query_free(&q);
1223: *pResult = d;
1224: return rc;
1225: }
1226:
1227: static int fulltextFilter(sqlite3_vtab_cursor *pCursor,
1228: int idxNum, const char *idxStr,
1229: int argc, sqlite3_value **argv){
1230: fulltext_cursor *c = (fulltext_cursor *) pCursor;
1231: fulltext_vtab *v = cursor_vtab(c);
1232: int rc;
1233: const char *zStatement;
1234:
1235: c->iCursorType = idxNum;
1236: switch( idxNum ){
1237: case QUERY_GENERIC:
1238: zStatement = "select rowid, content from %_content";
1239: break;
1240:
1241: case QUERY_FULLTEXT: /* full-text search */
1242: {
1243: const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
1244: DocList *pResult;
1245: assert( argc==1 );
1246: rc = fulltext_query(v, zQuery, &pResult);
1247: if( rc!=SQLITE_OK ) return rc;
1248: readerInit(&c->result, pResult);
1249: zStatement = "select rowid, content from %_content where rowid = ?";
1250: break;
1251: }
1252:
1253: default:
1254: assert( 0 );
1255: }
1256:
1257: rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement);
1258: if( rc!=SQLITE_OK ) return rc;
1259:
1260: return fulltextNext(pCursor);
1261: }
1262:
1263: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
1264: fulltext_cursor *c = (fulltext_cursor *) pCursor;
1265: return c->eof;
1266: }
1267:
1268: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
1269: sqlite3_context *pContext, int idxCol){
1270: fulltext_cursor *c = (fulltext_cursor *) pCursor;
1271: const char *s;
1272:
1273: assert( idxCol==0 );
1274: s = (const char *) sqlite3_column_text(c->pStmt, 1);
1275: sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
1276:
1277: return SQLITE_OK;
1278: }
1279:
1280: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
1281: fulltext_cursor *c = (fulltext_cursor *) pCursor;
1282:
1283: *pRowid = sqlite3_column_int64(c->pStmt, 0);
1284: return SQLITE_OK;
1285: }
1286:
1287: /* Build a hash table containing all terms in zText. */
1288: static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer,
1289: const char *zText, sqlite_int64 iDocid){
1290: sqlite3_tokenizer_cursor *pCursor;
1291: const char *pToken;
1292: int nTokenBytes;
1293: int iStartOffset, iEndOffset, iPosition;
1294:
1295: int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
1296: if( rc!=SQLITE_OK ) return rc;
1297:
1298: pCursor->pTokenizer = pTokenizer;
1299: HashInit(terms, HASH_STRING, 1);
1300: while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
1301: &pToken, &nTokenBytes,
1302: &iStartOffset, &iEndOffset,
1303: &iPosition) ){
1304: DocList *p;
1305:
1306: /* Positions can't be negative; we use -1 as a terminator internally. */
1307: if( iPosition<0 ) {
1308: rc = SQLITE_ERROR;
1309: goto err;
1310: }
1311:
1312: p = HashFind(terms, pToken, nTokenBytes);
1313: if( p==NULL ){
1314: p = docListNew(DL_POSITIONS_OFFSETS);
1315: docListAddDocid(p, iDocid);
1316: HashInsert(terms, pToken, nTokenBytes, p);
1317: }
1318: docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset);
1319: }
1320:
1321: err:
1322: /* TODO(shess) Check return? Should this be able to cause errors at
1323: ** this point? Actually, same question about sqlite3_finalize(),
1324: ** though one could argue that failure there means that the data is
1325: ** not durable. *ponder*
1326: */
1327: pTokenizer->pModule->xClose(pCursor);
1328: return rc;
1329: }
1330: /* Update the %_terms table to map the term [zTerm] to the given rowid. */
1331: static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
1332: sqlite_int64 iDocid, DocList *p){
1333: sqlite_int64 iFirst;
1334: sqlite_int64 iIndexRow;
1335: DocList doclist;
1336:
1337: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
1338: if( rc==SQLITE_DONE ){
1339: docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0);
1340: if( docListUpdate(&doclist, iDocid, p) ){
1341: rc = term_insert(v, zTerm, nTerm, iDocid, &doclist);
1342: docListDestroy(&doclist);
1343: return rc;
1344: }
1345: return SQLITE_OK;
1346: }
1347: if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
1348:
1349: /* This word is in the index; add this document ID to its blob. */
1350:
1351: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
1352: if( rc!=SQLITE_OK ) return rc;
1353:
1354: if( docListUpdate(&doclist, iDocid, p) ){
1355: /* If the blob is too big, split it in half. */
1356: if( doclist.nData>CHUNK_MAX ){
1357: DocList half;
1358: if( docListSplit(&doclist, &half) ){
1359: rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half);
1360: docListDestroy(&half);
1361: if( rc!=SQLITE_OK ) goto err;
1362: }
1363: }
1364: rc = term_update(v, iIndexRow, &doclist);
1365: }
1366:
1367: err:
1368: docListDestroy(&doclist);
1369: return rc;
1370: }
1371:
1372: /* Insert a row into the full-text index; set *piRowid to be the ID of the
1373: * new row. */
1374: static int index_insert(fulltext_vtab *v,
1375: sqlite3_value *pRequestRowid, const char *zText,
1376: sqlite_int64 *piRowid){
1377: Hash terms; /* maps term string -> PosList */
1378: HashElem *e;
1379:
1380: int rc = content_insert(v, pRequestRowid, zText, -1);
1381: if( rc!=SQLITE_OK ) return rc;
1382: *piRowid = sqlite3_last_insert_rowid(v->db);
1383:
1384: if( !zText ) return SQLITE_OK; /* nothing to index */
1385:
1386: rc = build_terms(&terms, v->pTokenizer, zText, *piRowid);
1387: if( rc!=SQLITE_OK ) return rc;
1388:
1389: for(e=HashFirst(&terms); e; e=HashNext(e)){
1390: DocList *p = HashData(e);
1391: rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p);
1392: if( rc!=SQLITE_OK ) break;
1393: }
1394:
1395: for(e=HashFirst(&terms); e; e=HashNext(e)){
1396: DocList *p = HashData(e);
1397: docListDelete(p);
1398: }
1399: HashClear(&terms);
1400: return rc;
1401: }
1402:
1403: static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm,
1404: sqlite_int64 iDocid){
1405: sqlite_int64 iFirst;
1406: sqlite_int64 iIndexRow;
1407: DocList doclist;
1408:
1409: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
1410: if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
1411:
1412: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
1413: if( rc!=SQLITE_OK ) return rc;
1414:
1415: if( docListUpdate(&doclist, iDocid, NULL) ){
1416: if( doclist.nData>0 ){
1417: rc = term_update(v, iIndexRow, &doclist);
1418: } else { /* empty posting list */
1419: rc = term_delete(v, iIndexRow);
1420: }
1421: }
1422: docListDestroy(&doclist);
1423: return rc;
1424: }
1425:
1426: /* Delete a row from the full-text index. */
1427: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
1428: char *zText;
1429: Hash terms;
1430: HashElem *e;
1431:
1432: int rc = content_select(v, iRow, &zText);
1433: if( rc!=SQLITE_OK ) return rc;
1434:
1435: rc = build_terms(&terms, v->pTokenizer, zText, iRow);
1436: free(zText);
1437: if( rc!=SQLITE_OK ) return rc;
1438:
1439: for(e=HashFirst(&terms); e; e=HashNext(e)){
1440: rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow);
1441: if( rc!=SQLITE_OK ) break;
1442: }
1443: for(e=HashFirst(&terms); e; e=HashNext(e)){
1444: DocList *p = HashData(e);
1445: docListDelete(p);
1446: }
1447: HashClear(&terms);
1448:
1449: return content_delete(v, iRow);
1450: }
1451:
1452: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
1453: sqlite_int64 *pRowid){
1454: fulltext_vtab *v = (fulltext_vtab *) pVtab;
1455:
1456: if( nArg<2 ){
1457: return index_delete(v, sqlite3_value_int64(ppArg[0]));
1458: }
1459:
1460: if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
1461: return SQLITE_ERROR; /* an update; not yet supported */
1462: }
1463:
1464: assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */
1465: return index_insert(v, ppArg[1],
1466: (const char *)sqlite3_value_text(ppArg[2]), pRowid);
1467: }
1468:
1469: static sqlite3_module fulltextModule = {
1470: 0,
1471: fulltextCreate,
1472: fulltextConnect,
1473: fulltextBestIndex,
1474: fulltextDisconnect,
1475: fulltextDestroy,
1476: fulltextOpen,
1477: fulltextClose,
1478: fulltextFilter,
1479: fulltextNext,
1480: fulltextEof,
1481: fulltextColumn,
1482: fulltextRowid,
1483: fulltextUpdate
1484: };
1485:
1486: int fulltext_init(sqlite3 *db){
1487: return sqlite3_create_module(db, "fulltext", &fulltextModule, 0);
1488: }
1489:
1490: #if !SQLITE_CORE
1491: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
1492: const sqlite3_api_routines *pApi){
1493: SQLITE_EXTENSION_INIT2(pApi)
1494: return fulltext_init(db);
1495: }
1496: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>