embedaddon/sqlite3/ext/fts1/fulltext.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1 / fulltext.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (13 years, 4 months ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD

sqlite3

1: /* The author disclaims copyright to this source code. 2: * 3: * This is an SQLite module implementing full-text search. 4: */ 5: 6: #include <assert.h> 7: #if !defined(__APPLE__) 8: #include <malloc.h> 9: #else 10: #include <stdlib.h> 11: #endif 12: #include <stdio.h> 13: #include <string.h> 14: #include <ctype.h> 15: 16: #include "fulltext.h" 17: #include "ft_hash.h" 18: #include "tokenizer.h" 19: #include "sqlite3.h" 20: #include "sqlite3ext.h" 21: SQLITE_EXTENSION_INIT1 22: 23: /* utility functions */ 24: 25: /* We encode variable-length integers in little-endian order using seven bits 26: * per byte as follows: 27: ** 28: ** KEY: 29: ** A = 0xxxxxxx 7 bits of data and one flag bit 30: ** B = 1xxxxxxx 7 bits of data and one flag bit 31: ** 32: ** 7 bits - A 33: ** 14 bits - BA 34: ** 21 bits - BBA 35: ** and so on. 36: */ 37: 38: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ 39: #define VARINT_MAX 10 40: 41: /* Write a 64-bit variable-length integer to memory starting at p[0]. 42: * The length of data written will be between 1 and VARINT_MAX bytes. 43: * The number of bytes written is returned. */ 44: static int putVarint(char *p, sqlite_int64 v){ 45: unsigned char *q = (unsigned char *) p; 46: sqlite_uint64 vu = v; 47: do{ 48: *q++ = (unsigned char) ((vu & 0x7f) | 0x80); 49: vu >>= 7; 50: }while( vu!=0 ); 51: q[-1] &= 0x7f; /* turn off high bit in final byte */ 52: assert( q - (unsigned char *)p <= VARINT_MAX ); 53: return (int) (q - (unsigned char *)p); 54: } 55: 56: /* Read a 64-bit variable-length integer from memory starting at p[0]. 57: * Return the number of bytes read, or 0 on error. 58: * The value is stored in *v. */ 59: static int getVarint(const char *p, sqlite_int64 *v){ 60: const unsigned char *q = (const unsigned char *) p; 61: sqlite_uint64 x = 0, y = 1; 62: while( (*q & 0x80) == 0x80 ){ 63: x += y * (*q++ & 0x7f); 64: y <<= 7; 65: if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ 66: assert( 0 ); 67: return 0; 68: } 69: } 70: x += y * (*q++); 71: *v = (sqlite_int64) x; 72: return (int) (q - (unsigned char *)p); 73: } 74: 75: static int getVarint32(const char *p, int *pi){ 76: sqlite_int64 i; 77: int ret = getVarint(p, &i); 78: *pi = (int) i; 79: assert( *pi==i ); 80: return ret; 81: } 82: 83: /*** Document lists *** 84: * 85: * A document list holds a sorted list of varint-encoded document IDs. 86: * 87: * A doclist with type DL_POSITIONS_OFFSETS is stored like this: 88: * 89: * array { 90: * varint docid; 91: * array { 92: * varint position; (delta from previous position plus 1, or 0 for end) 93: * varint startOffset; (delta from previous startOffset) 94: * varint endOffset; (delta from startOffset) 95: * } 96: * } 97: * 98: * Here, array { X } means zero or more occurrences of X, adjacent in memory. 99: * 100: * A doclist with type DL_POSITIONS is like the above, but holds only docids 101: * and positions without offset information. 102: * 103: * A doclist with type DL_DOCIDS is like the above, but holds only docids 104: * without positions or offset information. 105: * 106: * On disk, every document list has positions and offsets, so we don't bother 107: * to serialize a doclist's type. 108: * 109: * We don't yet delta-encode document IDs; doing so will probably be a 110: * modest win. 111: * 112: * NOTE(shess) I've thought of a slightly (1%) better offset encoding. 113: * After the first offset, estimate the next offset by using the 114: * current token position and the previous token position and offset, 115: * offset to handle some variance. So the estimate would be 116: * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded 117: * as normal. Offsets more than 64 chars from the estimate are 118: * encoded as the delta to the previous start offset + 128. An 119: * additional tiny increment can be gained by using the end offset of 120: * the previous token to make the estimate a tiny bit more precise. 121: */ 122: 123: typedef enum DocListType { 124: DL_DOCIDS, /* docids only */ 125: DL_POSITIONS, /* docids + positions */ 126: DL_POSITIONS_OFFSETS /* docids + positions + offsets */ 127: } DocListType; 128: 129: typedef struct DocList { 130: char *pData; 131: int nData; 132: DocListType iType; 133: int iLastPos; /* the last position written */ 134: int iLastOffset; /* the last start offset written */ 135: } DocList; 136: 137: /* Initialize a new DocList to hold the given data. */ 138: static void docListInit(DocList *d, DocListType iType, 139: const char *pData, int nData){ 140: d->nData = nData; 141: if( nData>0 ){ 142: d->pData = malloc(nData); 143: memcpy(d->pData, pData, nData); 144: } else { 145: d->pData = NULL; 146: } 147: d->iType = iType; 148: d->iLastPos = 0; 149: d->iLastOffset = 0; 150: } 151: 152: /* Create a new dynamically-allocated DocList. */ 153: static DocList *docListNew(DocListType iType){ 154: DocList *d = (DocList *) malloc(sizeof(DocList)); 155: docListInit(d, iType, 0, 0); 156: return d; 157: } 158: 159: static void docListDestroy(DocList *d){ 160: free(d->pData); 161: #ifndef NDEBUG 162: memset(d, 0x55, sizeof(*d)); 163: #endif 164: } 165: 166: static void docListDelete(DocList *d){ 167: docListDestroy(d); 168: free(d); 169: } 170: 171: static char *docListEnd(DocList *d){ 172: return d->pData + d->nData; 173: } 174: 175: /* Append a varint to a DocList's data. */ 176: static void appendVarint(DocList *d, sqlite_int64 i){ 177: char c[VARINT_MAX]; 178: int n = putVarint(c, i); 179: d->pData = realloc(d->pData, d->nData + n); 180: memcpy(d->pData + d->nData, c, n); 181: d->nData += n; 182: } 183: 184: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ 185: appendVarint(d, iDocid); 186: d->iLastPos = 0; 187: } 188: 189: /* Add a position to the last position list in a doclist. */ 190: static void docListAddPos(DocList *d, int iPos){ 191: assert( d->iType>=DL_POSITIONS ); 192: appendVarint(d, iPos-d->iLastPos+1); 193: d->iLastPos = iPos; 194: } 195: 196: static void docListAddPosOffset(DocList *d, int iPos, 197: int iStartOffset, int iEndOffset){ 198: assert( d->iType==DL_POSITIONS_OFFSETS ); 199: docListAddPos(d, iPos); 200: appendVarint(d, iStartOffset-d->iLastOffset); 201: d->iLastOffset = iStartOffset; 202: appendVarint(d, iEndOffset-iStartOffset); 203: } 204: 205: /* Terminate the last position list in the given doclist. */ 206: static void docListAddEndPos(DocList *d){ 207: appendVarint(d, 0); 208: } 209: 210: typedef struct DocListReader { 211: DocList *pDoclist; 212: char *p; 213: int iLastPos; /* the last position read */ 214: } DocListReader; 215: 216: static void readerInit(DocListReader *r, DocList *pDoclist){ 217: r->pDoclist = pDoclist; 218: if( pDoclist!=NULL ){ 219: r->p = pDoclist->pData; 220: } 221: r->iLastPos = 0; 222: } 223: 224: static int readerAtEnd(DocListReader *pReader){ 225: return pReader->p >= docListEnd(pReader->pDoclist); 226: } 227: 228: /* Peek at the next docid without advancing the read pointer. */ 229: static sqlite_int64 peekDocid(DocListReader *pReader){ 230: sqlite_int64 ret; 231: assert( !readerAtEnd(pReader) ); 232: getVarint(pReader->p, &ret); 233: return ret; 234: } 235: 236: /* Read the next docid. */ 237: static sqlite_int64 readDocid(DocListReader *pReader){ 238: sqlite_int64 ret; 239: assert( !readerAtEnd(pReader) ); 240: pReader->p += getVarint(pReader->p, &ret); 241: pReader->iLastPos = 0; 242: return ret; 243: } 244: 245: /* Read the next position from a position list. 246: * Returns the position, or -1 at the end of the list. */ 247: static int readPosition(DocListReader *pReader){ 248: int i; 249: int iType = pReader->pDoclist->iType; 250: assert( iType>=DL_POSITIONS ); 251: assert( !readerAtEnd(pReader) ); 252: 253: pReader->p += getVarint32(pReader->p, &i); 254: if( i==0 ){ 255: pReader->iLastPos = -1; 256: return -1; 257: } 258: pReader->iLastPos += ((int) i)-1; 259: if( iType>=DL_POSITIONS_OFFSETS ){ 260: /* Skip over offsets, ignoring them for now. */ 261: int iStart, iEnd; 262: pReader->p += getVarint32(pReader->p, &iStart); 263: pReader->p += getVarint32(pReader->p, &iEnd); 264: } 265: return pReader->iLastPos; 266: } 267: 268: /* Skip past the end of a position list. */ 269: static void skipPositionList(DocListReader *pReader){ 270: while( readPosition(pReader)!=-1 ) 271: ; 272: } 273: 274: /* Skip over a docid, including its position list if the doclist has 275: * positions. */ 276: static void skipDocument(DocListReader *pReader){ 277: readDocid(pReader); 278: if( pReader->pDoclist->iType >= DL_POSITIONS ){ 279: skipPositionList(pReader); 280: } 281: } 282: 283: static sqlite_int64 firstDocid(DocList *d){ 284: DocListReader r; 285: readerInit(&r, d); 286: return readDocid(&r); 287: } 288: 289: /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; 290: * otherwise pUpdate, which must contain only the single docid [iDocid], is 291: * inserted (if not present) or updated (if already present). */ 292: static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ 293: int modified = 0; 294: DocListReader reader; 295: char *p; 296: 297: if( pUpdate!=NULL ){ 298: assert( d->iType==pUpdate->iType); 299: assert( iDocid==firstDocid(pUpdate) ); 300: } 301: 302: readerInit(&reader, d); 303: while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ 304: skipDocument(&reader); 305: } 306: 307: p = reader.p; 308: /* Delete if there is a matching element. */ 309: if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ 310: skipDocument(&reader); 311: memmove(p, reader.p, docListEnd(d) - reader.p); 312: d->nData -= (reader.p - p); 313: modified = 1; 314: } 315: 316: /* Insert if indicated. */ 317: if( pUpdate!=NULL ){ 318: int iDoclist = p-d->pData; 319: docListAddEndPos(pUpdate); 320: 321: d->pData = realloc(d->pData, d->nData+pUpdate->nData); 322: p = d->pData + iDoclist; 323: 324: memmove(p+pUpdate->nData, p, docListEnd(d) - p); 325: memcpy(p, pUpdate->pData, pUpdate->nData); 326: d->nData += pUpdate->nData; 327: modified = 1; 328: } 329: 330: return modified; 331: } 332: 333: /* Split the second half of doclist d into a separate doclist d2. Returns 1 334: * if successful, or 0 if d contains a single document and hence can't be 335: * split. */ 336: static int docListSplit(DocList *d, DocList *d2){ 337: const char *pSplitPoint = d->pData + d->nData / 2; 338: DocListReader reader; 339: 340: readerInit(&reader, d); 341: while( reader.p<pSplitPoint ){ 342: skipDocument(&reader); 343: } 344: if( readerAtEnd(&reader) ) return 0; 345: docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); 346: d->nData = reader.p - d->pData; 347: d->pData = realloc(d->pData, d->nData); 348: return 1; 349: } 350: 351: /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked 352: * on-disk doclist, resulting in another in-memory DocList [out]. [in] 353: * and [out] may or may not store position information according to the 354: * caller's wishes. The on-disk doclist always comes with positions. 355: * 356: * The caller must read each chunk of the on-disk doclist in succession and 357: * pass it to mergeBlock(). 358: * 359: * If [in] has positions, then the merge output contains only documents with 360: * matching positions in the two input doclists. If [in] does not have 361: * positions, then the merge output contains all documents common to the two 362: * input doclists. 363: * 364: * If [in] is NULL, then the on-disk doclist is copied to [out] directly. 365: * 366: * A merge is performed using an integer [iOffset] provided by the caller. 367: * [iOffset] is subtracted from each position in the on-disk doclist for the 368: * purpose of position comparison; this is helpful in implementing phrase 369: * searches. 370: * 371: * A DocListMerge is not yet able to propagate offsets through query 372: * processing; we should add that capability soon. 373: */ 374: typedef struct DocListMerge { 375: DocListReader in; 376: DocList *pOut; 377: int iOffset; 378: } DocListMerge; 379: 380: static void mergeInit(DocListMerge *m, 381: DocList *pIn, int iOffset, DocList *pOut){ 382: readerInit(&m->in, pIn); 383: m->pOut = pOut; 384: m->iOffset = iOffset; 385: 386: /* can't handle offsets yet */ 387: assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); 388: assert( pOut->iType <= DL_POSITIONS ); 389: } 390: 391: /* A helper function for mergeBlock(), below. Merge the position lists 392: * pointed to by m->in and pBlockReader. 393: * If the merge matches, write [iDocid] to m->pOut; if m->pOut 394: * has positions then write all matching positions as well. */ 395: static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, 396: DocListReader *pBlockReader){ 397: int block_pos = readPosition(pBlockReader); 398: int in_pos = readPosition(&m->in); 399: int match = 0; 400: while( block_pos!=-1 || in_pos!=-1 ){ 401: if( block_pos-m->iOffset==in_pos ){ 402: if( !match ){ 403: docListAddDocid(m->pOut, iDocid); 404: match = 1; 405: } 406: if( m->pOut->iType >= DL_POSITIONS ){ 407: docListAddPos(m->pOut, in_pos); 408: } 409: block_pos = readPosition(pBlockReader); 410: in_pos = readPosition(&m->in); 411: } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ 412: block_pos = readPosition(pBlockReader); 413: } else { 414: in_pos = readPosition(&m->in); 415: } 416: } 417: if( m->pOut->iType >= DL_POSITIONS && match ){ 418: docListAddEndPos(m->pOut); 419: } 420: } 421: 422: /* Merge one block of an on-disk doclist into a DocListMerge. */ 423: static void mergeBlock(DocListMerge *m, DocList *pBlock){ 424: DocListReader blockReader; 425: assert( pBlock->iType >= DL_POSITIONS ); 426: readerInit(&blockReader, pBlock); 427: while( !readerAtEnd(&blockReader) ){ 428: sqlite_int64 iDocid = readDocid(&blockReader); 429: if( m->in.pDoclist!=NULL ){ 430: while( 1 ){ 431: if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ 432: if( peekDocid(&m->in)>=iDocid ) break; 433: skipDocument(&m->in); 434: } 435: if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ 436: skipPositionList(&blockReader); /* skip this docid in the block */ 437: continue; 438: } 439: readDocid(&m->in); 440: } 441: /* We have a document match. */ 442: if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ 443: /* We don't need to do a poslist merge. */ 444: docListAddDocid(m->pOut, iDocid); 445: if( m->pOut->iType >= DL_POSITIONS ){ 446: /* Copy all positions to the output doclist. */ 447: while( 1 ){ 448: int pos = readPosition(&blockReader); 449: if( pos==-1 ) break; 450: docListAddPos(m->pOut, pos); 451: } 452: docListAddEndPos(m->pOut); 453: } else skipPositionList(&blockReader); 454: continue; 455: } 456: mergePosList(m, iDocid, &blockReader); 457: } 458: } 459: 460: static char *string_dup_n(const char *s, int n){ 461: char *str = malloc(n + 1); 462: memcpy(str, s, n); 463: str[n] = '\0'; 464: return str; 465: } 466: 467: /* Duplicate a string; the caller must free() the returned string. 468: * (We don't use strdup() since it's not part of the standard C library and 469: * may not be available everywhere.) */ 470: static char *string_dup(const char *s){ 471: return string_dup_n(s, strlen(s)); 472: } 473: 474: /* Format a string, replacing each occurrence of the % character with 475: * zName. This may be more convenient than sqlite_mprintf() 476: * when one string is used repeatedly in a format string. 477: * The caller must free() the returned string. */ 478: static char *string_format(const char *zFormat, const char *zName){ 479: const char *p; 480: size_t len = 0; 481: size_t nName = strlen(zName); 482: char *result; 483: char *r; 484: 485: /* first compute length needed */ 486: for(p = zFormat ; *p ; ++p){ 487: len += (*p=='%' ? nName : 1); 488: } 489: len += 1; /* for null terminator */ 490: 491: r = result = malloc(len); 492: for(p = zFormat; *p; ++p){ 493: if( *p=='%' ){ 494: memcpy(r, zName, nName); 495: r += nName; 496: } else { 497: *r++ = *p; 498: } 499: } 500: *r++ = '\0'; 501: assert( r == result + len ); 502: return result; 503: } 504: 505: static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ 506: char *zCommand = string_format(zFormat, zName); 507: int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); 508: free(zCommand); 509: return rc; 510: } 511: 512: static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, 513: const char *zFormat){ 514: char *zCommand = string_format(zFormat, zName); 515: int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); 516: free(zCommand); 517: return rc; 518: } 519: 520: /* end utility functions */ 521: 522: #define QUERY_GENERIC 0 523: #define QUERY_FULLTEXT 1 524: 525: #define CHUNK_MAX 1024 526: 527: typedef enum fulltext_statement { 528: CONTENT_INSERT_STMT, 529: CONTENT_SELECT_STMT, 530: CONTENT_DELETE_STMT, 531: 532: TERM_SELECT_STMT, 533: TERM_CHUNK_SELECT_STMT, 534: TERM_INSERT_STMT, 535: TERM_UPDATE_STMT, 536: TERM_DELETE_STMT, 537: 538: MAX_STMT /* Always at end! */ 539: } fulltext_statement; 540: 541: /* These must exactly match the enum above. */ 542: /* TODO(adam): Is there some risk that a statement (in particular, 543: ** pTermSelectStmt) will be used in two cursors at once, e.g. if a 544: ** query joins a virtual table to itself? If so perhaps we should 545: ** move some of these to the cursor object. 546: */ 547: static const char *fulltext_zStatement[MAX_STMT] = { 548: /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", 549: /* CONTENT_SELECT */ "select content from %_content where rowid = ?", 550: /* CONTENT_DELETE */ "delete from %_content where rowid = ?", 551: 552: /* TERM_SELECT */ 553: "select rowid, doclist from %_term where term = ? and first = ?", 554: /* TERM_CHUNK_SELECT */ 555: "select max(first) from %_term where term = ? and first <= ?", 556: /* TERM_INSERT */ 557: "insert into %_term (term, first, doclist) values (?, ?, ?)", 558: /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", 559: /* TERM_DELETE */ "delete from %_term where rowid = ?", 560: }; 561: 562: typedef struct fulltext_vtab { 563: sqlite3_vtab base; 564: sqlite3 *db; 565: const char *zName; /* virtual table name */ 566: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ 567: 568: /* Precompiled statements which we keep as long as the table is 569: ** open. 570: */ 571: sqlite3_stmt *pFulltextStatements[MAX_STMT]; 572: } fulltext_vtab; 573: 574: typedef struct fulltext_cursor { 575: sqlite3_vtab_cursor base; 576: int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ 577: 578: sqlite3_stmt *pStmt; 579: 580: int eof; 581: 582: /* The following is used only when iCursorType == QUERY_FULLTEXT. */ 583: DocListReader result; 584: } fulltext_cursor; 585: 586: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ 587: return (fulltext_vtab *) c->base.pVtab; 588: } 589: 590: static sqlite3_module fulltextModule; /* forward declaration */ 591: 592: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. 593: ** If the indicated statement has never been prepared, it is prepared 594: ** and cached, otherwise the cached version is reset. 595: */ 596: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, 597: sqlite3_stmt **ppStmt){ 598: assert( iStmt<MAX_STMT ); 599: if( v->pFulltextStatements[iStmt]==NULL ){ 600: int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], 601: fulltext_zStatement[iStmt]); 602: if( rc!=SQLITE_OK ) return rc; 603: } else { 604: int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); 605: if( rc!=SQLITE_OK ) return rc; 606: } 607: 608: *ppStmt = v->pFulltextStatements[iStmt]; 609: return SQLITE_OK; 610: } 611: 612: /* Step the indicated statement, handling errors SQLITE_BUSY (by 613: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring 614: ** bindings to the new statement). 615: ** TODO(adam): We should extend this function so that it can work with 616: ** statements declared locally, not only globally cached statements. 617: */ 618: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, 619: sqlite3_stmt **ppStmt){ 620: int rc; 621: sqlite3_stmt *s = *ppStmt; 622: assert( iStmt<MAX_STMT ); 623: assert( s==v->pFulltextStatements[iStmt] ); 624: 625: while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ 626: sqlite3_stmt *pNewStmt; 627: 628: if( rc==SQLITE_BUSY ) continue; 629: if( rc!=SQLITE_ERROR ) return rc; 630: 631: rc = sqlite3_reset(s); 632: if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; 633: 634: v->pFulltextStatements[iStmt] = NULL; /* Still in s */ 635: rc = sql_get_statement(v, iStmt, &pNewStmt); 636: if( rc!=SQLITE_OK ) goto err; 637: *ppStmt = pNewStmt; 638: 639: rc = sqlite3_transfer_bindings(s, pNewStmt); 640: if( rc!=SQLITE_OK ) goto err; 641: 642: rc = sqlite3_finalize(s); 643: if( rc!=SQLITE_OK ) return rc; 644: s = pNewStmt; 645: } 646: return rc; 647: 648: err: 649: sqlite3_finalize(s); 650: return rc; 651: } 652: 653: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. 654: ** Useful for statements like UPDATE, where we expect no results. 655: */ 656: static int sql_single_step_statement(fulltext_vtab *v, 657: fulltext_statement iStmt, 658: sqlite3_stmt **ppStmt){ 659: int rc = sql_step_statement(v, iStmt, ppStmt); 660: return (rc==SQLITE_DONE) ? SQLITE_OK : rc; 661: } 662: 663: /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ 664: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, 665: const char *zContent, int nContent){ 666: sqlite3_stmt *s; 667: int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); 668: if( rc!=SQLITE_OK ) return rc; 669: 670: rc = sqlite3_bind_value(s, 1, rowid); 671: if( rc!=SQLITE_OK ) return rc; 672: 673: rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); 674: if( rc!=SQLITE_OK ) return rc; 675: 676: return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); 677: } 678: 679: /* select content from %_content where rowid = [iRow] 680: * The caller must delete the returned string. */ 681: static int content_select(fulltext_vtab *v, sqlite_int64 iRow, 682: char **pzContent){ 683: sqlite3_stmt *s; 684: int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); 685: if( rc!=SQLITE_OK ) return rc; 686: 687: rc = sqlite3_bind_int64(s, 1, iRow); 688: if( rc!=SQLITE_OK ) return rc; 689: 690: rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); 691: if( rc!=SQLITE_ROW ) return rc; 692: 693: *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); 694: 695: /* We expect only one row. We must execute another sqlite3_step() 696: * to complete the iteration; otherwise the table will remain locked. */ 697: rc = sqlite3_step(s); 698: if( rc==SQLITE_DONE ) return SQLITE_OK; 699: 700: free(*pzContent); 701: return rc; 702: } 703: 704: /* delete from %_content where rowid = [iRow ] */ 705: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ 706: sqlite3_stmt *s; 707: int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); 708: if( rc!=SQLITE_OK ) return rc; 709: 710: rc = sqlite3_bind_int64(s, 1, iRow); 711: if( rc!=SQLITE_OK ) return rc; 712: 713: return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); 714: } 715: 716: /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] 717: * If found, returns SQLITE_OK; the caller must free the returned doclist. 718: * If no rows found, returns SQLITE_ERROR. */ 719: static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, 720: sqlite_int64 iFirst, 721: sqlite_int64 *rowid, 722: DocList *out){ 723: sqlite3_stmt *s; 724: int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); 725: if( rc!=SQLITE_OK ) return rc; 726: 727: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); 728: if( rc!=SQLITE_OK ) return rc; 729: 730: rc = sqlite3_bind_int64(s, 2, iFirst); 731: if( rc!=SQLITE_OK ) return rc; 732: 733: rc = sql_step_statement(v, TERM_SELECT_STMT, &s); 734: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 735: 736: *rowid = sqlite3_column_int64(s, 0); 737: docListInit(out, DL_POSITIONS_OFFSETS, 738: sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); 739: 740: /* We expect only one row. We must execute another sqlite3_step() 741: * to complete the iteration; otherwise the table will remain locked. */ 742: rc = sqlite3_step(s); 743: return rc==SQLITE_DONE ? SQLITE_OK : rc; 744: } 745: 746: /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] 747: * If found, returns SQLITE_ROW and result in *piResult; if the query returns 748: * NULL (meaning no row found) returns SQLITE_DONE. 749: */ 750: static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, 751: sqlite_int64 iFirst, sqlite_int64 *piResult){ 752: sqlite3_stmt *s; 753: int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); 754: if( rc!=SQLITE_OK ) return rc; 755: 756: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 757: if( rc!=SQLITE_OK ) return rc; 758: 759: rc = sqlite3_bind_int64(s, 2, iFirst); 760: if( rc!=SQLITE_OK ) return rc; 761: 762: rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); 763: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 764: 765: switch( sqlite3_column_type(s, 0) ){ 766: case SQLITE_NULL: 767: rc = SQLITE_DONE; 768: break; 769: case SQLITE_INTEGER: 770: *piResult = sqlite3_column_int64(s, 0); 771: break; 772: default: 773: return SQLITE_ERROR; 774: } 775: /* We expect only one row. We must execute another sqlite3_step() 776: * to complete the iteration; otherwise the table will remain locked. */ 777: if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; 778: return rc; 779: } 780: 781: /* insert into %_term (term, first, doclist) 782: values ([zTerm], [iFirst], [doclist]) */ 783: static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, 784: sqlite_int64 iFirst, DocList *doclist){ 785: sqlite3_stmt *s; 786: int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); 787: if( rc!=SQLITE_OK ) return rc; 788: 789: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); 790: if( rc!=SQLITE_OK ) return rc; 791: 792: rc = sqlite3_bind_int64(s, 2, iFirst); 793: if( rc!=SQLITE_OK ) return rc; 794: 795: rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); 796: if( rc!=SQLITE_OK ) return rc; 797: 798: return sql_single_step_statement(v, TERM_INSERT_STMT, &s); 799: } 800: 801: /* update %_term set doclist = [doclist] where rowid = [rowid] */ 802: static int term_update(fulltext_vtab *v, sqlite_int64 rowid, 803: DocList *doclist){ 804: sqlite3_stmt *s; 805: int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); 806: if( rc!=SQLITE_OK ) return rc; 807: 808: rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, 809: SQLITE_STATIC); 810: if( rc!=SQLITE_OK ) return rc; 811: 812: rc = sqlite3_bind_int64(s, 2, rowid); 813: if( rc!=SQLITE_OK ) return rc; 814: 815: return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); 816: } 817: 818: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ 819: sqlite3_stmt *s; 820: int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); 821: if( rc!=SQLITE_OK ) return rc; 822: 823: rc = sqlite3_bind_int64(s, 1, rowid); 824: if( rc!=SQLITE_OK ) return rc; 825: 826: return sql_single_step_statement(v, TERM_DELETE_STMT, &s); 827: } 828: 829: static void fulltext_vtab_destroy(fulltext_vtab *v){ 830: int iStmt; 831: 832: for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ 833: if( v->pFulltextStatements[iStmt]!=NULL ){ 834: sqlite3_finalize(v->pFulltextStatements[iStmt]); 835: v->pFulltextStatements[iStmt] = NULL; 836: } 837: } 838: 839: if( v->pTokenizer!=NULL ){ 840: v->pTokenizer->pModule->xDestroy(v->pTokenizer); 841: v->pTokenizer = NULL; 842: } 843: 844: free((void *) v->zName); 845: free(v); 846: } 847: 848: /* Current interface: 849: ** argv[0] - module name 850: ** argv[1] - database name 851: ** argv[2] - table name 852: ** argv[3] - tokenizer name (optional, a sensible default is provided) 853: ** argv[4..] - passed to tokenizer (optional based on tokenizer) 854: **/ 855: static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, 856: sqlite3_vtab **ppVTab){ 857: int rc; 858: fulltext_vtab *v; 859: sqlite3_tokenizer_module *m = NULL; 860: 861: assert( argc>=3 ); 862: v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); 863: /* sqlite will initialize v->base */ 864: v->db = db; 865: v->zName = string_dup(argv[2]); 866: v->pTokenizer = NULL; 867: 868: if( argc==3 ){ 869: get_simple_tokenizer_module(&m); 870: } else { 871: /* TODO(shess) For now, add new tokenizers as else if clauses. */ 872: if( !strcmp(argv[3], "simple") ){ 873: get_simple_tokenizer_module(&m); 874: } else { 875: assert( "unrecognized tokenizer"==NULL ); 876: } 877: } 878: 879: /* TODO(shess) Since tokenization impacts the index, the parameters 880: ** to the tokenizer need to be identical when a persistent virtual 881: ** table is re-created. One solution would be a meta-table to track 882: ** such information in the database. Then we could verify that the 883: ** information is identical on subsequent creates. 884: */ 885: /* TODO(shess) Why isn't argv already (const char **)? */ 886: rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); 887: if( rc!=SQLITE_OK ) return rc; 888: v->pTokenizer->pModule = m; 889: 890: /* TODO: verify the existence of backing tables foo_content, foo_term */ 891: 892: rc = sqlite3_declare_vtab(db, "create table x(content text)"); 893: if( rc!=SQLITE_OK ) return rc; 894: 895: memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); 896: 897: *ppVTab = &v->base; 898: return SQLITE_OK; 899: } 900: 901: static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, 902: sqlite3_vtab **ppVTab){ 903: int rc; 904: assert( argc>=3 ); 905: 906: /* The %_content table holds the text of each full-text item, with 907: ** the rowid used as the docid. 908: ** 909: ** The %_term table maps each term to a document list blob 910: ** containing elements sorted by ascending docid, each element 911: ** encoded as: 912: ** 913: ** docid varint-encoded 914: ** token count varint-encoded 915: ** "count" token elements (poslist): 916: ** position varint-encoded as delta from previous position 917: ** start offset varint-encoded as delta from previous start offset 918: ** end offset varint-encoded as delta from start offset 919: ** 920: ** Additionally, doclist blobs can be chunked into multiple rows, 921: ** using "first" to order the blobs. "first" is simply the first 922: ** docid in the blob. 923: */ 924: /* 925: ** NOTE(shess) That last sentence is incorrect in the face of 926: ** deletion, which can leave a doclist that doesn't contain the 927: ** first from that row. I _believe_ this does not matter to the 928: ** operation of the system, but it might be reasonable to update 929: ** appropriately in case this assumption becomes more important. 930: */ 931: rc = sql_exec(db, argv[2], 932: "create table %_content(content text);" 933: "create table %_term(term text, first integer, doclist blob);" 934: "create index %_index on %_term(term, first)"); 935: if( rc!=SQLITE_OK ) return rc; 936: 937: return fulltextConnect(db, pAux, argc, argv, ppVTab); 938: } 939: 940: /* Decide how to handle an SQL query. 941: * At the moment, MATCH queries can include implicit boolean ANDs; we 942: * haven't implemented phrase searches or OR yet. */ 943: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ 944: int i; 945: 946: for(i=0; i<pInfo->nConstraint; ++i){ 947: const struct sqlite3_index_constraint *pConstraint; 948: pConstraint = &pInfo->aConstraint[i]; 949: if( pConstraint->iColumn==0 && 950: pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && 951: pConstraint->usable ){ /* a full-text search */ 952: pInfo->aConstraintUsage[i].argvIndex = 1; 953: pInfo->aConstraintUsage[i].omit = 1; 954: pInfo->idxNum = QUERY_FULLTEXT; 955: pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ 956: return SQLITE_OK; 957: } 958: } 959: pInfo->idxNum = QUERY_GENERIC; 960: return SQLITE_OK; 961: } 962: 963: static int fulltextDisconnect(sqlite3_vtab *pVTab){ 964: fulltext_vtab_destroy((fulltext_vtab *)pVTab); 965: return SQLITE_OK; 966: } 967: 968: static int fulltextDestroy(sqlite3_vtab *pVTab){ 969: fulltext_vtab *v = (fulltext_vtab *)pVTab; 970: 971: int rc = sql_exec(v->db, v->zName, 972: "drop table %_content; drop table %_term"); 973: if( rc!=SQLITE_OK ) return rc; 974: 975: fulltext_vtab_destroy((fulltext_vtab *)pVTab); 976: return SQLITE_OK; 977: } 978: 979: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ 980: fulltext_cursor *c; 981: 982: c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); 983: /* sqlite will initialize c->base */ 984: *ppCursor = &c->base; 985: 986: return SQLITE_OK; 987: } 988: 989: static int fulltextClose(sqlite3_vtab_cursor *pCursor){ 990: fulltext_cursor *c = (fulltext_cursor *) pCursor; 991: sqlite3_finalize(c->pStmt); 992: if( c->result.pDoclist!=NULL ){ 993: docListDelete(c->result.pDoclist); 994: } 995: free(c); 996: return SQLITE_OK; 997: } 998: 999: static int fulltextNext(sqlite3_vtab_cursor *pCursor){ 1000: fulltext_cursor *c = (fulltext_cursor *) pCursor; 1001: sqlite_int64 iDocid; 1002: int rc; 1003: 1004: switch( c->iCursorType ){ 1005: case QUERY_GENERIC: 1006: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1007: rc = sqlite3_step(c->pStmt); 1008: switch( rc ){ 1009: case SQLITE_ROW: 1010: c->eof = 0; 1011: return SQLITE_OK; 1012: case SQLITE_DONE: 1013: c->eof = 1; 1014: return SQLITE_OK; 1015: default: 1016: c->eof = 1; 1017: return rc; 1018: } 1019: case QUERY_FULLTEXT: 1020: rc = sqlite3_reset(c->pStmt); 1021: if( rc!=SQLITE_OK ) return rc; 1022: 1023: if( readerAtEnd(&c->result)){ 1024: c->eof = 1; 1025: return SQLITE_OK; 1026: } 1027: iDocid = readDocid(&c->result); 1028: rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); 1029: if( rc!=SQLITE_OK ) return rc; 1030: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ 1031: rc = sqlite3_step(c->pStmt); 1032: if( rc==SQLITE_ROW ){ /* the case we expect */ 1033: c->eof = 0; 1034: return SQLITE_OK; 1035: } 1036: /* an error occurred; abort */ 1037: return rc==SQLITE_DONE ? SQLITE_ERROR : rc; 1038: default: 1039: assert( 0 ); 1040: return SQLITE_ERROR; /* not reached */ 1041: } 1042: } 1043: 1044: static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, 1045: sqlite3_stmt **ppStmt){ 1046: int rc; 1047: if( *ppStmt ){ 1048: rc = sqlite3_reset(*ppStmt); 1049: } else { 1050: rc = sql_prepare(v->db, v->zName, ppStmt, 1051: "select doclist from %_term where term = ? order by first"); 1052: } 1053: if( rc!=SQLITE_OK ) return rc; 1054: 1055: rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); 1056: if( rc!=SQLITE_OK ) return rc; 1057: 1058: return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ 1059: } 1060: 1061: /* Read the posting list for [zTerm]; AND it with the doclist [in] to 1062: * produce the doclist [out], using the given offset [iOffset] for phrase 1063: * matching. 1064: * (*pSelect) is used to hold an SQLite statement used inside this function; 1065: * the caller should initialize *pSelect to NULL before the first call. 1066: */ 1067: static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, 1068: const char *zTerm, 1069: DocList *pIn, int iOffset, DocList *out){ 1070: int rc; 1071: DocListMerge merge; 1072: 1073: if( pIn!=NULL && !pIn->nData ){ 1074: /* If [pIn] is already empty, there's no point in reading the 1075: * posting list to AND it in; return immediately. */ 1076: return SQLITE_OK; 1077: } 1078: 1079: rc = term_select_doclist(v, zTerm, -1, pSelect); 1080: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; 1081: 1082: mergeInit(&merge, pIn, iOffset, out); 1083: while( rc==SQLITE_ROW ){ 1084: DocList block; 1085: docListInit(&block, DL_POSITIONS_OFFSETS, 1086: sqlite3_column_blob(*pSelect, 0), 1087: sqlite3_column_bytes(*pSelect, 0)); 1088: mergeBlock(&merge, &block); 1089: docListDestroy(&block); 1090: 1091: rc = sqlite3_step(*pSelect); 1092: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ 1093: return rc; 1094: } 1095: } 1096: 1097: return SQLITE_OK; 1098: } 1099: 1100: typedef struct QueryTerm { 1101: int is_phrase; /* true if this term begins a new phrase */ 1102: const char *zTerm; 1103: } QueryTerm; 1104: 1105: /* A parsed query. 1106: * 1107: * As an example, parsing the query ["four score" years "new nation"] will 1108: * yield a Query with 5 terms: 1109: * "four", is_phrase = 1 1110: * "score", is_phrase = 0 1111: * "years", is_phrase = 1 1112: * "new", is_phrase = 1 1113: * "nation", is_phrase = 0 1114: */ 1115: typedef struct Query { 1116: int nTerms; 1117: QueryTerm *pTerm; 1118: } Query; 1119: 1120: static void query_add(Query *q, int is_phrase, const char *zTerm){ 1121: QueryTerm *t; 1122: ++q->nTerms; 1123: q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); 1124: t = &q->pTerm[q->nTerms - 1]; 1125: t->is_phrase = is_phrase; 1126: t->zTerm = zTerm; 1127: } 1128: 1129: static void query_free(Query *q){ 1130: int i; 1131: for(i = 0; i < q->nTerms; ++i){ 1132: free((void *) q->pTerm[i].zTerm); 1133: } 1134: free(q->pTerm); 1135: } 1136: 1137: static int tokenize_segment(sqlite3_tokenizer *pTokenizer, 1138: const char *zQuery, int in_phrase, 1139: Query *pQuery){ 1140: sqlite3_tokenizer_module *pModule = pTokenizer->pModule; 1141: sqlite3_tokenizer_cursor *pCursor; 1142: int is_first = 1; 1143: 1144: int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); 1145: if( rc!=SQLITE_OK ) return rc; 1146: pCursor->pTokenizer = pTokenizer; 1147: 1148: while( 1 ){ 1149: const char *zToken; 1150: int nToken, iStartOffset, iEndOffset, dummy_pos; 1151: 1152: rc = pModule->xNext(pCursor, 1153: &zToken, &nToken, 1154: &iStartOffset, &iEndOffset, 1155: &dummy_pos); 1156: if( rc!=SQLITE_OK ) break; 1157: query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); 1158: is_first = 0; 1159: } 1160: 1161: return pModule->xClose(pCursor); 1162: } 1163: 1164: /* Parse a query string, yielding a Query object. */ 1165: static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ 1166: char *zQuery1 = string_dup(zQuery); 1167: int in_phrase = 0; 1168: char *s = zQuery1; 1169: pQuery->nTerms = 0; 1170: pQuery->pTerm = NULL; 1171: 1172: while( *s ){ 1173: char *t = s; 1174: while( *t ){ 1175: if( *t=='"' ){ 1176: *t++ = '\0'; 1177: break; 1178: } 1179: ++t; 1180: } 1181: if( *s ){ 1182: tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); 1183: } 1184: s = t; 1185: in_phrase = !in_phrase; 1186: } 1187: 1188: free(zQuery1); 1189: return SQLITE_OK; 1190: } 1191: 1192: /* Perform a full-text query; return a list of documents in [pResult]. */ 1193: static int fulltext_query(fulltext_vtab *v, const char *zQuery, 1194: DocList **pResult){ 1195: Query q; 1196: int phrase_start = -1; 1197: int i; 1198: sqlite3_stmt *pSelect = NULL; 1199: DocList *d = NULL; 1200: 1201: int rc = parse_query(v, zQuery, &q); 1202: if( rc!=SQLITE_OK ) return rc; 1203: 1204: /* Merge terms. */ 1205: for(i = 0 ; i < q.nTerms ; ++i){ 1206: /* In each merge step, we need to generate positions whenever we're 1207: * processing a phrase which hasn't ended yet. */ 1208: int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; 1209: DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); 1210: if( q.pTerm[i].is_phrase ){ 1211: phrase_start = i; 1212: } 1213: rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); 1214: if( rc!=SQLITE_OK ) break; 1215: if( d!=NULL ){ 1216: docListDelete(d); 1217: } 1218: d = next; 1219: } 1220: 1221: sqlite3_finalize(pSelect); 1222: query_free(&q); 1223: *pResult = d; 1224: return rc; 1225: } 1226: 1227: static int fulltextFilter(sqlite3_vtab_cursor *pCursor, 1228: int idxNum, const char *idxStr, 1229: int argc, sqlite3_value **argv){ 1230: fulltext_cursor *c = (fulltext_cursor *) pCursor; 1231: fulltext_vtab *v = cursor_vtab(c); 1232: int rc; 1233: const char *zStatement; 1234: 1235: c->iCursorType = idxNum; 1236: switch( idxNum ){ 1237: case QUERY_GENERIC: 1238: zStatement = "select rowid, content from %_content"; 1239: break; 1240: 1241: case QUERY_FULLTEXT: /* full-text search */ 1242: { 1243: const char *zQuery = (const char *)sqlite3_value_text(argv[0]); 1244: DocList *pResult; 1245: assert( argc==1 ); 1246: rc = fulltext_query(v, zQuery, &pResult); 1247: if( rc!=SQLITE_OK ) return rc; 1248: readerInit(&c->result, pResult); 1249: zStatement = "select rowid, content from %_content where rowid = ?"; 1250: break; 1251: } 1252: 1253: default: 1254: assert( 0 ); 1255: } 1256: 1257: rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); 1258: if( rc!=SQLITE_OK ) return rc; 1259: 1260: return fulltextNext(pCursor); 1261: } 1262: 1263: static int fulltextEof(sqlite3_vtab_cursor *pCursor){ 1264: fulltext_cursor *c = (fulltext_cursor *) pCursor; 1265: return c->eof; 1266: } 1267: 1268: static int fulltextColumn(sqlite3_vtab_cursor *pCursor, 1269: sqlite3_context *pContext, int idxCol){ 1270: fulltext_cursor *c = (fulltext_cursor *) pCursor; 1271: const char *s; 1272: 1273: assert( idxCol==0 ); 1274: s = (const char *) sqlite3_column_text(c->pStmt, 1); 1275: sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); 1276: 1277: return SQLITE_OK; 1278: } 1279: 1280: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ 1281: fulltext_cursor *c = (fulltext_cursor *) pCursor; 1282: 1283: *pRowid = sqlite3_column_int64(c->pStmt, 0); 1284: return SQLITE_OK; 1285: } 1286: 1287: /* Build a hash table containing all terms in zText. */ 1288: static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, 1289: const char *zText, sqlite_int64 iDocid){ 1290: sqlite3_tokenizer_cursor *pCursor; 1291: const char *pToken; 1292: int nTokenBytes; 1293: int iStartOffset, iEndOffset, iPosition; 1294: 1295: int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); 1296: if( rc!=SQLITE_OK ) return rc; 1297: 1298: pCursor->pTokenizer = pTokenizer; 1299: HashInit(terms, HASH_STRING, 1); 1300: while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, 1301: &pToken, &nTokenBytes, 1302: &iStartOffset, &iEndOffset, 1303: &iPosition) ){ 1304: DocList *p; 1305: 1306: /* Positions can't be negative; we use -1 as a terminator internally. */ 1307: if( iPosition<0 ) { 1308: rc = SQLITE_ERROR; 1309: goto err; 1310: } 1311: 1312: p = HashFind(terms, pToken, nTokenBytes); 1313: if( p==NULL ){ 1314: p = docListNew(DL_POSITIONS_OFFSETS); 1315: docListAddDocid(p, iDocid); 1316: HashInsert(terms, pToken, nTokenBytes, p); 1317: } 1318: docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); 1319: } 1320: 1321: err: 1322: /* TODO(shess) Check return? Should this be able to cause errors at 1323: ** this point? Actually, same question about sqlite3_finalize(), 1324: ** though one could argue that failure there means that the data is 1325: ** not durable. *ponder* 1326: */ 1327: pTokenizer->pModule->xClose(pCursor); 1328: return rc; 1329: } 1330: /* Update the %_terms table to map the term [zTerm] to the given rowid. */ 1331: static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, 1332: sqlite_int64 iDocid, DocList *p){ 1333: sqlite_int64 iFirst; 1334: sqlite_int64 iIndexRow; 1335: DocList doclist; 1336: 1337: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); 1338: if( rc==SQLITE_DONE ){ 1339: docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); 1340: if( docListUpdate(&doclist, iDocid, p) ){ 1341: rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); 1342: docListDestroy(&doclist); 1343: return rc; 1344: } 1345: return SQLITE_OK; 1346: } 1347: if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 1348: 1349: /* This word is in the index; add this document ID to its blob. */ 1350: 1351: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); 1352: if( rc!=SQLITE_OK ) return rc; 1353: 1354: if( docListUpdate(&doclist, iDocid, p) ){ 1355: /* If the blob is too big, split it in half. */ 1356: if( doclist.nData>CHUNK_MAX ){ 1357: DocList half; 1358: if( docListSplit(&doclist, &half) ){ 1359: rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); 1360: docListDestroy(&half); 1361: if( rc!=SQLITE_OK ) goto err; 1362: } 1363: } 1364: rc = term_update(v, iIndexRow, &doclist); 1365: } 1366: 1367: err: 1368: docListDestroy(&doclist); 1369: return rc; 1370: } 1371: 1372: /* Insert a row into the full-text index; set *piRowid to be the ID of the 1373: * new row. */ 1374: static int index_insert(fulltext_vtab *v, 1375: sqlite3_value *pRequestRowid, const char *zText, 1376: sqlite_int64 *piRowid){ 1377: Hash terms; /* maps term string -> PosList */ 1378: HashElem *e; 1379: 1380: int rc = content_insert(v, pRequestRowid, zText, -1); 1381: if( rc!=SQLITE_OK ) return rc; 1382: *piRowid = sqlite3_last_insert_rowid(v->db); 1383: 1384: if( !zText ) return SQLITE_OK; /* nothing to index */ 1385: 1386: rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); 1387: if( rc!=SQLITE_OK ) return rc; 1388: 1389: for(e=HashFirst(&terms); e; e=HashNext(e)){ 1390: DocList *p = HashData(e); 1391: rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); 1392: if( rc!=SQLITE_OK ) break; 1393: } 1394: 1395: for(e=HashFirst(&terms); e; e=HashNext(e)){ 1396: DocList *p = HashData(e); 1397: docListDelete(p); 1398: } 1399: HashClear(&terms); 1400: return rc; 1401: } 1402: 1403: static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, 1404: sqlite_int64 iDocid){ 1405: sqlite_int64 iFirst; 1406: sqlite_int64 iIndexRow; 1407: DocList doclist; 1408: 1409: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); 1410: if( rc!=SQLITE_ROW ) return SQLITE_ERROR; 1411: 1412: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); 1413: if( rc!=SQLITE_OK ) return rc; 1414: 1415: if( docListUpdate(&doclist, iDocid, NULL) ){ 1416: if( doclist.nData>0 ){ 1417: rc = term_update(v, iIndexRow, &doclist); 1418: } else { /* empty posting list */ 1419: rc = term_delete(v, iIndexRow); 1420: } 1421: } 1422: docListDestroy(&doclist); 1423: return rc; 1424: } 1425: 1426: /* Delete a row from the full-text index. */ 1427: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ 1428: char *zText; 1429: Hash terms; 1430: HashElem *e; 1431: 1432: int rc = content_select(v, iRow, &zText); 1433: if( rc!=SQLITE_OK ) return rc; 1434: 1435: rc = build_terms(&terms, v->pTokenizer, zText, iRow); 1436: free(zText); 1437: if( rc!=SQLITE_OK ) return rc; 1438: 1439: for(e=HashFirst(&terms); e; e=HashNext(e)){ 1440: rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); 1441: if( rc!=SQLITE_OK ) break; 1442: } 1443: for(e=HashFirst(&terms); e; e=HashNext(e)){ 1444: DocList *p = HashData(e); 1445: docListDelete(p); 1446: } 1447: HashClear(&terms); 1448: 1449: return content_delete(v, iRow); 1450: } 1451: 1452: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, 1453: sqlite_int64 *pRowid){ 1454: fulltext_vtab *v = (fulltext_vtab *) pVtab; 1455: 1456: if( nArg<2 ){ 1457: return index_delete(v, sqlite3_value_int64(ppArg[0])); 1458: } 1459: 1460: if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ 1461: return SQLITE_ERROR; /* an update; not yet supported */ 1462: } 1463: 1464: assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ 1465: return index_insert(v, ppArg[1], 1466: (const char *)sqlite3_value_text(ppArg[2]), pRowid); 1467: } 1468: 1469: static sqlite3_module fulltextModule = { 1470: 0, 1471: fulltextCreate, 1472: fulltextConnect, 1473: fulltextBestIndex, 1474: fulltextDisconnect, 1475: fulltextDestroy, 1476: fulltextOpen, 1477: fulltextClose, 1478: fulltextFilter, 1479: fulltextNext, 1480: fulltextEof, 1481: fulltextColumn, 1482: fulltextRowid, 1483: fulltextUpdate 1484: }; 1485: 1486: int fulltext_init(sqlite3 *db){ 1487: return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); 1488: } 1489: 1490: #if !SQLITE_CORE 1491: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, 1492: const sqlite3_api_routines *pApi){ 1493: SQLITE_EXTENSION_INIT2(pApi) 1494: return fulltext_init(db); 1495: } 1496: #endif