Annotation of embedaddon/sqlite3/test/fts3_common.tcl, revision 1.1.1.1

1.1       misho       1: # 2009 November 04
                      2: #
                      3: # The author disclaims copyright to this source code.  In place of
                      4: # a legal notice, here is a blessing:
                      5: #
                      6: #    May you do good and not evil.
                      7: #    May you find forgiveness for yourself and forgive others.
                      8: #    May you share freely, never taking more than you give.
                      9: #
                     10: #***********************************************************************
                     11: #
                     12: # This file contains common code used the fts3 tests. At one point
                     13: # equivalent functionality was implemented in C code. But it is easier
                     14: # to use Tcl.
                     15: #
                     16: 
                     17: #-------------------------------------------------------------------------
                     18: # USAGE: fts3_integrity_check TBL
                     19: #
                     20: # This proc is used to verify that the full-text index is consistent with
                     21: # the contents of the fts3 table. In other words, it checks that the
                     22: # data in the %_contents table matches that in the %_segdir and %_segments 
                     23: # tables.
                     24: #
                     25: # This is not an efficient procedure. It uses a lot of memory and a lot
                     26: # of CPU. But it is better than not checking at all.
                     27: #
                     28: # The procedure is:
                     29: #
                     30: #   1) Read the entire full-text index from the %_segdir and %_segments
                     31: #      tables into memory. For each entry in the index, the following is
                     32: #      done:
                     33: #
                     34: #          set C($iDocid,$iCol,$iPosition) $zTerm
                     35: #
                     36: #   2) Iterate through each column of each row of the %_content table. 
                     37: #      Tokenize all documents, and check that for each token there is
                     38: #      a corresponding entry in the $C array. After checking a token,
                     39: #      [unset] the $C array entry.
                     40: #
                     41: #   3) Check that array $C is now empty.
                     42: #      
                     43: #
                     44: proc fts3_integrity_check {tbl} {
                     45: 
                     46:   fts3_read2 $tbl 1 A
                     47: 
                     48:   foreach zTerm [array names A] {
                     49:     foreach doclist $A($zTerm) {
                     50:       set docid 0
                     51:       while {[string length $doclist]>0} {
                     52:         set iCol 0
                     53:         set iPos 0
                     54:         set lPos [list]
                     55:         set lCol [list]
                     56: 
                     57:         # First varint of a doclist-entry is the docid. Delta-compressed
                     58:         # with respect to the docid of the previous entry.
                     59:         #
                     60:         incr docid [gobble_varint doclist]
                     61:         if {[info exists D($zTerm,$docid)]} {
                     62:           while {[set iDelta [gobble_varint doclist]] != 0} {}
                     63:           continue
                     64:         }
                     65:         set D($zTerm,$docid) 1
                     66: 
                     67:         # Gobble varints until the 0x00 that terminates the doclist-entry
                     68:         # is found.
                     69:         while {[set iDelta [gobble_varint doclist]] > 0} {
                     70:           if {$iDelta == 1} {
                     71:             set iCol [gobble_varint doclist]
                     72:             set iPos 0
                     73:           } else {
                     74:             incr iPos $iDelta
                     75:             incr iPos -2
                     76:             set C($docid,$iCol,$iPos) $zTerm
                     77:           }
                     78:         }
                     79:       }
                     80:     }
                     81:   }
                     82: 
                     83:   foreach key [array names C] {
                     84:     #puts "$key -> $C($key)"
                     85:   }
                     86: 
                     87: 
                     88:   db eval "SELECT * FROM ${tbl}_content" E {
                     89:     set iCol 0
                     90:     set iDoc $E(docid)
                     91:     foreach col [lrange $E(*) 1 end] {
                     92:       set c $E($col)
                     93:       set sql {SELECT fts3_tokenizer_test('simple', $c)}
                     94: 
                     95:       foreach {pos term dummy} [db one $sql] {
                     96:         if {![info exists C($iDoc,$iCol,$pos)]} {
                     97:           set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
                     98:           lappend errors $es
                     99:         } else {
                    100:           if {$C($iDoc,$iCol,$pos) != "$term"} {
                    101:             set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
                    102:             append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
                    103:             lappend errors $es
                    104:           }
                    105:           unset C($iDoc,$iCol,$pos)
                    106:         }
                    107:       }
                    108:       incr iCol
                    109:     }
                    110:   }
                    111: 
                    112:   foreach c [array names C] {
                    113:     lappend errors "Bad index entry: $c -> $C($c)"
                    114:   }
                    115: 
                    116:   if {[info exists errors]} { return [join $errors "\n"] }
                    117:   return "ok"
                    118: }
                    119: 
                    120: # USAGE: fts3_terms TBL WHERE
                    121: #
                    122: # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
                    123: # SQL expression that will be used as the WHERE clause when scanning
                    124: # the %_segdir table. As in the following query:
                    125: #
                    126: #   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
                    127: #
                    128: # This function returns a list of all terms present in the segments
                    129: # selected by the statement above.
                    130: #
                    131: proc fts3_terms {tbl where} {
                    132:   fts3_read $tbl $where a
                    133:   return [lsort [array names a]]
                    134: }
                    135: 
                    136: 
                    137: # USAGE: fts3_doclist TBL TERM WHERE
                    138: #
                    139: # Argument TBL must be the name of an FTS3 table. TERM is a term that may
                    140: # or may not be present in the table. Argument WHERE is used to select a 
                    141: # subset of the b-tree segments in the associated full-text index as 
                    142: # described above for [fts3_terms].
                    143: #
                    144: # This function returns the results of merging the doclists associated
                    145: # with TERM in the selected segments. Each doclist is an element of the
                    146: # returned list. Each doclist is formatted as follows:
                    147: #
                    148: #   [$docid ?$col[$off1 $off2...]?...]
                    149: #
                    150: # The formatting is odd for a Tcl command in order to be compatible with
                    151: # the original C-language implementation. If argument WHERE is "1", then 
                    152: # any empty doclists are omitted from the returned list.
                    153: #
                    154: proc fts3_doclist {tbl term where} {
                    155:   fts3_read $tbl $where a
                    156: 
                    157: 
                    158:   foreach doclist $a($term) {
                    159:     set docid 0
                    160: 
                    161:     while {[string length $doclist]>0} {
                    162:       set iCol 0
                    163:       set iPos 0
                    164:       set lPos [list]
                    165:       set lCol [list]
                    166:       incr docid [gobble_varint doclist]
                    167:   
                    168:       while {[set iDelta [gobble_varint doclist]] > 0} {
                    169:         if {$iDelta == 1} {
                    170:           lappend lCol [list $iCol $lPos]
                    171:           set iPos 0
                    172:           set lPos [list]
                    173:           set iCol [gobble_varint doclist]
                    174:         } else {
                    175:           incr iPos $iDelta
                    176:           incr iPos -2
                    177:           lappend lPos $iPos
                    178:         }
                    179:       }
                    180:   
                    181:       if {[llength $lPos]>0} {
                    182:         lappend lCol [list $iCol $lPos]
                    183:       }
                    184:   
                    185:       if {$where != "1" || [llength $lCol]>0} {
                    186:         set ret($docid) $lCol
                    187:       } else {
                    188:         unset -nocomplain ret($docid)
                    189:       }
                    190:     }
                    191:   }
                    192: 
                    193:   set lDoc [list]
                    194:   foreach docid [lsort -integer [array names ret]] {
                    195:     set lCol [list]
                    196:     set cols ""
                    197:     foreach col $ret($docid) {
                    198:       foreach {iCol lPos} $col {}
                    199:       append cols " $iCol\[[join $lPos { }]\]"
                    200:     }
                    201:     lappend lDoc "\[${docid}${cols}\]"
                    202:   }
                    203: 
                    204:   join $lDoc " "
                    205: }
                    206: 
                    207: ###########################################################################
                    208: 
                    209: proc gobble_varint {varname} {
                    210:   upvar $varname blob
                    211:   set n [read_fts3varint $blob ret]
                    212:   set blob [string range $blob $n end]
                    213:   return $ret
                    214: }
                    215: proc gobble_string {varname nLength} {
                    216:   upvar $varname blob
                    217:   set ret [string range $blob 0 [expr $nLength-1]]
                    218:   set blob [string range $blob $nLength end]
                    219:   return $ret
                    220: }
                    221: 
                    222: # The argument is a blob of data representing an FTS3 segment leaf. 
                    223: # Return a list consisting of alternating terms (strings) and doclists
                    224: # (blobs of data).
                    225: #
                    226: proc fts3_readleaf {blob} {
                    227:   set zPrev ""
                    228:   set terms [list]
                    229: 
                    230:   while {[string length $blob] > 0} {
                    231:     set nPrefix [gobble_varint blob]
                    232:     set nSuffix [gobble_varint blob]
                    233: 
                    234:     set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
                    235:     append zTerm [gobble_string blob $nSuffix]
                    236:     set doclist [gobble_string blob [gobble_varint blob]]
                    237: 
                    238:     lappend terms $zTerm $doclist
                    239:     set zPrev $zTerm
                    240:   }
                    241: 
                    242:   return $terms
                    243: }
                    244: 
                    245: proc fts3_read2 {tbl where varname} {
                    246:   upvar $varname a
                    247:   array unset a
                    248:   db eval " SELECT start_block, leaves_end_block, root 
                    249:             FROM ${tbl}_segdir WHERE $where
                    250:             ORDER BY level ASC, idx DESC
                    251:   " {
                    252:     if {$start_block == 0} {
                    253:       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
                    254:     } else {
                    255:       db eval " SELECT block 
                    256:                 FROM ${tbl}_segments 
                    257:                 WHERE blockid>=$start_block AND blockid<=$leaves_end_block
                    258:                 ORDER BY blockid
                    259:       " {
                    260:         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
                    261: 
                    262:       }
                    263:     }
                    264:   }
                    265: }
                    266: 
                    267: proc fts3_read {tbl where varname} {
                    268:   upvar $varname a
                    269:   array unset a
                    270:   db eval " SELECT start_block, leaves_end_block, root 
                    271:             FROM ${tbl}_segdir WHERE $where
                    272:             ORDER BY level DESC, idx ASC
                    273:   " {
                    274:     if {$start_block == 0} {
                    275:       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
                    276:     } else {
                    277:       db eval " SELECT block 
                    278:                 FROM ${tbl}_segments 
                    279:                 WHERE blockid>=$start_block AND blockid<$leaves_end_block
                    280:                 ORDER BY blockid
                    281:       " {
                    282:         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
                    283: 
                    284:       }
                    285:     }
                    286:   }
                    287: }
                    288: 
                    289: ##########################################################################
                    290: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>