File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / test / fts3_common.tcl
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:16 2012 UTC (12 years, 4 months ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: # 2009 November 04
    2: #
    3: # The author disclaims copyright to this source code.  In place of
    4: # a legal notice, here is a blessing:
    5: #
    6: #    May you do good and not evil.
    7: #    May you find forgiveness for yourself and forgive others.
    8: #    May you share freely, never taking more than you give.
    9: #
   10: #***********************************************************************
   11: #
   12: # This file contains common code used the fts3 tests. At one point
   13: # equivalent functionality was implemented in C code. But it is easier
   14: # to use Tcl.
   15: #
   16: 
   17: #-------------------------------------------------------------------------
   18: # USAGE: fts3_integrity_check TBL
   19: #
   20: # This proc is used to verify that the full-text index is consistent with
   21: # the contents of the fts3 table. In other words, it checks that the
   22: # data in the %_contents table matches that in the %_segdir and %_segments 
   23: # tables.
   24: #
   25: # This is not an efficient procedure. It uses a lot of memory and a lot
   26: # of CPU. But it is better than not checking at all.
   27: #
   28: # The procedure is:
   29: #
   30: #   1) Read the entire full-text index from the %_segdir and %_segments
   31: #      tables into memory. For each entry in the index, the following is
   32: #      done:
   33: #
   34: #          set C($iDocid,$iCol,$iPosition) $zTerm
   35: #
   36: #   2) Iterate through each column of each row of the %_content table. 
   37: #      Tokenize all documents, and check that for each token there is
   38: #      a corresponding entry in the $C array. After checking a token,
   39: #      [unset] the $C array entry.
   40: #
   41: #   3) Check that array $C is now empty.
   42: #      
   43: #
   44: proc fts3_integrity_check {tbl} {
   45: 
   46:   fts3_read2 $tbl 1 A
   47: 
   48:   foreach zTerm [array names A] {
   49:     foreach doclist $A($zTerm) {
   50:       set docid 0
   51:       while {[string length $doclist]>0} {
   52:         set iCol 0
   53:         set iPos 0
   54:         set lPos [list]
   55:         set lCol [list]
   56: 
   57:         # First varint of a doclist-entry is the docid. Delta-compressed
   58:         # with respect to the docid of the previous entry.
   59:         #
   60:         incr docid [gobble_varint doclist]
   61:         if {[info exists D($zTerm,$docid)]} {
   62:           while {[set iDelta [gobble_varint doclist]] != 0} {}
   63:           continue
   64:         }
   65:         set D($zTerm,$docid) 1
   66: 
   67:         # Gobble varints until the 0x00 that terminates the doclist-entry
   68:         # is found.
   69:         while {[set iDelta [gobble_varint doclist]] > 0} {
   70:           if {$iDelta == 1} {
   71:             set iCol [gobble_varint doclist]
   72:             set iPos 0
   73:           } else {
   74:             incr iPos $iDelta
   75:             incr iPos -2
   76:             set C($docid,$iCol,$iPos) $zTerm
   77:           }
   78:         }
   79:       }
   80:     }
   81:   }
   82: 
   83:   foreach key [array names C] {
   84:     #puts "$key -> $C($key)"
   85:   }
   86: 
   87: 
   88:   db eval "SELECT * FROM ${tbl}_content" E {
   89:     set iCol 0
   90:     set iDoc $E(docid)
   91:     foreach col [lrange $E(*) 1 end] {
   92:       set c $E($col)
   93:       set sql {SELECT fts3_tokenizer_test('simple', $c)}
   94: 
   95:       foreach {pos term dummy} [db one $sql] {
   96:         if {![info exists C($iDoc,$iCol,$pos)]} {
   97:           set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
   98:           lappend errors $es
   99:         } else {
  100:           if {$C($iDoc,$iCol,$pos) != "$term"} {
  101:             set    es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
  102:             append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
  103:             lappend errors $es
  104:           }
  105:           unset C($iDoc,$iCol,$pos)
  106:         }
  107:       }
  108:       incr iCol
  109:     }
  110:   }
  111: 
  112:   foreach c [array names C] {
  113:     lappend errors "Bad index entry: $c -> $C($c)"
  114:   }
  115: 
  116:   if {[info exists errors]} { return [join $errors "\n"] }
  117:   return "ok"
  118: }
  119: 
  120: # USAGE: fts3_terms TBL WHERE
  121: #
  122: # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
  123: # SQL expression that will be used as the WHERE clause when scanning
  124: # the %_segdir table. As in the following query:
  125: #
  126: #   "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
  127: #
  128: # This function returns a list of all terms present in the segments
  129: # selected by the statement above.
  130: #
  131: proc fts3_terms {tbl where} {
  132:   fts3_read $tbl $where a
  133:   return [lsort [array names a]]
  134: }
  135: 
  136: 
  137: # USAGE: fts3_doclist TBL TERM WHERE
  138: #
  139: # Argument TBL must be the name of an FTS3 table. TERM is a term that may
  140: # or may not be present in the table. Argument WHERE is used to select a 
  141: # subset of the b-tree segments in the associated full-text index as 
  142: # described above for [fts3_terms].
  143: #
  144: # This function returns the results of merging the doclists associated
  145: # with TERM in the selected segments. Each doclist is an element of the
  146: # returned list. Each doclist is formatted as follows:
  147: #
  148: #   [$docid ?$col[$off1 $off2...]?...]
  149: #
  150: # The formatting is odd for a Tcl command in order to be compatible with
  151: # the original C-language implementation. If argument WHERE is "1", then 
  152: # any empty doclists are omitted from the returned list.
  153: #
  154: proc fts3_doclist {tbl term where} {
  155:   fts3_read $tbl $where a
  156: 
  157: 
  158:   foreach doclist $a($term) {
  159:     set docid 0
  160: 
  161:     while {[string length $doclist]>0} {
  162:       set iCol 0
  163:       set iPos 0
  164:       set lPos [list]
  165:       set lCol [list]
  166:       incr docid [gobble_varint doclist]
  167:   
  168:       while {[set iDelta [gobble_varint doclist]] > 0} {
  169:         if {$iDelta == 1} {
  170:           lappend lCol [list $iCol $lPos]
  171:           set iPos 0
  172:           set lPos [list]
  173:           set iCol [gobble_varint doclist]
  174:         } else {
  175:           incr iPos $iDelta
  176:           incr iPos -2
  177:           lappend lPos $iPos
  178:         }
  179:       }
  180:   
  181:       if {[llength $lPos]>0} {
  182:         lappend lCol [list $iCol $lPos]
  183:       }
  184:   
  185:       if {$where != "1" || [llength $lCol]>0} {
  186:         set ret($docid) $lCol
  187:       } else {
  188:         unset -nocomplain ret($docid)
  189:       }
  190:     }
  191:   }
  192: 
  193:   set lDoc [list]
  194:   foreach docid [lsort -integer [array names ret]] {
  195:     set lCol [list]
  196:     set cols ""
  197:     foreach col $ret($docid) {
  198:       foreach {iCol lPos} $col {}
  199:       append cols " $iCol\[[join $lPos { }]\]"
  200:     }
  201:     lappend lDoc "\[${docid}${cols}\]"
  202:   }
  203: 
  204:   join $lDoc " "
  205: }
  206: 
  207: ###########################################################################
  208: 
  209: proc gobble_varint {varname} {
  210:   upvar $varname blob
  211:   set n [read_fts3varint $blob ret]
  212:   set blob [string range $blob $n end]
  213:   return $ret
  214: }
  215: proc gobble_string {varname nLength} {
  216:   upvar $varname blob
  217:   set ret [string range $blob 0 [expr $nLength-1]]
  218:   set blob [string range $blob $nLength end]
  219:   return $ret
  220: }
  221: 
  222: # The argument is a blob of data representing an FTS3 segment leaf. 
  223: # Return a list consisting of alternating terms (strings) and doclists
  224: # (blobs of data).
  225: #
  226: proc fts3_readleaf {blob} {
  227:   set zPrev ""
  228:   set terms [list]
  229: 
  230:   while {[string length $blob] > 0} {
  231:     set nPrefix [gobble_varint blob]
  232:     set nSuffix [gobble_varint blob]
  233: 
  234:     set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
  235:     append zTerm [gobble_string blob $nSuffix]
  236:     set doclist [gobble_string blob [gobble_varint blob]]
  237: 
  238:     lappend terms $zTerm $doclist
  239:     set zPrev $zTerm
  240:   }
  241: 
  242:   return $terms
  243: }
  244: 
  245: proc fts3_read2 {tbl where varname} {
  246:   upvar $varname a
  247:   array unset a
  248:   db eval " SELECT start_block, leaves_end_block, root 
  249:             FROM ${tbl}_segdir WHERE $where
  250:             ORDER BY level ASC, idx DESC
  251:   " {
  252:     if {$start_block == 0} {
  253:       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
  254:     } else {
  255:       db eval " SELECT block 
  256:                 FROM ${tbl}_segments 
  257:                 WHERE blockid>=$start_block AND blockid<=$leaves_end_block
  258:                 ORDER BY blockid
  259:       " {
  260:         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
  261: 
  262:       }
  263:     }
  264:   }
  265: }
  266: 
  267: proc fts3_read {tbl where varname} {
  268:   upvar $varname a
  269:   array unset a
  270:   db eval " SELECT start_block, leaves_end_block, root 
  271:             FROM ${tbl}_segdir WHERE $where
  272:             ORDER BY level DESC, idx ASC
  273:   " {
  274:     if {$start_block == 0} {
  275:       foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
  276:     } else {
  277:       db eval " SELECT block 
  278:                 FROM ${tbl}_segments 
  279:                 WHERE blockid>=$start_block AND blockid<$leaves_end_block
  280:                 ORDER BY blockid
  281:       " {
  282:         foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
  283: 
  284:       }
  285:     }
  286:   }
  287: }
  288: 
  289: ##########################################################################
  290: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>