1: # 2009 November 04
2: #
3: # The author disclaims copyright to this source code. In place of
4: # a legal notice, here is a blessing:
5: #
6: # May you do good and not evil.
7: # May you find forgiveness for yourself and forgive others.
8: # May you share freely, never taking more than you give.
9: #
10: #***********************************************************************
11: #
12: # This file contains common code used the fts3 tests. At one point
13: # equivalent functionality was implemented in C code. But it is easier
14: # to use Tcl.
15: #
16:
17: #-------------------------------------------------------------------------
18: # USAGE: fts3_integrity_check TBL
19: #
20: # This proc is used to verify that the full-text index is consistent with
21: # the contents of the fts3 table. In other words, it checks that the
22: # data in the %_contents table matches that in the %_segdir and %_segments
23: # tables.
24: #
25: # This is not an efficient procedure. It uses a lot of memory and a lot
26: # of CPU. But it is better than not checking at all.
27: #
28: # The procedure is:
29: #
30: # 1) Read the entire full-text index from the %_segdir and %_segments
31: # tables into memory. For each entry in the index, the following is
32: # done:
33: #
34: # set C($iDocid,$iCol,$iPosition) $zTerm
35: #
36: # 2) Iterate through each column of each row of the %_content table.
37: # Tokenize all documents, and check that for each token there is
38: # a corresponding entry in the $C array. After checking a token,
39: # [unset] the $C array entry.
40: #
41: # 3) Check that array $C is now empty.
42: #
43: #
44: proc fts3_integrity_check {tbl} {
45:
46: fts3_read2 $tbl 1 A
47:
48: foreach zTerm [array names A] {
49: foreach doclist $A($zTerm) {
50: set docid 0
51: while {[string length $doclist]>0} {
52: set iCol 0
53: set iPos 0
54: set lPos [list]
55: set lCol [list]
56:
57: # First varint of a doclist-entry is the docid. Delta-compressed
58: # with respect to the docid of the previous entry.
59: #
60: incr docid [gobble_varint doclist]
61: if {[info exists D($zTerm,$docid)]} {
62: while {[set iDelta [gobble_varint doclist]] != 0} {}
63: continue
64: }
65: set D($zTerm,$docid) 1
66:
67: # Gobble varints until the 0x00 that terminates the doclist-entry
68: # is found.
69: while {[set iDelta [gobble_varint doclist]] > 0} {
70: if {$iDelta == 1} {
71: set iCol [gobble_varint doclist]
72: set iPos 0
73: } else {
74: incr iPos $iDelta
75: incr iPos -2
76: set C($docid,$iCol,$iPos) $zTerm
77: }
78: }
79: }
80: }
81: }
82:
83: foreach key [array names C] {
84: #puts "$key -> $C($key)"
85: }
86:
87:
88: db eval "SELECT * FROM ${tbl}_content" E {
89: set iCol 0
90: set iDoc $E(docid)
91: foreach col [lrange $E(*) 1 end] {
92: set c $E($col)
93: set sql {SELECT fts3_tokenizer_test('simple', $c)}
94:
95: foreach {pos term dummy} [db one $sql] {
96: if {![info exists C($iDoc,$iCol,$pos)]} {
97: set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
98: lappend errors $es
99: } else {
100: if {$C($iDoc,$iCol,$pos) != "$term"} {
101: set es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
102: append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
103: lappend errors $es
104: }
105: unset C($iDoc,$iCol,$pos)
106: }
107: }
108: incr iCol
109: }
110: }
111:
112: foreach c [array names C] {
113: lappend errors "Bad index entry: $c -> $C($c)"
114: }
115:
116: if {[info exists errors]} { return [join $errors "\n"] }
117: return "ok"
118: }
119:
120: # USAGE: fts3_terms TBL WHERE
121: #
122: # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
123: # SQL expression that will be used as the WHERE clause when scanning
124: # the %_segdir table. As in the following query:
125: #
126: # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
127: #
128: # This function returns a list of all terms present in the segments
129: # selected by the statement above.
130: #
131: proc fts3_terms {tbl where} {
132: fts3_read $tbl $where a
133: return [lsort [array names a]]
134: }
135:
136:
137: # USAGE: fts3_doclist TBL TERM WHERE
138: #
139: # Argument TBL must be the name of an FTS3 table. TERM is a term that may
140: # or may not be present in the table. Argument WHERE is used to select a
141: # subset of the b-tree segments in the associated full-text index as
142: # described above for [fts3_terms].
143: #
144: # This function returns the results of merging the doclists associated
145: # with TERM in the selected segments. Each doclist is an element of the
146: # returned list. Each doclist is formatted as follows:
147: #
148: # [$docid ?$col[$off1 $off2...]?...]
149: #
150: # The formatting is odd for a Tcl command in order to be compatible with
151: # the original C-language implementation. If argument WHERE is "1", then
152: # any empty doclists are omitted from the returned list.
153: #
154: proc fts3_doclist {tbl term where} {
155: fts3_read $tbl $where a
156:
157:
158: foreach doclist $a($term) {
159: set docid 0
160:
161: while {[string length $doclist]>0} {
162: set iCol 0
163: set iPos 0
164: set lPos [list]
165: set lCol [list]
166: incr docid [gobble_varint doclist]
167:
168: while {[set iDelta [gobble_varint doclist]] > 0} {
169: if {$iDelta == 1} {
170: lappend lCol [list $iCol $lPos]
171: set iPos 0
172: set lPos [list]
173: set iCol [gobble_varint doclist]
174: } else {
175: incr iPos $iDelta
176: incr iPos -2
177: lappend lPos $iPos
178: }
179: }
180:
181: if {[llength $lPos]>0} {
182: lappend lCol [list $iCol $lPos]
183: }
184:
185: if {$where != "1" || [llength $lCol]>0} {
186: set ret($docid) $lCol
187: } else {
188: unset -nocomplain ret($docid)
189: }
190: }
191: }
192:
193: set lDoc [list]
194: foreach docid [lsort -integer [array names ret]] {
195: set lCol [list]
196: set cols ""
197: foreach col $ret($docid) {
198: foreach {iCol lPos} $col {}
199: append cols " $iCol\[[join $lPos { }]\]"
200: }
201: lappend lDoc "\[${docid}${cols}\]"
202: }
203:
204: join $lDoc " "
205: }
206:
207: ###########################################################################
208:
209: proc gobble_varint {varname} {
210: upvar $varname blob
211: set n [read_fts3varint $blob ret]
212: set blob [string range $blob $n end]
213: return $ret
214: }
215: proc gobble_string {varname nLength} {
216: upvar $varname blob
217: set ret [string range $blob 0 [expr $nLength-1]]
218: set blob [string range $blob $nLength end]
219: return $ret
220: }
221:
222: # The argument is a blob of data representing an FTS3 segment leaf.
223: # Return a list consisting of alternating terms (strings) and doclists
224: # (blobs of data).
225: #
226: proc fts3_readleaf {blob} {
227: set zPrev ""
228: set terms [list]
229:
230: while {[string length $blob] > 0} {
231: set nPrefix [gobble_varint blob]
232: set nSuffix [gobble_varint blob]
233:
234: set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
235: append zTerm [gobble_string blob $nSuffix]
236: set doclist [gobble_string blob [gobble_varint blob]]
237:
238: lappend terms $zTerm $doclist
239: set zPrev $zTerm
240: }
241:
242: return $terms
243: }
244:
245: proc fts3_read2 {tbl where varname} {
246: upvar $varname a
247: array unset a
248: db eval " SELECT start_block, leaves_end_block, root
249: FROM ${tbl}_segdir WHERE $where
250: ORDER BY level ASC, idx DESC
251: " {
252: if {$start_block == 0} {
253: foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
254: } else {
255: db eval " SELECT block
256: FROM ${tbl}_segments
257: WHERE blockid>=$start_block AND blockid<=$leaves_end_block
258: ORDER BY blockid
259: " {
260: foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
261:
262: }
263: }
264: }
265: }
266:
267: proc fts3_read {tbl where varname} {
268: upvar $varname a
269: array unset a
270: db eval " SELECT start_block, leaves_end_block, root
271: FROM ${tbl}_segdir WHERE $where
272: ORDER BY level DESC, idx ASC
273: " {
274: if {$start_block == 0} {
275: foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
276: } else {
277: db eval " SELECT block
278: FROM ${tbl}_segments
279: WHERE blockid>=$start_block AND blockid<$leaves_end_block
280: ORDER BY blockid
281: " {
282: foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
283:
284: }
285: }
286: }
287: }
288:
289: ##########################################################################
290:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>