Annotation of embedaddon/sqlite3/src/vdbesort.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2011 July 9
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: *************************************************************************
12: ** This file contains code for the VdbeSorter object, used in concert with
13: ** a VdbeCursor to sort large numbers of keys (as may be required, for
14: ** example, by CREATE INDEX statements on tables too large to fit in main
15: ** memory).
16: */
17:
18: #include "sqliteInt.h"
19: #include "vdbeInt.h"
20:
21: #ifndef SQLITE_OMIT_MERGE_SORT
22:
23: typedef struct VdbeSorterIter VdbeSorterIter;
24: typedef struct SorterRecord SorterRecord;
25:
26: /*
27: ** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
28: **
29: ** As keys are added to the sorter, they are written to disk in a series
30: ** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
31: ** the same as the cache-size allowed for temporary databases. In order
32: ** to allow the caller to extract keys from the sorter in sorted order,
33: ** all PMAs currently stored on disk must be merged together. This comment
34: ** describes the data structure used to do so. The structure supports
35: ** merging any number of arrays in a single pass with no redundant comparison
36: ** operations.
37: **
38: ** The aIter[] array contains an iterator for each of the PMAs being merged.
39: ** An aIter[] iterator either points to a valid key or else is at EOF. For
40: ** the purposes of the paragraphs below, we assume that the array is actually
41: ** N elements in size, where N is the smallest power of 2 greater to or equal
42: ** to the number of iterators being merged. The extra aIter[] elements are
43: ** treated as if they are empty (always at EOF).
44: **
45: ** The aTree[] array is also N elements in size. The value of N is stored in
46: ** the VdbeSorter.nTree variable.
47: **
48: ** The final (N/2) elements of aTree[] contain the results of comparing
49: ** pairs of iterator keys together. Element i contains the result of
50: ** comparing aIter[2*i-N] and aIter[2*i-N+1]. Whichever key is smaller, the
51: ** aTree element is set to the index of it.
52: **
53: ** For the purposes of this comparison, EOF is considered greater than any
54: ** other key value. If the keys are equal (only possible with two EOF
55: ** values), it doesn't matter which index is stored.
56: **
57: ** The (N/4) elements of aTree[] that preceed the final (N/2) described
58: ** above contains the index of the smallest of each block of 4 iterators.
59: ** And so on. So that aTree[1] contains the index of the iterator that
60: ** currently points to the smallest key value. aTree[0] is unused.
61: **
62: ** Example:
63: **
64: ** aIter[0] -> Banana
65: ** aIter[1] -> Feijoa
66: ** aIter[2] -> Elderberry
67: ** aIter[3] -> Currant
68: ** aIter[4] -> Grapefruit
69: ** aIter[5] -> Apple
70: ** aIter[6] -> Durian
71: ** aIter[7] -> EOF
72: **
73: ** aTree[] = { X, 5 0, 5 0, 3, 5, 6 }
74: **
75: ** The current element is "Apple" (the value of the key indicated by
76: ** iterator 5). When the Next() operation is invoked, iterator 5 will
77: ** be advanced to the next key in its segment. Say the next key is
78: ** "Eggplant":
79: **
80: ** aIter[5] -> Eggplant
81: **
82: ** The contents of aTree[] are updated first by comparing the new iterator
83: ** 5 key to the current key of iterator 4 (still "Grapefruit"). The iterator
84: ** 5 value is still smaller, so aTree[6] is set to 5. And so on up the tree.
85: ** The value of iterator 6 - "Durian" - is now smaller than that of iterator
86: ** 5, so aTree[3] is set to 6. Key 0 is smaller than key 6 (Banana<Durian),
87: ** so the value written into element 1 of the array is 0. As follows:
88: **
89: ** aTree[] = { X, 0 0, 6 0, 3, 5, 6 }
90: **
91: ** In other words, each time we advance to the next sorter element, log2(N)
92: ** key comparison operations are required, where N is the number of segments
93: ** being merged (rounded up to the next power of 2).
94: */
95: struct VdbeSorter {
96: int nInMemory; /* Current size of pRecord list as PMA */
97: int nTree; /* Used size of aTree/aIter (power of 2) */
98: VdbeSorterIter *aIter; /* Array of iterators to merge */
99: int *aTree; /* Current state of incremental merge */
100: i64 iWriteOff; /* Current write offset within file pTemp1 */
101: i64 iReadOff; /* Current read offset within file pTemp1 */
102: sqlite3_file *pTemp1; /* PMA file 1 */
103: int nPMA; /* Number of PMAs stored in pTemp1 */
104: SorterRecord *pRecord; /* Head of in-memory record list */
105: int mnPmaSize; /* Minimum PMA size, in bytes */
106: int mxPmaSize; /* Maximum PMA size, in bytes. 0==no limit */
107: UnpackedRecord *pUnpacked; /* Used to unpack keys */
108: };
109:
110: /*
111: ** The following type is an iterator for a PMA. It caches the current key in
112: ** variables nKey/aKey. If the iterator is at EOF, pFile==0.
113: */
114: struct VdbeSorterIter {
115: i64 iReadOff; /* Current read offset */
116: i64 iEof; /* 1 byte past EOF for this iterator */
117: sqlite3_file *pFile; /* File iterator is reading from */
118: int nAlloc; /* Bytes of space at aAlloc */
119: u8 *aAlloc; /* Allocated space */
120: int nKey; /* Number of bytes in key */
121: u8 *aKey; /* Pointer to current key */
122: };
123:
124: /*
125: ** A structure to store a single record. All in-memory records are connected
126: ** together into a linked list headed at VdbeSorter.pRecord using the
127: ** SorterRecord.pNext pointer.
128: */
129: struct SorterRecord {
130: void *pVal;
131: int nVal;
132: SorterRecord *pNext;
133: };
134:
135: /* Minimum allowable value for the VdbeSorter.nWorking variable */
136: #define SORTER_MIN_WORKING 10
137:
138: /* Maximum number of segments to merge in a single pass. */
139: #define SORTER_MAX_MERGE_COUNT 16
140:
141: /*
142: ** Free all memory belonging to the VdbeSorterIter object passed as the second
143: ** argument. All structure fields are set to zero before returning.
144: */
145: static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
146: sqlite3DbFree(db, pIter->aAlloc);
147: memset(pIter, 0, sizeof(VdbeSorterIter));
148: }
149:
150: /*
151: ** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
152: ** no error occurs, or an SQLite error code if one does.
153: */
154: static int vdbeSorterIterNext(
155: sqlite3 *db, /* Database handle (for sqlite3DbMalloc() ) */
156: VdbeSorterIter *pIter /* Iterator to advance */
157: ){
158: int rc; /* Return Code */
159: int nRead; /* Number of bytes read */
160: int nRec = 0; /* Size of record in bytes */
161: int iOff = 0; /* Size of serialized size varint in bytes */
162:
163: assert( pIter->iEof>=pIter->iReadOff );
164: if( pIter->iEof-pIter->iReadOff>5 ){
165: nRead = 5;
166: }else{
167: nRead = (int)(pIter->iEof - pIter->iReadOff);
168: }
169: if( nRead<=0 ){
170: /* This is an EOF condition */
171: vdbeSorterIterZero(db, pIter);
172: return SQLITE_OK;
173: }
174:
175: rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
176: if( rc==SQLITE_OK ){
177: iOff = getVarint32(pIter->aAlloc, nRec);
178: if( (iOff+nRec)>nRead ){
179: int nRead2; /* Number of extra bytes to read */
180: if( (iOff+nRec)>pIter->nAlloc ){
181: int nNew = pIter->nAlloc*2;
182: while( (iOff+nRec)>nNew ) nNew = nNew*2;
183: pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
184: if( !pIter->aAlloc ) return SQLITE_NOMEM;
185: pIter->nAlloc = nNew;
186: }
187:
188: nRead2 = iOff + nRec - nRead;
189: rc = sqlite3OsRead(
190: pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
191: );
192: }
193: }
194:
195: assert( rc!=SQLITE_OK || nRec>0 );
196: pIter->iReadOff += iOff+nRec;
197: pIter->nKey = nRec;
198: pIter->aKey = &pIter->aAlloc[iOff];
199: return rc;
200: }
201:
202: /*
203: ** Write a single varint, value iVal, to file-descriptor pFile. Return
204: ** SQLITE_OK if successful, or an SQLite error code if some error occurs.
205: **
206: ** The value of *piOffset when this function is called is used as the byte
207: ** offset in file pFile to write to. Before returning, *piOffset is
208: ** incremented by the number of bytes written.
209: */
210: static int vdbeSorterWriteVarint(
211: sqlite3_file *pFile, /* File to write to */
212: i64 iVal, /* Value to write as a varint */
213: i64 *piOffset /* IN/OUT: Write offset in file pFile */
214: ){
215: u8 aVarint[9]; /* Buffer large enough for a varint */
216: int nVarint; /* Number of used bytes in varint */
217: int rc; /* Result of write() call */
218:
219: nVarint = sqlite3PutVarint(aVarint, iVal);
220: rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
221: *piOffset += nVarint;
222:
223: return rc;
224: }
225:
226: /*
227: ** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
228: ** successful, or an SQLite error code if some error occurs.
229: **
230: ** The value of *piOffset when this function is called is used as the
231: ** byte offset in file pFile from whence to read the varint. If successful
232: ** (i.e. if no IO error occurs), then *piOffset is set to the offset of
233: ** the first byte past the end of the varint before returning. *piVal is
234: ** set to the integer value read. If an error occurs, the final values of
235: ** both *piOffset and *piVal are undefined.
236: */
237: static int vdbeSorterReadVarint(
238: sqlite3_file *pFile, /* File to read from */
239: i64 *piOffset, /* IN/OUT: Read offset in pFile */
240: i64 *piVal /* OUT: Value read from file */
241: ){
242: u8 aVarint[9]; /* Buffer large enough for a varint */
243: i64 iOff = *piOffset; /* Offset in file to read from */
244: int rc; /* Return code */
245:
246: rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
247: if( rc==SQLITE_OK ){
248: *piOffset += getVarint(aVarint, (u64 *)piVal);
249: }
250:
251: return rc;
252: }
253:
254: /*
255: ** Initialize iterator pIter to scan through the PMA stored in file pFile
256: ** starting at offset iStart and ending at offset iEof-1. This function
257: ** leaves the iterator pointing to the first key in the PMA (or EOF if the
258: ** PMA is empty).
259: */
260: static int vdbeSorterIterInit(
261: sqlite3 *db, /* Database handle */
262: VdbeSorter *pSorter, /* Sorter object */
263: i64 iStart, /* Start offset in pFile */
264: VdbeSorterIter *pIter, /* Iterator to populate */
265: i64 *pnByte /* IN/OUT: Increment this value by PMA size */
266: ){
267: int rc;
268:
269: assert( pSorter->iWriteOff>iStart );
270: assert( pIter->aAlloc==0 );
271: pIter->pFile = pSorter->pTemp1;
272: pIter->iReadOff = iStart;
273: pIter->nAlloc = 128;
274: pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
275: if( !pIter->aAlloc ){
276: rc = SQLITE_NOMEM;
277: }else{
278: i64 nByte; /* Total size of PMA in bytes */
279: rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);
280: *pnByte += nByte;
281: pIter->iEof = pIter->iReadOff + nByte;
282: }
283: if( rc==SQLITE_OK ){
284: rc = vdbeSorterIterNext(db, pIter);
285: }
286: return rc;
287: }
288:
289:
290: /*
291: ** Compare key1 (buffer pKey1, size nKey1 bytes) with key2 (buffer pKey2,
292: ** size nKey2 bytes). Argument pKeyInfo supplies the collation functions
293: ** used by the comparison. If an error occurs, return an SQLite error code.
294: ** Otherwise, return SQLITE_OK and set *pRes to a negative, zero or positive
295: ** value, depending on whether key1 is smaller, equal to or larger than key2.
296: **
297: ** If the bOmitRowid argument is non-zero, assume both keys end in a rowid
298: ** field. For the purposes of the comparison, ignore it. Also, if bOmitRowid
299: ** is true and key1 contains even a single NULL value, it is considered to
300: ** be less than key2. Even if key2 also contains NULL values.
301: **
302: ** If pKey2 is passed a NULL pointer, then it is assumed that the pCsr->aSpace
303: ** has been allocated and contains an unpacked record that is used as key2.
304: */
305: static void vdbeSorterCompare(
306: VdbeCursor *pCsr, /* Cursor object (for pKeyInfo) */
307: int bOmitRowid, /* Ignore rowid field at end of keys */
308: void *pKey1, int nKey1, /* Left side of comparison */
309: void *pKey2, int nKey2, /* Right side of comparison */
310: int *pRes /* OUT: Result of comparison */
311: ){
312: KeyInfo *pKeyInfo = pCsr->pKeyInfo;
313: VdbeSorter *pSorter = pCsr->pSorter;
314: UnpackedRecord *r2 = pSorter->pUnpacked;
315: int i;
316:
317: if( pKey2 ){
318: sqlite3VdbeRecordUnpack(pKeyInfo, nKey2, pKey2, r2);
319: }
320:
321: if( bOmitRowid ){
322: r2->nField = pKeyInfo->nField;
323: assert( r2->nField>0 );
324: for(i=0; i<r2->nField; i++){
325: if( r2->aMem[i].flags & MEM_Null ){
326: *pRes = -1;
327: return;
328: }
329: }
330: r2->flags |= UNPACKED_PREFIX_MATCH;
331: }
332:
333: *pRes = sqlite3VdbeRecordCompare(nKey1, pKey1, r2);
334: }
335:
336: /*
337: ** This function is called to compare two iterator keys when merging
338: ** multiple b-tree segments. Parameter iOut is the index of the aTree[]
339: ** value to recalculate.
340: */
341: static int vdbeSorterDoCompare(VdbeCursor *pCsr, int iOut){
342: VdbeSorter *pSorter = pCsr->pSorter;
343: int i1;
344: int i2;
345: int iRes;
346: VdbeSorterIter *p1;
347: VdbeSorterIter *p2;
348:
349: assert( iOut<pSorter->nTree && iOut>0 );
350:
351: if( iOut>=(pSorter->nTree/2) ){
352: i1 = (iOut - pSorter->nTree/2) * 2;
353: i2 = i1 + 1;
354: }else{
355: i1 = pSorter->aTree[iOut*2];
356: i2 = pSorter->aTree[iOut*2+1];
357: }
358:
359: p1 = &pSorter->aIter[i1];
360: p2 = &pSorter->aIter[i2];
361:
362: if( p1->pFile==0 ){
363: iRes = i2;
364: }else if( p2->pFile==0 ){
365: iRes = i1;
366: }else{
367: int res;
368: assert( pCsr->pSorter->pUnpacked!=0 ); /* allocated in vdbeSorterMerge() */
369: vdbeSorterCompare(
370: pCsr, 0, p1->aKey, p1->nKey, p2->aKey, p2->nKey, &res
371: );
372: if( res<=0 ){
373: iRes = i1;
374: }else{
375: iRes = i2;
376: }
377: }
378:
379: pSorter->aTree[iOut] = iRes;
380: return SQLITE_OK;
381: }
382:
383: /*
384: ** Initialize the temporary index cursor just opened as a sorter cursor.
385: */
386: int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
387: int pgsz; /* Page size of main database */
388: int mxCache; /* Cache size */
389: VdbeSorter *pSorter; /* The new sorter */
390: char *d; /* Dummy */
391:
392: assert( pCsr->pKeyInfo && pCsr->pBt==0 );
393: pCsr->pSorter = pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
394: if( pSorter==0 ){
395: return SQLITE_NOMEM;
396: }
397:
398: pSorter->pUnpacked = sqlite3VdbeAllocUnpackedRecord(pCsr->pKeyInfo, 0, 0, &d);
399: if( pSorter->pUnpacked==0 ) return SQLITE_NOMEM;
400: assert( pSorter->pUnpacked==(UnpackedRecord *)d );
401:
402: if( !sqlite3TempInMemory(db) ){
403: pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
404: pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
405: mxCache = db->aDb[0].pSchema->cache_size;
406: if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
407: pSorter->mxPmaSize = mxCache * pgsz;
408: }
409:
410: return SQLITE_OK;
411: }
412:
413: /*
414: ** Free the list of sorted records starting at pRecord.
415: */
416: static void vdbeSorterRecordFree(sqlite3 *db, SorterRecord *pRecord){
417: SorterRecord *p;
418: SorterRecord *pNext;
419: for(p=pRecord; p; p=pNext){
420: pNext = p->pNext;
421: sqlite3DbFree(db, p);
422: }
423: }
424:
425: /*
426: ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
427: */
428: void sqlite3VdbeSorterClose(sqlite3 *db, VdbeCursor *pCsr){
429: VdbeSorter *pSorter = pCsr->pSorter;
430: if( pSorter ){
431: if( pSorter->aIter ){
432: int i;
433: for(i=0; i<pSorter->nTree; i++){
434: vdbeSorterIterZero(db, &pSorter->aIter[i]);
435: }
436: sqlite3DbFree(db, pSorter->aIter);
437: }
438: if( pSorter->pTemp1 ){
439: sqlite3OsCloseFree(pSorter->pTemp1);
440: }
441: vdbeSorterRecordFree(db, pSorter->pRecord);
442: sqlite3DbFree(db, pSorter->pUnpacked);
443: sqlite3DbFree(db, pSorter);
444: pCsr->pSorter = 0;
445: }
446: }
447:
448: /*
449: ** Allocate space for a file-handle and open a temporary file. If successful,
450: ** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
451: ** Otherwise, set *ppFile to 0 and return an SQLite error code.
452: */
453: static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
454: int dummy;
455: return sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
456: SQLITE_OPEN_TEMP_JOURNAL |
457: SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
458: SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE, &dummy
459: );
460: }
461:
462: /*
463: ** Merge the two sorted lists p1 and p2 into a single list.
464: ** Set *ppOut to the head of the new list.
465: */
466: static void vdbeSorterMerge(
467: VdbeCursor *pCsr, /* For pKeyInfo */
468: SorterRecord *p1, /* First list to merge */
469: SorterRecord *p2, /* Second list to merge */
470: SorterRecord **ppOut /* OUT: Head of merged list */
471: ){
472: SorterRecord *pFinal = 0;
473: SorterRecord **pp = &pFinal;
474: void *pVal2 = p2 ? p2->pVal : 0;
475:
476: while( p1 && p2 ){
477: int res;
478: vdbeSorterCompare(pCsr, 0, p1->pVal, p1->nVal, pVal2, p2->nVal, &res);
479: if( res<=0 ){
480: *pp = p1;
481: pp = &p1->pNext;
482: p1 = p1->pNext;
483: pVal2 = 0;
484: }else{
485: *pp = p2;
486: pp = &p2->pNext;
487: p2 = p2->pNext;
488: if( p2==0 ) break;
489: pVal2 = p2->pVal;
490: }
491: }
492: *pp = p1 ? p1 : p2;
493: *ppOut = pFinal;
494: }
495:
496: /*
497: ** Sort the linked list of records headed at pCsr->pRecord. Return SQLITE_OK
498: ** if successful, or an SQLite error code (i.e. SQLITE_NOMEM) if an error
499: ** occurs.
500: */
501: static int vdbeSorterSort(VdbeCursor *pCsr){
502: int i;
503: SorterRecord **aSlot;
504: SorterRecord *p;
505: VdbeSorter *pSorter = pCsr->pSorter;
506:
507: aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *));
508: if( !aSlot ){
509: return SQLITE_NOMEM;
510: }
511:
512: p = pSorter->pRecord;
513: while( p ){
514: SorterRecord *pNext = p->pNext;
515: p->pNext = 0;
516: for(i=0; aSlot[i]; i++){
517: vdbeSorterMerge(pCsr, p, aSlot[i], &p);
518: aSlot[i] = 0;
519: }
520: aSlot[i] = p;
521: p = pNext;
522: }
523:
524: p = 0;
525: for(i=0; i<64; i++){
526: vdbeSorterMerge(pCsr, p, aSlot[i], &p);
527: }
528: pSorter->pRecord = p;
529:
530: sqlite3_free(aSlot);
531: return SQLITE_OK;
532: }
533:
534:
535: /*
536: ** Write the current contents of the in-memory linked-list to a PMA. Return
537: ** SQLITE_OK if successful, or an SQLite error code otherwise.
538: **
539: ** The format of a PMA is:
540: **
541: ** * A varint. This varint contains the total number of bytes of content
542: ** in the PMA (not including the varint itself).
543: **
544: ** * One or more records packed end-to-end in order of ascending keys.
545: ** Each record consists of a varint followed by a blob of data (the
546: ** key). The varint is the number of bytes in the blob of data.
547: */
548: static int vdbeSorterListToPMA(sqlite3 *db, VdbeCursor *pCsr){
549: int rc = SQLITE_OK; /* Return code */
550: VdbeSorter *pSorter = pCsr->pSorter;
551:
552: if( pSorter->nInMemory==0 ){
553: assert( pSorter->pRecord==0 );
554: return rc;
555: }
556:
557: rc = vdbeSorterSort(pCsr);
558:
559: /* If the first temporary PMA file has not been opened, open it now. */
560: if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
561: rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
562: assert( rc!=SQLITE_OK || pSorter->pTemp1 );
563: assert( pSorter->iWriteOff==0 );
564: assert( pSorter->nPMA==0 );
565: }
566:
567: if( rc==SQLITE_OK ){
568: i64 iOff = pSorter->iWriteOff;
569: SorterRecord *p;
570: SorterRecord *pNext = 0;
571: static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
572:
573: pSorter->nPMA++;
574: rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
575: for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
576: pNext = p->pNext;
577: rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);
578:
579: if( rc==SQLITE_OK ){
580: rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
581: iOff += p->nVal;
582: }
583:
584: sqlite3DbFree(db, p);
585: }
586:
587: /* This assert verifies that unless an error has occurred, the size of
588: ** the PMA on disk is the same as the expected size stored in
589: ** pSorter->nInMemory. */
590: assert( rc!=SQLITE_OK || pSorter->nInMemory==(
591: iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
592: ));
593:
594: pSorter->iWriteOff = iOff;
595: if( rc==SQLITE_OK ){
596: /* Terminate each file with 8 extra bytes so that from any offset
597: ** in the file we can always read 9 bytes without a SHORT_READ error */
598: rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
599: }
600: pSorter->pRecord = p;
601: }
602:
603: return rc;
604: }
605:
606: /*
607: ** Add a record to the sorter.
608: */
609: int sqlite3VdbeSorterWrite(
610: sqlite3 *db, /* Database handle */
611: VdbeCursor *pCsr, /* Sorter cursor */
612: Mem *pVal /* Memory cell containing record */
613: ){
614: VdbeSorter *pSorter = pCsr->pSorter;
615: int rc = SQLITE_OK; /* Return Code */
616: SorterRecord *pNew; /* New list element */
617:
618: assert( pSorter );
619: pSorter->nInMemory += sqlite3VarintLen(pVal->n) + pVal->n;
620:
621: pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n + sizeof(SorterRecord));
622: if( pNew==0 ){
623: rc = SQLITE_NOMEM;
624: }else{
625: pNew->pVal = (void *)&pNew[1];
626: memcpy(pNew->pVal, pVal->z, pVal->n);
627: pNew->nVal = pVal->n;
628: pNew->pNext = pSorter->pRecord;
629: pSorter->pRecord = pNew;
630: }
631:
632: /* See if the contents of the sorter should now be written out. They
633: ** are written out when either of the following are true:
634: **
635: ** * The total memory allocated for the in-memory list is greater
636: ** than (page-size * cache-size), or
637: **
638: ** * The total memory allocated for the in-memory list is greater
639: ** than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
640: */
641: if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
642: (pSorter->nInMemory>pSorter->mxPmaSize)
643: || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
644: )){
645: rc = vdbeSorterListToPMA(db, pCsr);
646: pSorter->nInMemory = 0;
647: }
648:
649: return rc;
650: }
651:
652: /*
653: ** Helper function for sqlite3VdbeSorterRewind().
654: */
655: static int vdbeSorterInitMerge(
656: sqlite3 *db, /* Database handle */
657: VdbeCursor *pCsr, /* Cursor handle for this sorter */
658: i64 *pnByte /* Sum of bytes in all opened PMAs */
659: ){
660: VdbeSorter *pSorter = pCsr->pSorter;
661: int rc = SQLITE_OK; /* Return code */
662: int i; /* Used to iterator through aIter[] */
663: i64 nByte = 0; /* Total bytes in all opened PMAs */
664:
665: /* Initialize the iterators. */
666: for(i=0; i<SORTER_MAX_MERGE_COUNT; i++){
667: VdbeSorterIter *pIter = &pSorter->aIter[i];
668: rc = vdbeSorterIterInit(db, pSorter, pSorter->iReadOff, pIter, &nByte);
669: pSorter->iReadOff = pIter->iEof;
670: assert( rc!=SQLITE_OK || pSorter->iReadOff<=pSorter->iWriteOff );
671: if( rc!=SQLITE_OK || pSorter->iReadOff>=pSorter->iWriteOff ) break;
672: }
673:
674: /* Initialize the aTree[] array. */
675: for(i=pSorter->nTree-1; rc==SQLITE_OK && i>0; i--){
676: rc = vdbeSorterDoCompare(pCsr, i);
677: }
678:
679: *pnByte = nByte;
680: return rc;
681: }
682:
683: /*
684: ** Once the sorter has been populated, this function is called to prepare
685: ** for iterating through its contents in sorted order.
686: */
687: int sqlite3VdbeSorterRewind(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
688: VdbeSorter *pSorter = pCsr->pSorter;
689: int rc; /* Return code */
690: sqlite3_file *pTemp2 = 0; /* Second temp file to use */
691: i64 iWrite2 = 0; /* Write offset for pTemp2 */
692: int nIter; /* Number of iterators used */
693: int nByte; /* Bytes of space required for aIter/aTree */
694: int N = 2; /* Power of 2 >= nIter */
695:
696: assert( pSorter );
697:
698: /* If no data has been written to disk, then do not do so now. Instead,
699: ** sort the VdbeSorter.pRecord list. The vdbe layer will read data directly
700: ** from the in-memory list. */
701: if( pSorter->nPMA==0 ){
702: *pbEof = !pSorter->pRecord;
703: assert( pSorter->aTree==0 );
704: return vdbeSorterSort(pCsr);
705: }
706:
707: /* Write the current b-tree to a PMA. Close the b-tree cursor. */
708: rc = vdbeSorterListToPMA(db, pCsr);
709: if( rc!=SQLITE_OK ) return rc;
710:
711: /* Allocate space for aIter[] and aTree[]. */
712: nIter = pSorter->nPMA;
713: if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
714: assert( nIter>0 );
715: while( N<nIter ) N += N;
716: nByte = N * (sizeof(int) + sizeof(VdbeSorterIter));
717: pSorter->aIter = (VdbeSorterIter *)sqlite3DbMallocZero(db, nByte);
718: if( !pSorter->aIter ) return SQLITE_NOMEM;
719: pSorter->aTree = (int *)&pSorter->aIter[N];
720: pSorter->nTree = N;
721:
722: do {
723: int iNew; /* Index of new, merged, PMA */
724:
725: for(iNew=0;
726: rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA;
727: iNew++
728: ){
729: i64 nWrite; /* Number of bytes in new PMA */
730:
731: /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
732: ** initialize an iterator for each of them and break out of the loop.
733: ** These iterators will be incrementally merged as the VDBE layer calls
734: ** sqlite3VdbeSorterNext().
735: **
736: ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
737: ** initialize interators for SORTER_MAX_MERGE_COUNT of them. These PMAs
738: ** are merged into a single PMA that is written to file pTemp2.
739: */
740: rc = vdbeSorterInitMerge(db, pCsr, &nWrite);
741: assert( rc!=SQLITE_OK || pSorter->aIter[ pSorter->aTree[1] ].pFile );
742: if( rc!=SQLITE_OK || pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
743: break;
744: }
745:
746: /* Open the second temp file, if it is not already open. */
747: if( pTemp2==0 ){
748: assert( iWrite2==0 );
749: rc = vdbeSorterOpenTempFile(db, &pTemp2);
750: }
751:
752: if( rc==SQLITE_OK ){
753: rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
754: }
755:
756: if( rc==SQLITE_OK ){
757: int bEof = 0;
758: while( rc==SQLITE_OK && bEof==0 ){
759: int nToWrite;
760: VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
761: assert( pIter->pFile );
762: nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
763: rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);
764: iWrite2 += nToWrite;
765: if( rc==SQLITE_OK ){
766: rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
767: }
768: }
769: }
770: }
771:
772: if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
773: break;
774: }else{
775: sqlite3_file *pTmp = pSorter->pTemp1;
776: pSorter->nPMA = iNew;
777: pSorter->pTemp1 = pTemp2;
778: pTemp2 = pTmp;
779: pSorter->iWriteOff = iWrite2;
780: pSorter->iReadOff = 0;
781: iWrite2 = 0;
782: }
783: }while( rc==SQLITE_OK );
784:
785: if( pTemp2 ){
786: sqlite3OsCloseFree(pTemp2);
787: }
788: *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
789: return rc;
790: }
791:
792: /*
793: ** Advance to the next element in the sorter.
794: */
795: int sqlite3VdbeSorterNext(sqlite3 *db, VdbeCursor *pCsr, int *pbEof){
796: VdbeSorter *pSorter = pCsr->pSorter;
797: int rc; /* Return code */
798:
799: if( pSorter->aTree ){
800: int iPrev = pSorter->aTree[1];/* Index of iterator to advance */
801: int i; /* Index of aTree[] to recalculate */
802:
803: rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
804: for(i=(pSorter->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
805: rc = vdbeSorterDoCompare(pCsr, i);
806: }
807:
808: *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
809: }else{
810: SorterRecord *pFree = pSorter->pRecord;
811: pSorter->pRecord = pFree->pNext;
812: pFree->pNext = 0;
813: vdbeSorterRecordFree(db, pFree);
814: *pbEof = !pSorter->pRecord;
815: rc = SQLITE_OK;
816: }
817: return rc;
818: }
819:
820: /*
821: ** Return a pointer to a buffer owned by the sorter that contains the
822: ** current key.
823: */
824: static void *vdbeSorterRowkey(
825: VdbeSorter *pSorter, /* Sorter object */
826: int *pnKey /* OUT: Size of current key in bytes */
827: ){
828: void *pKey;
829: if( pSorter->aTree ){
830: VdbeSorterIter *pIter;
831: pIter = &pSorter->aIter[ pSorter->aTree[1] ];
832: *pnKey = pIter->nKey;
833: pKey = pIter->aKey;
834: }else{
835: *pnKey = pSorter->pRecord->nVal;
836: pKey = pSorter->pRecord->pVal;
837: }
838: return pKey;
839: }
840:
841: /*
842: ** Copy the current sorter key into the memory cell pOut.
843: */
844: int sqlite3VdbeSorterRowkey(VdbeCursor *pCsr, Mem *pOut){
845: VdbeSorter *pSorter = pCsr->pSorter;
846: void *pKey; int nKey; /* Sorter key to copy into pOut */
847:
848: pKey = vdbeSorterRowkey(pSorter, &nKey);
849: if( sqlite3VdbeMemGrow(pOut, nKey, 0) ){
850: return SQLITE_NOMEM;
851: }
852: pOut->n = nKey;
853: MemSetTypeFlag(pOut, MEM_Blob);
854: memcpy(pOut->z, pKey, nKey);
855:
856: return SQLITE_OK;
857: }
858:
859: /*
860: ** Compare the key in memory cell pVal with the key that the sorter cursor
861: ** passed as the first argument currently points to. For the purposes of
862: ** the comparison, ignore the rowid field at the end of each record.
863: **
864: ** If an error occurs, return an SQLite error code (i.e. SQLITE_NOMEM).
865: ** Otherwise, set *pRes to a negative, zero or positive value if the
866: ** key in pVal is smaller than, equal to or larger than the current sorter
867: ** key.
868: */
869: int sqlite3VdbeSorterCompare(
870: VdbeCursor *pCsr, /* Sorter cursor */
871: Mem *pVal, /* Value to compare to current sorter key */
872: int *pRes /* OUT: Result of comparison */
873: ){
874: VdbeSorter *pSorter = pCsr->pSorter;
875: void *pKey; int nKey; /* Sorter key to compare pVal with */
876:
877: pKey = vdbeSorterRowkey(pSorter, &nKey);
878: vdbeSorterCompare(pCsr, 1, pVal->z, pVal->n, pKey, nKey, pRes);
879: return SQLITE_OK;
880: }
881:
882: #endif /* #ifndef SQLITE_OMIT_MERGE_SORT */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>