Merge latest changes from orderby-planning branch.

author dan <dan@noemail.net>

Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)

committer dan <dan@noemail.net>

Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)
author dan <dan@noemail.net>
Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)
committer dan <dan@noemail.net>
Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)
diff --cc manifest

index 9e18b497265ca1e982993f35745ba2ee25a877d9,a3dc17de19afae4d52aa49a6402fc853844b6b39..52c9a03b242f668ae4eeb5ac54f38f2590019534
--- 1/manifest
--- 2/manifest
+++ b/manifest
@@@ -1,9 -1,9 +1,9 @@@
- C Merge\sfrom\strunk\sthe\sfix\sfor\sthe\scrash\son\sa\scorrupt\sdatabase.
- D 2014-03-26T19:45:01.037
- -C Merge\sthe\slatest\schanges\sfrom\strunk.
- -D 2014-03-28T18:35:39.779
++C Merge\slatest\schanges\sfrom\sorderby-planning\sbranch.
++D 2014-03-28T19:18:16.969
   F Makefile.arm-wince-mingw32ce-gcc d6df77f1f48d690bd73162294bbba7f59507c72f
- -F Makefile.in 2ef13430cd359f7b361bb863504e227b25cc7f81
+ +F Makefile.in ad0921c4b2780d01868cf69b419a4f102308d125
   F Makefile.linux-gcc 91d710bdc4998cb015f39edf3cb314ec4f4d7e23
- -F Makefile.msc fdacba6fb574868c71fde6db6b77789a383a4c19
+ +F Makefile.msc 153eb9b9725bc7b8e4dbe963219298e0c4a644b0
   F Makefile.vxworks db21ed42a01d5740e656b16f92cb5d8d5e5dd315
   F README.md 64f270c43c38c46de749e419c22f0ae2f4499fe8
   F VERSION 9f823c026c6a32fc5f84d212a8aae0a221dba45c
@@@ -217,12 -217,12 +217,12 @@@ F src/printf.c e5a0005f8b3de21f85da6a70
   F src/random.c d10c1f85b6709ca97278428fd5db5bbb9c74eece
   F src/resolve.c 273d5f47c4e2c05b2d3d2bffeda939551ab59e66
   F src/rowset.c 64655f1a627c9c212d9ab497899e7424a34222e0
- F src/select.c 269c3e31a450fce642a10569221a49180348c88e
+ F src/select.c 20055cf917222e660c4222fea306bd13a0623caa
- -F src/shell.c cee9f46f2688a261601b1fd3d7f4b3cddf9b5cdf
+ +F src/shell.c f48b63f8e582e7998ecefd051d697f91fb1453df
   F src/sqlite.h.in a2ef671f92747a5a1c8a47bad5c585a8dd9eca80
   F src/sqlite3.rc 11094cc6a157a028b301a9f06b3d03089ea37c3e
   F src/sqlite3ext.h 886f5a34de171002ad46fae8c36a7d8051c190fc
- F src/sqliteInt.h fb667a3d602d405be6abf0fb21246aac7bb23e76
- -F src/sqliteInt.h cc9582a91b2910404ccda7b7e198815ea0f75948
++F src/sqliteInt.h 3f5190a4e07ca227035334da8d66ebe227071528
   F src/sqliteLimit.h 164b0e6749d31e0daa1a4589a169d31c0dec7b3d
   F src/status.c 7ac05a5c7017d0b9f0b4bcd701228b784f987158
   F src/table.c 2cd62736f845d82200acfa1287e33feb3c15d62e
@@@ -279,14 -278,14 +279,14 @@@ F src/update.c 5b3e74a03b3811e586b4f2b4
   F src/utf.c 6dc9ec9f1b3db43ae8ba0365377f11df1ee4c01c
   F src/util.c c46c90459ef9bdc0c6c73803cf4c55425b4771cf
   F src/vacuum.c 3728d74919d4fb1356f9e9a13e27773db60b7179
- F src/vdbe.c 74c7386e83eee56f921a17bb4a0396c9551f5bc7
- F src/vdbe.h fb2c48c198300a7c632f09fc940011d2ad2fc2ae
- F src/vdbeInt.h 2b9a6849166d0014c843ae3fd83a062be4efa325
+ F src/vdbe.c 02f2de0b2f3b198438e3e64a2ceba9407bb8348b
+ F src/vdbe.h 394464909ed682334aa3d5831aae0c2fe2abef94
+ F src/vdbeInt.h e6d83e5bfd62fc6685ba1ed6153f7099f82de9f7
   F src/vdbeapi.c 0ed6053f947edd0b30f64ce5aeb811872a3450a4
- F src/vdbeaux.c f81ef920dcf76aceaa1ce77081e9fc5d7a0993dd
+ F src/vdbeaux.c 1153175fb57a8454e1c8cf79b59b7bf92b26779d
   F src/vdbeblob.c 15377abfb59251bccedd5a9c7d014a895f0c04aa
   F src/vdbemem.c 6fc77594c60f6155404f3f8d71bf36d1fdeb4447
- F src/vdbesort.c c3e427de848b78e9e9feaa25f68fb64686bab6cd
- -F src/vdbesort.c 08d5e1ee199599d9571942f0560f84963c7a1a9b
++F src/vdbesort.c 01068b89364fa2bffeba9b929367ed04661e97f7
   F src/vdbetrace.c 6f52bc0c51e144b7efdcfb2a8f771167a8816767
   F src/vtab.c 21b932841e51ebd7d075e2d0ad1415dce8d2d5fd
   F src/wal.c 76e7fc6de229bea8b30bb2539110f03a494dc3a8
@@@ -817,7 -816,7 +817,7 @@@ F test/skipscan1.test bed8cbe9d554c8c27
   F test/skipscan2.test 5a4db0799c338ddbacb154aaa5589c0254b36a8d
   F test/soak.test 0b5b6375c9f4110c828070b826b3b4b0bb65cd5f
   F test/softheap1.test 40562fe6cac6d9827b7b42b86d45aedf12c15e24
- F test/sort.test 0e4456e729e5a92a625907c63dcdedfbe72c5dc5
- -F test/sort.test cb76a6e9db897b6871ef4dbc206ebc6dbc033bf4
++F test/sort.test 79dc647c4e9b123a64e57b7080b7f9a2df43f87a
   F test/speed1.test f2974a91d79f58507ada01864c0e323093065452
   F test/speed1p.explain d841e650a04728b39e6740296b852dccdca9b2cb
   F test/speed1p.test b180e98609c7677382cf618c0ec9b69f789033a8
@@@ -1160,7 -1159,7 +1160,7 @@@ F tool/vdbe_profile.tcl 67746953071a9f8
   F tool/warnings-clang.sh f6aa929dc20ef1f856af04a730772f59283631d4
   F tool/warnings.sh d1a6de74685f360ab718efda6265994b99bbea01
   F tool/win/sqlite.vsix 030f3eeaf2cb811a3692ab9c14d021a75ce41fff
- P 1cab83577c814feb35b4fb91af0d52a9751d99bc f585f5d7a0f9bf8c590388654a3638231eba8892
- R 60773da8b0081a052cb0d48785e94e94
- -P 81987c8ceb64f051528a6ca42673821d9ab7c0ff 27deb6e49bcc76714dbdc61b34748603155ac770
- -R 79fcad4572db3d584834fdf0de4170a6
--U drh
- Z 140bbab50f5dc3d30ae8cb405819380a
- -Z 871d90dec169e4d494b7ff32475fff97
++P 8cb2b02baa7ef9aa96319e977f0315328f944237 3047a25f1c41e83f0b4772f7c36fbfec0f12dc7e
++R 132477cf9a18d083088d044c20454723
++U dan
++Z b189ca5a1e8d41a36f80415af9011f68
diff --cc manifest.uuid

index 480b98fe060e776dde2274b19f69ba925d54e9ec,518ac58c2af0a42ed9c84347047442474c9fdab3..a6001606128ac5cbd773d130ac27ac0e4e034e87
--- 1/manifest.uuid
--- 2/manifest.uuid
+++ b/manifest.uuid
@@@ -1,1 -1,1 +1,1 @@@
- 8cb2b02baa7ef9aa96319e977f0315328f944237
- -3047a25f1c41e83f0b4772f7c36fbfec0f12dc7e
++4c7fb5423430f3b936befaa7c309f8e1968ee7d8
diff --cc src/sqliteInt.h
Simple merge
diff --cc src/vdbesort.c

index a3f7aba10ec3e3077df5140368df93be9a2d51fa,f229b1f18b8a3b0e4d480a9413c4286b1a577561..e01790d600de2a588b234d8f7ee9fe4c48ca3634
--- 1/src/vdbesort.c
--- 2/src/vdbesort.c
+++ b/src/vdbesort.c
@@@ -20,83 -20,9 +20,84 @@@
   
   
   typedef struct VdbeSorterIter VdbeSorterIter;
+ +typedef struct SorterThread SorterThread;
   typedef struct SorterRecord SorterRecord;
+ +typedef struct SorterMerger SorterMerger;
   typedef struct FileWriter FileWriter;
   
+ +
+ +/*
+ +** Maximum number of threads to use. Setting this value to 1 forces all
+ +** operations to be single-threaded.
+ +*/
+ +#ifndef SQLITE_MAX_SORTER_THREAD
+ +# define SQLITE_MAX_SORTER_THREAD 4
+ +#endif
+ +
+ +/*
+ +** Candidate values for SorterThread.eWork
+ +*/
+ +#define SORTER_THREAD_SORT   1
+ +#define SORTER_THREAD_TO_PMA 2
+ +#define SORTER_THREAD_CONS   3
+ +
+ +/*
+ +** Much of the work performed in this module to sort the list of records is 
+ +** broken down into smaller units that may be performed in parallel. In order
+ +** to perform such a unit of work, an instance of the following structure
+ +** is configured and passed to vdbeSorterThreadMain() - either directly by 
+ +** the main thread or via a background thread.
+ +**
+ +** Exactly SQLITE_MAX_SORTER_THREAD instances of this structure are allocated
+ +** as part of each VdbeSorter object. Instances are never allocated any other
+ +** way.
+ +**
+ +** When a background thread is launched to perform work, SorterThread.bDone
+ +** is set to 0 and the SorterThread.pThread variable set to point to the
+ +** thread handle. SorterThread.bDone is set to 1 (to indicate to the main
+ +** thread that joining SorterThread.pThread will not block) before the thread
+ +** exits. SorterThread.pThread and bDone are always cleared after the 
+ +** background thread has been joined.
+ +**
+ +** One object (specifically, VdbeSorter.aThread[SQLITE_MAX_SORTER_THREAD-1])
+ +** is reserved for the foreground thread.
+ +**
+ +** The nature of the work performed is determined by SorterThread.eWork,
+ +** as follows:
+ +**
+ +**   SORTER_THREAD_SORT:
+ +**     Sort the linked list of records at SorterThread.pList.
+ +**
+ +**   SORTER_THREAD_TO_PMA:
+ +**     Sort the linked list of records at SorterThread.pList, and write
+ +**     the results to a new PMA in temp file SorterThread.pTemp1. Open
+ +**     the temp file if it is not already open.
+ +**
+ +**   SORTER_THREAD_CONS:
+ +**     Merge existing PMAs until SorterThread.nConsolidate or fewer
+ +**     remain in temp file SorterThread.pTemp1.
+ +*/
+ +struct SorterThread {
+ +  SQLiteThread *pThread;          /* Thread handle, or NULL */
+ +  int bDone;                      /* Set to true by pThread when finished */
+ +
+ +  sqlite3_vfs *pVfs;              /* VFS used to open temporary files */
+ +  KeyInfo *pKeyInfo;              /* How to compare records */
+ +  UnpackedRecord *pUnpacked;      /* Space to unpack a record */
+ +  int pgsz;                       /* Main database page size */
+ +
+ +  u8 eWork;                       /* One of the SORTER_THREAD_* constants */
+ +  int nConsolidate;               /* For THREAD_CONS, max final PMAs */
+ +  SorterRecord *pList;            /* List of records for pThread to sort */
+ +  int nInMemory;                  /* Expected size of PMA based on pList */
++  u8 *aListMemory;                /* Records memory (or NULL) */
+ +
+ +  int nPMA;                       /* Number of PMAs currently in pTemp1 */
+ +  i64 iTemp1Off;                  /* Offset to write to in pTemp1 */
+ +  sqlite3_file *pTemp1;           /* File to write PMAs to, or NULL */
+ +};
+ +
+ +
   /*
   ** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
   **
@@@ -166,24 -92,22 +167,27 @@@
   ** key comparison operations are required, where N is the number of segments
   ** being merged (rounded up to the next power of 2).
   */
+ +struct SorterMerger {
+ +  int nTree;                      /* Used size of aTree/aIter (power of 2) */
+ +  int *aTree;                     /* Current state of incremental merge */
+ +  VdbeSorterIter *aIter;          /* Array of iterators to merge data from */
+ +};
+ +
+ +/*
+ +** Main sorter structure. A single instance of this is allocated for each 
+ +** sorter cursor created by the VDBE.
+ +*/
   struct VdbeSorter {
- -  i64 iWriteOff;                  /* Current write offset within file pTemp1 */
- -  i64 iReadOff;                   /* Current read offset within file pTemp1 */
     int nInMemory;                  /* Current size of pRecord list as PMA */
- -  int nTree;                      /* Used size of aTree/aIter (power of 2) */
- -  int nPMA;                       /* Number of PMAs stored in pTemp1 */
     int mnPmaSize;                  /* Minimum PMA size, in bytes */
     int mxPmaSize;                  /* Maximum PMA size, in bytes.  0==no limit */
- -  VdbeSorterIter *aIter;          /* Array of iterators to merge */
- -  int *aTree;                     /* Current state of incremental merge */
- -  sqlite3_file *pTemp1;           /* PMA file 1 */
+ +  int bUsePMA;                    /* True if one or more PMAs created */
     SorterRecord *pRecord;          /* Head of in-memory record list */
- -  UnpackedRecord *pUnpacked;      /* Used to unpack keys */
- -  u8* aMemory;                    /* Block to allocate records from */
- -  int iMemory;                    /* Offset of free space in aMemory */
- -  int nMemory;                    /* Current size of allocation at aMemory */
+ +  SorterMerger *pMerger;          /* For final merge of PMAs (by caller) */ 
++  u8 *aMemory;                    /* Block of memory to alloc records from */
++  int iMemory;                    /* Offset of first free byte in aMemory */
++  int nMemory;                    /* Size of aMemory allocation in bytes */
+ +  SorterThread aThread[SQLITE_MAX_SORTER_THREAD];
   };
   
   /*
@@@ -200,6 -124,7 +204,7 @@@ struct VdbeSorterIter 
     u8 *aKey;                       /* Pointer to current key */
     u8 *aBuffer;                    /* Current read buffer */
     int nBuffer;                    /* Size of read buffer in bytes */
- -  u8 *aMap;                       /* Pointer to mapping of pFile */
++  u8 *aMap;                       /* Pointer to mapping of entire file */
   };
   
   /*
@@@ -220,17 -145,38 +225,39 @@@ struct FileWriter 
   
   /*
   ** A structure to store a single record. All in-memory records are connected
- ** together into a linked list headed at VdbeSorter.pRecord using the 
- ** SorterRecord.pNext pointer.
+ ** together into a linked list headed at VdbeSorter.pRecord.
+ **
+ ** How the linked list is connected depends on how memory is being managed
+ ** by this module. If using a separate allocation for each in-memory record
- -** (VdbeSorter.aMemory==0), then the list is always connected using the 
++** (VdbeSorter.aMemory==0), then the list is always connected using the
+ ** SorterRecord.u.pNext pointers.
+ **
+ ** Or, if using the single large allocation method (VdbeSorter.aMemory!=0),
+ ** then while records are being accumulated the list is linked using the
+ ** SorterRecord.u.iNext offset. This is because the aMemory[] array may
+ ** be sqlite3Realloc()ed while records are being accumulated. Once the VM
+ ** has finished passing records to the sorter, or when the in-memory buffer
+ ** is full, the list is sorted. As part of the sorting process, it is
+ ** converted to use the SorterRecord.u.pNext pointers. See function
+ ** vdbeSorterSort() for details.
   */
   struct SorterRecord {
-   void *pVal;
     int nVal;
-   SorterRecord *pNext;
+   union {
+     SorterRecord *pNext;          /* Pointer to next record in list */
+     int iNext;                    /* Offset within aMemory of next record */
+   } u;
   };
   
- -/* Minimum allowable value for the VdbeSorter.nWorking variable */
+ /* Return a pointer to the buffer containing the record data for SorterRecord
+ ** object p. Should be used as if:
+ **
+ **   void *SRVAL(SorterRecord *p) { return (void*)&p[1]; }
+ */
+ #define SRVAL(p) ((void*)((SorterRecord*)(p) + 1))
+ 
+ +/* The minimum PMA size is set to this value multiplied by the database
+ +** page size in bytes.  */
   #define SORTER_MIN_WORKING 10
   
   /* Maximum number of segments to merge in a single pass. */
@@@ -240,9 -186,10 +267,10 @@@
   ** Free all memory belonging to the VdbeSorterIter object passed as the second
   ** argument. All structure fields are set to zero before returning.
   */
- -static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
- -  sqlite3DbFree(db, pIter->aAlloc);
- -  sqlite3DbFree(db, pIter->aBuffer);
+ +static void vdbeSorterIterZero(VdbeSorterIter *pIter){
+ +  sqlite3_free(pIter->aAlloc);
+ +  sqlite3_free(pIter->aBuffer);
+   if( pIter->aMap ) sqlite3OsUnfetch(pIter->pFile, 0, pIter->aMap);
     memset(pIter, 0, sizeof(VdbeSorterIter));
   }
   
@@@ -342,21 -295,25 +377,25 @@@ static int vdbeSorterIterRead
   ** Read a varint from the stream of data accessed by p. Set *pnOut to
   ** the value read.
   */
- -static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
+ +static int vdbeSorterIterVarint(VdbeSorterIter *p, u64 *pnOut){
     int iBuf;
   
-   iBuf = p->iReadOff % p->nBuffer;
-   if( iBuf && (p->nBuffer-iBuf)>=9 ){
-     p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
+   if( p->aMap ){
+     p->iReadOff += sqlite3GetVarint(&p->aMap[p->iReadOff], pnOut);
     }else{
-     u8 aVarint[16], *a;
-     int i = 0, rc;
-     do{
-       rc = vdbeSorterIterRead(p, 1, &a);
-       if( rc ) return rc;
-       aVarint[(i++)&0xf] = a[0];
-     }while( (a[0]&0x80)!=0 );
-     sqlite3GetVarint(aVarint, pnOut);
+     iBuf = p->iReadOff % p->nBuffer;
+     if( iBuf && (p->nBuffer-iBuf)>=9 ){
+       p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
+     }else{
+       u8 aVarint[16], *a;
+       int i = 0, rc;
+       do{
- -        rc = vdbeSorterIterRead(db, p, 1, &a);
++        rc = vdbeSorterIterRead(p, 1, &a);
+         if( rc ) return rc;
+         aVarint[(i++)&0xf] = a[0];
+       }while( (a[0]&0x80)!=0 );
+       sqlite3GetVarint(aVarint, pnOut);
+     }
     }
   
     return SQLITE_OK;
@@@ -399,42 -360,53 +438,51 @@@ static int vdbeSorterIterInit
     i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
   ){
     int rc = SQLITE_OK;
- -  int nBuf;
- -  void *pMap;
- -
- -  nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
+ +  int nBuf = pThread->pgsz;
++  void *pMap = 0;                 /* Mapping of temp file */
   
- -  assert( pSorter->iWriteOff>iStart );
+ +  assert( pThread->iTemp1Off>iStart );
     assert( pIter->aAlloc==0 );
     assert( pIter->aBuffer==0 );
- -  pIter->pFile = pSorter->pTemp1;
+ +  pIter->pFile = pThread->pTemp1;
     pIter->iReadOff = iStart;
     pIter->nAlloc = 128;
- -  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
+ +  pIter->aAlloc = (u8*)sqlite3Malloc(pIter->nAlloc);
-   pIter->nBuffer = nBuf;
-   pIter->aBuffer = (u8*)sqlite3Malloc(nBuf);
   
-   if( !pIter->aBuffer ){
-     rc = SQLITE_NOMEM;
- -  /* See if this PMA can be read using xFetch. */
- -  rc = sqlite3OsFetch(pIter->pFile, 0, pSorter->iWriteOff, &pMap);
- -  if( rc!=SQLITE_OK ) return rc;
- -  if( pMap ){
- -    pIter->aMap = (u8*)pMap;
--  }else{
-     int iBuf;
- -    pIter->nBuffer = nBuf;
- -    pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
++  /* Try to xFetch() a mapping of the entire temp file. If this is possible,
++  ** the PMA will be read via the mapping. Otherwise, use xRead().  */
++  rc = sqlite3OsFetch(pIter->pFile, 0, pThread->iTemp1Off, &pMap);
   
-     iBuf = iStart % nBuf;
-     if( iBuf ){
-       int nRead = nBuf - iBuf;
-       if( (iStart + nRead) > pThread->iTemp1Off ){
-         nRead = (int)(pThread->iTemp1Off - iStart);
- -    if( !pIter->aBuffer ){
- -      rc = SQLITE_NOMEM;
++  if( rc==SQLITE_OK ){
++    if( pMap ){
++      pIter->aMap = (u8*)pMap;
+     }else{
- -      int iBuf;
- -
- -      iBuf = iStart % nBuf;
- -      if( iBuf ){
- -        int nRead = nBuf - iBuf;
- -        if( (iStart + nRead) > pSorter->iWriteOff ){
- -          nRead = (int)(pSorter->iWriteOff - iStart);
++      pIter->nBuffer = nBuf;
++      pIter->aBuffer = (u8*)sqlite3Malloc(nBuf);
++      if( !pIter->aBuffer ){
++        rc = SQLITE_NOMEM;
++      }else{
++        int iBuf = iStart % nBuf;
++        if( iBuf ){
++          int nRead = nBuf - iBuf;
++          if( (iStart + nRead) > pThread->iTemp1Off ){
++            nRead = (int)(pThread->iTemp1Off - iStart);
++          }
++          rc = sqlite3OsRead(
++              pThread->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
++              );
++          assert( rc!=SQLITE_IOERR_SHORT_READ );
+         }
- -        rc = sqlite3OsRead(
- -            pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
- -        );
- -        assert( rc!=SQLITE_IOERR_SHORT_READ );
         }
-       rc = sqlite3OsRead(
-           pThread->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
-       );
-       assert( rc!=SQLITE_IOERR_SHORT_READ );
       }
+   }
   
-     if( rc==SQLITE_OK ){
-       u64 nByte;
-       pIter->iEof = pThread->iTemp1Off;
-       rc = vdbeSorterIterVarint(pIter, &nByte);
-       pIter->iEof = pIter->iReadOff + nByte;
-       *pnByte += nByte;
-     }
+   if( rc==SQLITE_OK ){
- -    u64 nByte;                       /* Size of PMA in bytes */
- -    pIter->iEof = pSorter->iWriteOff;
- -    rc = vdbeSorterIterVarint(db, pIter, &nByte);
++    u64 nByte;                    /* Size of PMA in bytes */
++    pIter->iEof = pThread->iTemp1Off;
++    rc = vdbeSorterIterVarint(pIter, &nByte);
+     pIter->iEof = pIter->iReadOff + nByte;
+     *pnByte += nByte;
     }
   
     if( rc==SQLITE_OK ){
@@@ -544,39 -514,40 +592,51 @@@ static int vdbeSorterDoCompare
   */
   int sqlite3VdbeSorterInit(sqlite3 *db, VdbeCursor *pCsr){
     int pgsz;                       /* Page size of main database */
+ +  int i;                          /* Used to iterate through aThread[] */
     int mxCache;                    /* Cache size */
     VdbeSorter *pSorter;            /* The new sorter */
- -  char *d;                        /* Dummy */
+ +  KeyInfo *pKeyInfo;              /* Copy of pCsr->pKeyInfo with db==0 */
+ +  int szKeyInfo;                  /* Size of pCsr->pKeyInfo in bytes */
++  int rc = SQLITE_OK;
   
     assert( pCsr->pKeyInfo && pCsr->pBt==0 );
- -  pCsr->pSorter = pSorter = sqlite3DbMallocZero(db, sizeof(VdbeSorter));
+ +  szKeyInfo = sizeof(KeyInfo) + (pCsr->pKeyInfo->nField-1)*sizeof(CollSeq*);
+ +  pSorter = (VdbeSorter*)sqlite3DbMallocZero(db, sizeof(VdbeSorter)+szKeyInfo);
+ +  pCsr->pSorter = pSorter;
     if( pSorter==0 ){
--    return SQLITE_NOMEM;
--  }
-   pKeyInfo = (KeyInfo*)&pSorter[1];
-   memcpy(pKeyInfo, pCsr->pKeyInfo, szKeyInfo);
-   pKeyInfo->db = 0;
-   pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
- -  
- -  pSorter->pUnpacked = sqlite3VdbeAllocUnpackedRecord(pCsr->pKeyInfo, 0, 0, &d);
- -  if( pSorter->pUnpacked==0 ) return SQLITE_NOMEM;
- -  assert( pSorter->pUnpacked==(UnpackedRecord *)d );
- -  pSorter->pUnpacked->nField = pCsr->pKeyInfo->nField;
- -
- -  if( !sqlite3TempInMemory(db) ){
++    rc = SQLITE_NOMEM;
++  }else{
++    pKeyInfo = (KeyInfo*)&pSorter[1];
++    memcpy(pKeyInfo, pCsr->pKeyInfo, szKeyInfo);
++    pKeyInfo->db = 0;
+     pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
- -    pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
- -    mxCache = db->aDb[0].pSchema->cache_size;
- -    if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
- -    pSorter->mxPmaSize = mxCache * pgsz;
- -
- -    /* If the application is using memsys3 or memsys5, use a separate 
- -    ** allocation for each sort-key in memory. Otherwise, use a single big
- -    ** allocation at pSorter->aMemory for all sort-keys.  */
- -    if( sqlite3GlobalConfig.pHeap==0 ){
- -      assert( pSorter->iMemory==0 );
- -      pSorter->nMemory = pgsz;
- -      pSorter->aMemory = (u8*)sqlite3Malloc(pSorter->nMemory);
- -      if( !pSorter->aMemory ) return SQLITE_NOMEM;
+ +
-   for(i=0; i<SQLITE_MAX_SORTER_THREAD; i++){
-     SorterThread *pThread = &pSorter->aThread[i];
-     pThread->pKeyInfo = pKeyInfo;
-     pThread->pVfs = db->pVfs;
-     pThread->pgsz = pgsz;
-   }
++    for(i=0; i<SQLITE_MAX_SORTER_THREAD; i++){
++      SorterThread *pThread = &pSorter->aThread[i];
++      pThread->pKeyInfo = pKeyInfo;
++      pThread->pVfs = db->pVfs;
++      pThread->pgsz = pgsz;
++    }
+ +
-   if( !sqlite3TempInMemory(db) ){
-     pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
-     mxCache = db->aDb[0].pSchema->cache_size;
-     if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
-     pSorter->mxPmaSize = mxCache * pgsz;
++    if( !sqlite3TempInMemory(db) ){
++      pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
++      mxCache = db->aDb[0].pSchema->cache_size;
++      if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
++      pSorter->mxPmaSize = mxCache * pgsz;
++
++      /* If the application is using memsys3 or memsys5, use a separate 
++      ** allocation for each sort-key in memory. Otherwise, use a single big
++      ** allocation at pSorter->aMemory for all sort-keys.  */
++      if( sqlite3GlobalConfig.pHeap==0 ){
++        assert( pSorter->iMemory==0 );
++        pSorter->nMemory = pgsz;
++        pSorter->aMemory = (u8*)sqlite3Malloc(pgsz);
++        if( !pSorter->aMemory ) rc = SQLITE_NOMEM;
++      }
+     }
     }
   
--  return SQLITE_OK;
++  return rc;
   }
   
   /*
@@@ -592,99 -563,35 +652,107 @@@ static void vdbeSorterRecordFree(sqlite
   }
   
   /*
- -** Reset a sorting cursor back to its original empty state.
+ +** Free all resources owned by the object indicated by argument pThread. All 
+ +** fields of *pThread are zeroed before returning.
   */
- -void sqlite3VdbeSorterReset(sqlite3 *db, VdbeSorter *pSorter){
- -  if( pSorter->aIter ){
- -    int i;
- -    for(i=0; i<pSorter->nTree; i++){
- -      vdbeSorterIterZero(db, &pSorter->aIter[i]);
+ +static void vdbeSorterThreadCleanup(sqlite3 *db, SorterThread *pThread){
+ +  sqlite3DbFree(db, pThread->pUnpacked);
+ +  pThread->pUnpacked = 0;
-   vdbeSorterRecordFree(0, pThread->pList);
++  if( pThread->aListMemory==0 ){
++    vdbeSorterRecordFree(0, pThread->pList);
++  }else{
++    sqlite3_free(pThread->aListMemory);
++    pThread->aListMemory = 0;
++  }
+ +  pThread->pList = 0;
+ +  if( pThread->pTemp1 ){
+ +    sqlite3OsCloseFree(pThread->pTemp1);
+ +    pThread->pTemp1 = 0;
+ +  }
+ +}
+ +
+ +/*
+ +** Join all threads.  
+ +*/
+ +static int vdbeSorterJoinAll(VdbeSorter *pSorter, int rcin){
+ +  int rc = rcin;
+ +  int i;
+ +  for(i=0; i<SQLITE_MAX_SORTER_THREAD; i++){
+ +    SorterThread *pThread = &pSorter->aThread[i];
+ +    if( pThread->pThread ){
+ +      void *pRet;
+ +      int rc2 = sqlite3ThreadJoin(pThread->pThread, &pRet);
+ +      pThread->pThread = 0;
+ +      pThread->bDone = 0;
+ +      if( rc==SQLITE_OK ) rc = rc2;
+ +      if( rc==SQLITE_OK ) rc = SQLITE_PTR_TO_INT(pRet);
+ +    }
+ +  }
+ +  return rc;
+ +}
+ +
+ +/*
+ +** Allocate a new SorterMerger object with space for nIter iterators.
+ +*/
+ +static SorterMerger *vdbeSorterMergerNew(int nIter){
+ +  int N = 2;                      /* Smallest power of two >= nIter */
+ +  int nByte;                      /* Total bytes of space to allocate */
+ +  SorterMerger *pNew;             /* Pointer to allocated object to return */
+ +
+ +  assert( nIter<=SORTER_MAX_MERGE_COUNT );
+ +  while( N<nIter ) N += N;
+ +  nByte = sizeof(SorterMerger) + N * (sizeof(int) + sizeof(VdbeSorterIter));
+ +
+ +  pNew = (SorterMerger*)sqlite3MallocZero(nByte);
+ +  if( pNew ){
+ +    pNew->nTree = N;
+ +    pNew->aIter = (VdbeSorterIter*)&pNew[1];
+ +    pNew->aTree = (int*)&pNew->aIter[N];
+ +  }
+ +  return pNew;
+ +}
+ +
+ +/*
+ +** Reset a merger: free the resources held by each iterator and zero it.
+ +*/
+ +static void vdbeSorterMergerReset(SorterMerger *pMerger){
+ +  int i;
+ +  if( pMerger ){
+ +    for(i=0; i<pMerger->nTree; i++){
+ +      vdbeSorterIterZero(&pMerger->aIter[i]);
       }
- -    sqlite3DbFree(db, pSorter->aIter);
- -    pSorter->aIter = 0;
     }
- -  if( pSorter->pTemp1 ){
- -    sqlite3OsCloseFree(pSorter->pTemp1);
- -    pSorter->pTemp1 = 0;
+ +}
+ +
+ +
+ +/*
+ +** Free the SorterMerger object passed as the only argument.
+ +*/
+ +static void vdbeSorterMergerFree(SorterMerger *pMerger){
+ +  vdbeSorterMergerReset(pMerger);
+ +  sqlite3_free(pMerger);
+ +}
+ +
+ +/*
+ +** Reset a sorting cursor back to its original empty state.
+ +*/
+ +void sqlite3VdbeSorterReset(sqlite3 *db, VdbeSorter *pSorter){
+ +  int i;
+ +  vdbeSorterJoinAll(pSorter, SQLITE_OK);
+ +  for(i=0; i<SQLITE_MAX_SORTER_THREAD; i++){
+ +    SorterThread *pThread = &pSorter->aThread[i];
+ +    vdbeSorterThreadCleanup(db, pThread);
     }
-   vdbeSorterRecordFree(0, pSorter->pRecord);
+   if( pSorter->aMemory==0 ){
- -    vdbeSorterRecordFree(db, pSorter->pRecord);
++    vdbeSorterRecordFree(0, pSorter->pRecord);
+   }
+ +  vdbeSorterMergerReset(pSorter->pMerger);
     pSorter->pRecord = 0;
- -  pSorter->iWriteOff = 0;
- -  pSorter->iReadOff = 0;
     pSorter->nInMemory = 0;
- -  pSorter->nTree = 0;
- -  pSorter->nPMA = 0;
- -  pSorter->aTree = 0;
+ +  pSorter->bUsePMA = 0;
+   pSorter->iMemory = 0;
   }
   
- -
   /*
   ** Free any cursor components allocated by sqlite3VdbeSorterXXX routines.
   */
@@@ -692,7 -599,8 +760,8 @@@ void sqlite3VdbeSorterClose(sqlite3 *db
     VdbeSorter *pSorter = pCsr->pSorter;
     if( pSorter ){
       sqlite3VdbeSorterReset(db, pSorter);
- -    sqlite3DbFree(db, pSorter->pUnpacked);
- -    sqlite3DbFree(db, pSorter->aMemory);
+ +    vdbeSorterMergerFree(pSorter->pMerger);
++    sqlite3_free(pSorter->aMemory);
       sqlite3DbFree(db, pSorter);
       pCsr->pSorter = 0;
     }
@@@ -703,13 -611,18 +772,18 @@@
   ** set *ppFile to point to the malloc'd file-handle and return SQLITE_OK.
   ** Otherwise, set *ppFile to 0 and return an SQLite error code.
   */
- -static int vdbeSorterOpenTempFile(sqlite3 *db, sqlite3_file **ppFile){
+ +static int vdbeSorterOpenTempFile(sqlite3_vfs *pVfs, sqlite3_file **ppFile){
-   int dummy;
-   return sqlite3OsOpenMalloc(pVfs, 0, ppFile,
+   int rc;
- -  rc = sqlite3OsOpenMalloc(db->pVfs, 0, ppFile,
++  rc = sqlite3OsOpenMalloc(pVfs, 0, ppFile,
         SQLITE_OPEN_TEMP_JOURNAL |
         SQLITE_OPEN_READWRITE    | SQLITE_OPEN_CREATE |
-       SQLITE_OPEN_EXCLUSIVE    | SQLITE_OPEN_DELETEONCLOSE, &dummy
+       SQLITE_OPEN_EXCLUSIVE    | SQLITE_OPEN_DELETEONCLOSE, &rc
     );
+   if( rc==SQLITE_OK ){
+     i64 max = SQLITE_MAX_MMAP_SIZE;
+     sqlite3OsFileControlHint( *ppFile, SQLITE_FCNTL_MMAP_SIZE, (void*)&max);
+   }
+   return rc;
   }
   
   /*
@@@ -728,11 -644,11 +802,11 @@@ static void vdbeSorterMerge
   
     while( p1 && p2 ){
       int res;
-     vdbeSorterCompare(pThread, 0, p1->pVal, p1->nVal, pVal2, p2->nVal, &res);
- -    vdbeSorterCompare(pCsr, 0, SRVAL(p1), p1->nVal, pVal2, p2->nVal, &res);
++    vdbeSorterCompare(pThread, 0, SRVAL(p1), p1->nVal, pVal2, p2->nVal, &res);
       if( res<=0 ){
         *pp = p1;
-       pp = &p1->pNext;
-       p1 = p1->pNext;
+       pp = &p1->u.pNext;
+       p1 = p1->u.pNext;
         pVal2 = 0;
       }else{
         *pp = p2;
@@@ -761,12 -683,22 +835,23 @@@ static int vdbeSorterSort(SorterThread 
       return SQLITE_NOMEM;
     }
   
- -  p = pSorter->pRecord;
+ +  p = pThread->pList;
     while( p ){
-     SorterRecord *pNext = p->pNext;
-     p->pNext = 0;
+     SorterRecord *pNext;
- -    if( pSorter->aMemory ){
- -      assert( p->u.iNext<pSorter->nMemory );
- -      if( (u8*)p==pSorter->aMemory ){
++    if( pThread->aListMemory ){
++      if( (u8*)p==pThread->aListMemory ){
+         pNext = 0;
+       }else{
- -        pNext = (SorterRecord*)&pSorter->aMemory[p->u.iNext];
++        assert( p->u.iNext<sqlite3MallocSize(pThread->aListMemory) );
++        pNext = (SorterRecord*)&pThread->aListMemory[p->u.iNext];
+       }
+     }else{
+       pNext = p->u.pNext;
+     }
++
+     p->u.pNext = 0;
       for(i=0; aSlot[i]; i++){
- -      vdbeSorterMerge(pCsr, p, aSlot[i], &p);
+ +      vdbeSorterMerge(pThread, p, aSlot[i], &p);
         aSlot[i] = 0;
       }
       aSlot[i] = p;
@@@ -867,6 -801,29 +952,30 @@@ static void fileWriterWriteVarint(FileW
     fileWriterWrite(p, aByte, nByte);
   }
   
- -** is guaranteed to be nByte bytes or smaller in size. This function 
+ #if SQLITE_MAX_MMAP_SIZE>0
+ /*
+ ** The first argument is a file-handle open on a temporary file. The file
- -** Whether or not the file does end up memory mapped of course depends on 
++** is guaranteed to be nByte bytes or smaller in size. This function
+ ** attempts to extend the file to nByte bytes in size and to ensure that
+ ** the VFS has memory mapped it.
+ **
++** Whether or not the file does end up memory mapped of course depends on
+ ** the specific VFS implementation.
+ */
+ static int vdbeSorterExtendFile(sqlite3_file *pFile, i64 nByte){
+   int rc = sqlite3OsTruncate(pFile, nByte);
+   if( rc==SQLITE_OK ){
+     void *p = 0;
+     sqlite3OsFetch(pFile, 0, nByte, &p);
+     sqlite3OsUnfetch(pFile, 0, p);
+   }
+   return rc;
+ }
+ #else
+ # define vdbeSorterExtendFile(x,y) SQLITE_OK
+ #endif
+ 
++
   /*
   ** Write the current contents of the in-memory linked-list to a PMA. Return
   ** SQLITE_OK if successful, or an SQLite error code otherwise.
@@@ -880,236 -837,54 +989,267 @@@
   **       Each record consists of a varint followed by a blob of data (the 
   **       key). The varint is the number of bytes in the blob of data.
   */
- -static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
+ +static int vdbeSorterListToPMA(SorterThread *pThread){
     int rc = SQLITE_OK;             /* Return code */
- -  VdbeSorter *pSorter = pCsr->pSorter;
- -  FileWriter writer;
+ +  FileWriter writer;              /* Object used to write to the file */
   
     memset(&writer, 0, sizeof(FileWriter));
- -
- -  if( pSorter->nInMemory==0 ){
- -    assert( pSorter->pRecord==0 );
- -    return rc;
- -  }
- -
- -  rc = vdbeSorterSort(pCsr);
+ +  assert( pThread->nInMemory>0 );   /* Precondition: the list holds data */
   
     /* If the first temporary PMA file has not been opened, open it now. */
- -  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
- -    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
- -    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
- -    assert( pSorter->iWriteOff==0 );
- -    assert( pSorter->nPMA==0 );
+ +  if( pThread->pTemp1==0 ){
+ +    rc = vdbeSorterOpenTempFile(pThread->pVfs, &pThread->pTemp1);
+ +    assert( rc!=SQLITE_OK || pThread->pTemp1 );
+ +    assert( pThread->iTemp1Off==0 );
+ +    assert( pThread->nPMA==0 );
     }
   
- -        pSorter->pTemp1, pSorter->iWriteOff + pSorter->nInMemory + 9
+   /* Try to get the file to memory map */
+   if( rc==SQLITE_OK ){
+     rc = vdbeSorterExtendFile(
         /* NOTE(review): the +9 presumably leaves room for the leading varint
         ** written below - confirm against the varint size limit. */
++        pThread->pTemp1, pThread->iTemp1Off + pThread->nInMemory + 9
+     );
+   }
+ 
     if( rc==SQLITE_OK ){
       SorterRecord *p;
       SorterRecord *pNext = 0;
   
- -    fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
- -    pSorter->nPMA++;
- -    fileWriterWriteVarint(&writer, pSorter->nInMemory);
- -    for(p=pSorter->pRecord; p; p=pNext){
+ +    fileWriterInit(pThread->pTemp1, &writer, pThread->pgsz, pThread->iTemp1Off);
+ +    pThread->nPMA++;
+ +    fileWriterWriteVarint(&writer, pThread->nInMemory);
+ +    for(p=pThread->pList; p; p=pNext){
-       pNext = p->pNext;
+       pNext = p->u.pNext;
         fileWriterWriteVarint(&writer, p->nVal);
-       fileWriterWrite(&writer, p->pVal, p->nVal);
-       sqlite3_free(p);
+       fileWriterWrite(&writer, SRVAL(p), p->nVal);
- -      if( pSorter->aMemory==0 ) sqlite3DbFree(db, p);
++      if( pThread->aListMemory==0 ) sqlite3_free(p);  /* else not freed here */
+ +    }
+ +    pThread->pList = p;           /* p is 0 once the loop above has finished */
+ +    rc = fileWriterFinish(&writer, &pThread->iTemp1Off);
+ +  }
+ +
++  assert( pThread->pList==0 || rc!=SQLITE_OK );
+ +  return rc;
+ +}
+ +
+ +/*
+ +** Advance the SorterMerger iterator passed as the second argument to
+ +** the next entry. Set *pbEof to true if this means the iterator has 
+ +** reached EOF.
+ +**
+ +** Return SQLITE_OK if successful or an error code if an error occurs.
+ +**
+ +** The first argument is only forwarded to vdbeSorterDoCompare(). Note that
+ +** *pbEof is written on every call, including calls that return an error.
+ +*/
+ +static int vdbeSorterNext(
+ +  SorterThread *pThread, 
+ +  SorterMerger *pMerger, 
+ +  int *pbEof
+ +){
+ +  int rc;
+ +  int iPrev = pMerger->aTree[1];/* Index of iterator to advance */
+ +  int i;                        /* Index of aTree[] to recalculate */
+ +
+ +  /* Advance the current iterator */
+ +  rc = vdbeSorterIterNext(&pMerger->aIter[iPrev]);
+ +
+ +  /* Update contents of aTree[], from iPrev's parent slot up to aTree[1] */
+ +  for(i=(pMerger->nTree+iPrev)/2; rc==SQLITE_OK && i>0; i=i/2){
+ +    rc = vdbeSorterDoCompare(pThread, pMerger, i);
+ +  }
+ +
+ +  *pbEof = (pMerger->aIter[pMerger->aTree[1]].pFile==0);  /* no file: EOF */
+ +  return rc;
+ +}
+ +
+ +/*
+ +** The main routine for sorter-thread operations.
+ +*/
+ +static void *vdbeSorterThreadMain(void *pCtx){
+ +  int rc = SQLITE_OK;
+ +  SorterThread *pThread = (SorterThread*)pCtx;
+ +
+ +  assert( pThread->eWork==SORTER_THREAD_SORT
+ +       || pThread->eWork==SORTER_THREAD_TO_PMA
+ +       || pThread->eWork==SORTER_THREAD_CONS
+ +  );
+ +  assert( pThread->bDone==0 );
+ +
+ +  if( pThread->pUnpacked==0 ){
+ +    char *pFree;
+ +    pThread->pUnpacked = sqlite3VdbeAllocUnpackedRecord(
+ +        pThread->pKeyInfo, 0, 0, &pFree
+ +    );
+ +    assert( pThread->pUnpacked==(UnpackedRecord*)pFree );
+ +    if( pFree==0 ){
+ +      rc = SQLITE_NOMEM;
+ +      goto thread_out;
+ +    }
++    pThread->pUnpacked->nField = pThread->pKeyInfo->nField;
+ +  }
+ +
+ +  if( pThread->eWork==SORTER_THREAD_CONS ){
+ +    assert( pThread->pList==0 );
+ +    while( pThread->nPMA>pThread->nConsolidate && rc==SQLITE_OK ){
+ +      int nIter = MIN(pThread->nPMA, SORTER_MAX_MERGE_COUNT);
+ +      sqlite3_file *pTemp2 = 0;     /* Second temp file to use */
+ +      SorterMerger *pMerger;        /* Object for reading/merging PMA data */
+ +      i64 iReadOff = 0;             /* Offset in pTemp1 to read from */
+ +      i64 iWriteOff = 0;            /* Offset in pTemp2 to write to */
+ +      int i;
+ +      
+ +      /* Allocate a merger object to merge PMAs together. */
+ +      pMerger = vdbeSorterMergerNew(nIter);
+ +      if( pMerger==0 ){
+ +        rc = SQLITE_NOMEM;
+ +        break;
+ +      }
+ +
+ +      /* Open a second temp file to write merged data to */
+ +      rc = vdbeSorterOpenTempFile(pThread->pVfs, &pTemp2);
++      if( rc==SQLITE_OK ){
++        rc = vdbeSorterExtendFile(pTemp2, pThread->iTemp1Off);
++      }
+ +      if( rc!=SQLITE_OK ){
+ +        vdbeSorterMergerFree(pMerger);
+ +        break;
+ +      }
+ +
+ +      /* This loop runs once for each output PMA. Each output PMA is made
+ +      ** of data merged from up to SORTER_MAX_MERGE_COUNT input PMAs. */
+ +      for(i=0; i<pThread->nPMA; i+=SORTER_MAX_MERGE_COUNT){
+ +        FileWriter writer;        /* Object for writing data to pTemp2 */
+ +        i64 nOut = 0;             /* Bytes of data in output PMA */
+ +        int bEof = 0;
+ +        int rc2;
+ +
+ +        /* Configure the merger object to read and merge data from the next 
+ +        ** SORTER_MAX_MERGE_COUNT PMAs in pTemp1 (or from all remaining PMAs,
+ +        ** if that is fewer). */
+ +        int iIter;
+ +        for(iIter=0; iIter<SORTER_MAX_MERGE_COUNT; iIter++){
+ +          VdbeSorterIter *pIter = &pMerger->aIter[iIter];
+ +          rc = vdbeSorterIterInit(pThread, iReadOff, pIter, &nOut);
+ +          iReadOff = pIter->iEof;
+ +          if( iReadOff>=pThread->iTemp1Off || rc!=SQLITE_OK ) break;
+ +        }
+ +        for(iIter=pMerger->nTree-1; rc==SQLITE_OK && iIter>0; iIter--){
+ +          rc = vdbeSorterDoCompare(pThread, pMerger, iIter);
+ +        }
+ +
+ +        fileWriterInit(pTemp2, &writer, pThread->pgsz, iWriteOff);
+ +        fileWriterWriteVarint(&writer, nOut);
+ +        while( rc==SQLITE_OK && bEof==0 ){
+ +          VdbeSorterIter *pIter = &pMerger->aIter[ pMerger->aTree[1] ];
+ +          assert( pIter->pFile!=0 );        /* pIter is not at EOF */
+ +          fileWriterWriteVarint(&writer, pIter->nKey);
+ +          fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
+ +          rc = vdbeSorterNext(pThread, pMerger, &bEof);
+ +        }
+ +        rc2 = fileWriterFinish(&writer, &iWriteOff);
+ +        if( rc==SQLITE_OK ) rc = rc2;
+ +      }
+ +
+ +      vdbeSorterMergerFree(pMerger);
+ +      sqlite3OsCloseFree(pThread->pTemp1);
+ +      pThread->pTemp1 = pTemp2;
+ +      pThread->nPMA = (i / SORTER_MAX_MERGE_COUNT);
+ +      pThread->iTemp1Off = iWriteOff;
+ +    }
+ +  }else{
+ +    /* Sort the pThread->pList list */
+ +    rc = vdbeSorterSort(pThread);
+ +
+ +    /* If required, write the list out to a PMA. */
+ +    if( rc==SQLITE_OK && pThread->eWork==SORTER_THREAD_TO_PMA ){
+ +#ifdef SQLITE_DEBUG
+ +      i64 nExpect = pThread->nInMemory
+ +        + sqlite3VarintLen(pThread->nInMemory)
+ +        + pThread->iTemp1Off;
+ +#endif
+ +      rc = vdbeSorterListToPMA(pThread);
+ +      assert( rc!=SQLITE_OK || (nExpect==pThread->iTemp1Off) );
+ +    }
+ +  }
+ +
+ + thread_out:
+ +  pThread->bDone = 1;
+ +  return SQLITE_INT_TO_PTR(rc);
+ +}
+ +
+ +/*
+ +** Run the activity scheduled by the object passed as the only argument
+ +** in the current thread.
+ +**
+ +** The result code produced by vdbeSorterThreadMain() is returned, and the
+ +** bDone flag that routine sets is cleared again before returning.
+ +*/
+ +static int vdbeSorterRunThread(SorterThread *pThread){
+ +  int rc = SQLITE_PTR_TO_INT( vdbeSorterThreadMain((void*)pThread) );
+ +  assert( pThread->bDone );
+ +  pThread->bDone = 0;
+ +  return rc;
+ +}
+ +
+ +/*
+ +** Flush the current contents of VdbeSorter.pRecord to a new PMA, possibly
+ +** using a background thread.
+ +**
+ +** If argument bFg is non-zero, the operation always uses the calling thread.
+ +**
+ +** The list and its byte count (pSorter->pRecord / nInMemory) are handed to
+ +** the first aThread[] slot found without a running thread, after joining
+ +** any slot whose thread has set bDone. The final slot is always run in the
+ +** calling thread. When pSorter->aMemory is in use the buffer is exchanged
+ +** with the slot's aListMemory before the work starts.
+ +**
+ +** Returns SQLITE_OK, the error code of a joined thread, the error code of
+ +** the new operation when it runs in the foreground, or SQLITE_NOMEM if a
+ +** replacement buffer cannot be allocated.
+ +*/
+ +static int vdbeSorterFlushPMA(sqlite3 *db, const VdbeCursor *pCsr, int bFg){
+ +  VdbeSorter *pSorter = pCsr->pSorter;
+ +  int rc = SQLITE_OK;
+ +  int i;
+ +  SorterThread *pThread;        /* Thread context used to create new PMA */
+ +
+ +  pSorter->bUsePMA = 1;
+ +  for(i=0; ALWAYS( i<SQLITE_MAX_SORTER_THREAD ); i++){
+ +    pThread = &pSorter->aThread[i];
+ +    if( pThread->bDone ){
+ +      void *pRet;
+ +      assert( pThread->pThread );
+ +      rc = sqlite3ThreadJoin(pThread->pThread, &pRet);
+ +      pThread->pThread = 0;
+ +      pThread->bDone = 0;
+ +      if( rc==SQLITE_OK ){
+ +        rc = SQLITE_PTR_TO_INT(pRet);
+ +      }
+ +    }
+ +    if( pThread->pThread==0 ) break;    /* slot i is free to take the work */
+ +  }
+ +
+ +  if( rc==SQLITE_OK ){
++    int bUseFg = (bFg || i==(SQLITE_MAX_SORTER_THREAD-1));  /* last slot: fg */
++
+ +    assert( pThread->pThread==0 && pThread->bDone==0 );
+ +    pThread->eWork = SORTER_THREAD_TO_PMA;
+ +    pThread->pList = pSorter->pRecord;
+ +    pThread->nInMemory = pSorter->nInMemory;
+ +    pSorter->nInMemory = 0;
+ +    pSorter->pRecord = 0;
+ +
-     if( bFg || i<(SQLITE_MAX_SORTER_THREAD-1) ){
++    if( pSorter->aMemory ){
++      u8 *aMem = pThread->aListMemory;
++      pThread->aListMemory = pSorter->aMemory;
++      pSorter->aMemory = aMem;
++    }
++
++    if( bUseFg==0 ){
++      /* Launch a background thread for this operation */
+ +      void *pCtx = (void*)pThread;
++      if( pSorter->aMemory==0 ){
++        pSorter->aMemory = sqlite3Malloc(pSorter->nMemory);
++        if( pSorter->aMemory==0 ) return SQLITE_NOMEM;
++      }else{
++        pSorter->nMemory = sqlite3MallocSize(pSorter->aMemory);
++      }
+ +      rc = sqlite3ThreadCreate(&pThread->pThread, vdbeSorterThreadMain, pCtx);
+ +    }else{
+ +      /* Use the foreground thread for this operation */
++      u8 *aMem;
+ +      rc = vdbeSorterRunThread(pThread);
++      aMem = pThread->aListMemory;      /* exchange the two buffers again */
++      pThread->aListMemory = pSorter->aMemory;
++      pSorter->aMemory = aMem;
       }
- -    pSorter->pRecord = p;
- -    rc = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
     }
   
- -  if( pSorter->aMemory ) pSorter->pRecord = 0;
- -  assert( pSorter->pRecord==0 || rc!=SQLITE_OK );
     return rc;
   }
   
@@@ -1148,13 -922,63 +1287,57 @@@ int sqlite3VdbeSorterWrite
     **   * The total memory allocated for the in-memory list is greater 
     **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
     */
-   if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
-         (pSorter->nInMemory>pSorter->mxPmaSize)
-      || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
-   )){
+   nReq = pVal->n + sizeof(SorterRecord);
+   nPMA = pVal->n + sqlite3VarintLen(pVal->n);
+   if( pSorter->aMemory ){
+     bFlush = pSorter->iMemory && (pSorter->iMemory+nReq) > pSorter->mxPmaSize;
+   }else{
+     bFlush = (
+         (pSorter->nInMemory > pSorter->mxPmaSize)
+      || (pSorter->nInMemory > pSorter->mnPmaSize && sqlite3HeapNearlyFull())
+     );
+   }
+   if( bFlush ){
- -#ifdef SQLITE_DEBUG
- -    i64 nExpect = pSorter->iWriteOff
- -      + sqlite3VarintLen(pSorter->nInMemory)
- -      + pSorter->nInMemory;
- -#endif
- -    rc = vdbeSorterListToPMA(db, pCsr);
+ +    rc = vdbeSorterFlushPMA(db, pCsr, 0);
+     pSorter->nInMemory = 0;
+     pSorter->iMemory = 0;
- -    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
+     assert( rc!=SQLITE_OK || pSorter->pRecord==0 );
     }
   
- -    pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n+sizeof(SorterRecord));
+   pSorter->nInMemory += nPMA;
+ 
+   if( pSorter->aMemory ){
+     int nMin = pSorter->iMemory + nReq;
+ 
+     if( nMin>pSorter->nMemory ){
+       u8 *aNew;
+       int nNew = pSorter->nMemory * 2;
+       while( nNew < nMin ) nNew = nNew*2;
+       if( nNew > pSorter->mxPmaSize ) nNew = pSorter->mxPmaSize;
+       if( nNew < nMin ) nNew = nMin;
+ 
+       aNew = sqlite3Realloc(pSorter->aMemory, nNew);
+       if( !aNew ) return SQLITE_NOMEM;
+       pSorter->pRecord = aNew + ((u8*)pSorter->pRecord - pSorter->aMemory);
+       pSorter->aMemory = aNew;
+       pSorter->nMemory = nNew;
+     }
+ 
+     pNew = (SorterRecord*)&pSorter->aMemory[pSorter->iMemory];
+     pSorter->iMemory += ROUND8(nReq);
+     pNew->u.iNext = (u8*)(pSorter->pRecord) - pSorter->aMemory;
+   }else{
++    pNew = (SorterRecord *)sqlite3Malloc(pVal->n+sizeof(SorterRecord));
+     if( pNew==0 ){
+       return SQLITE_NOMEM;
+     }
+     pNew->u.pNext = pSorter->pRecord;
+   }
+ 
+   memcpy(SRVAL(pNew), pVal->z, pVal->n);
+   pNew->nVal = pVal->n;
+   pSorter->pRecord = pNew;
+ 
     return rc;
   }
   
@@@ -1183,19 -1031,10 +1366,22 @@@ int sqlite3VdbeSorterRewind(sqlite3 *db
     /* If no data has been written to disk, then do not do so now. Instead,
     ** sort the VdbeSorter.pRecord list. The vdbe layer will read data directly
     ** from the in-memory list.  */
- -  if( pSorter->nPMA==0 ){
- -    *pbEof = !pSorter->pRecord;
- -    assert( pSorter->aTree==0 );
- -    return vdbeSorterSort(pCsr);
+ +  if( pSorter->bUsePMA==0 ){
+ +    if( pSorter->pRecord ){
+ +      SorterThread *pThread = &pSorter->aThread[0];
+ +      *pbEof = 0;
+ +      pThread->pList = pSorter->pRecord;
+ +      pThread->eWork = SORTER_THREAD_SORT;
++      assert( pThread->aListMemory==0 );
++      pThread->aListMemory = pSorter->aMemory;
+ +      rc = vdbeSorterRunThread(pThread);
++      pThread->aListMemory = 0;
+ +      pSorter->pRecord = pThread->pList;
+ +      pThread->pList = 0;
+ +    }else{
+ +      *pbEof = 1;
+ +    }
+ +    return rc;
     }
   
     /* Write the current in-memory list to a PMA. */
@@@ -1278,13 -1133,68 +1464,13 @@@ int sqlite3VdbeSorterNext(sqlite3 *db, 
     VdbeSorter *pSorter = pCsr->pSorter;
     int rc;                         /* Return code */
   
- -  if( pSorter->aTree ){
- -    int iPrev = pSorter->aTree[1];/* Index of iterator to advance */
- -    rc = vdbeSorterIterNext(db, &pSorter->aIter[iPrev]);
- -    if( rc==SQLITE_OK ){
- -      int i;                      /* Index of aTree[] to recalculate */
- -      VdbeSorterIter *pIter1;     /* First iterator to compare */
- -      VdbeSorterIter *pIter2;     /* Second iterator to compare */
- -      u8 *pKey2;                  /* To pIter2->aKey, or 0 if record cached */
- -
- -      /* Find the first two iterators to compare. The one that was just
- -      ** advanced (iPrev) and the one next to it in the array.  */
- -      pIter1 = &pSorter->aIter[(iPrev & 0xFFFE)];
- -      pIter2 = &pSorter->aIter[(iPrev | 0x0001)];
- -      pKey2 = pIter2->aKey;
- -
- -      for(i=(pSorter->nTree+iPrev)/2; i>0; i=i/2){
- -        /* Compare pIter1 and pIter2. Store the result in variable iRes. */
- -        int iRes;
- -        if( pIter1->pFile==0 ){
- -          iRes = +1;
- -        }else if( pIter2->pFile==0 ){
- -          iRes = -1;
- -        }else{
- -          vdbeSorterCompare(pCsr, 0, 
- -              pIter1->aKey, pIter1->nKey, pKey2, pIter2->nKey, &iRes
- -          );
- -        }
- -
- -        /* If pIter1 contained the smaller value, set aTree[i] to its index.
- -        ** Then set pIter2 to the next iterator to compare to pIter1. In this
- -        ** case there is no cache of pIter2 in pSorter->pUnpacked, so set
- -        ** pKey2 to point to the record belonging to pIter2.
- -        **
- -        ** Alternatively, if pIter2 contains the smaller of the two values,
- -        ** set aTree[i] to its index and update pIter1. If vdbeSorterCompare()
- -        ** was actually called above, then pSorter->pUnpacked now contains
- -        ** a value equivalent to pIter2. So set pKey2 to NULL to prevent
- -        ** vdbeSorterCompare() from decoding pIter2 again.  
- -        **
- -        ** If the two values were equal, then the value from the oldest
- -        ** PMA should be considered smaller. The VdbeSorter.aIter[] array
- -        ** is sorted from oldest to newest, so pIter1 contains older values
- -        ** than pIter2 iff (pIter1<pIter2).  */
- -        if( iRes<0 || (iRes==0 && pIter1<pIter2) ){
- -          pSorter->aTree[i] = (int)(pIter1 - pSorter->aIter);
- -          pIter2 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
- -          pKey2 = pIter2->aKey;
- -        }else{
- -          if( pIter1->pFile ) pKey2 = 0;
- -          pSorter->aTree[i] = (int)(pIter2 - pSorter->aIter);
- -          pIter1 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
- -        }
- -      }
- -      *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
- -    }
+ +  if( pSorter->pMerger ){
+ +    rc = vdbeSorterNext(&pSorter->aThread[0], pSorter->pMerger, pbEof);
     }else{
       SorterRecord *pFree = pSorter->pRecord;
-     pSorter->pRecord = pFree->pNext;
-     pFree->pNext = 0;
-     vdbeSorterRecordFree(db, pFree);
+     pSorter->pRecord = pFree->u.pNext;
+     pFree->u.pNext = 0;
- -    if( pSorter->aMemory==0 ){
- -      vdbeSorterRecordFree(db, pFree);
- -    }
++    if( pSorter->aMemory==0 ) vdbeSorterRecordFree(db, pFree);
       *pbEof = !pSorter->pRecord;
       rc = SQLITE_OK;
     }
diff --cc test/sort.test

index 08d496b2596a0380455034930d974f4e20a816c2,b543ffad241cb64d02e168ff528a81822574a098..ccbfdda2b32e39e88607ed60ea19de3affde256c
--- 1/test/sort.test
--- 2/test/sort.test
+++ b/test/sort.test
@@@ -464,4 -464,27 +464,26 @@@ do_test sort-12.1 
     }
   } {1 2 xxx 1 3 yyy 1 1 zzz}
   
- -
+ #-------------------------------------------------------------------------
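+ # Check that the sorter in vdbesort.c sorts in a stable fashion. Rows are
+ # inserted in (a, b) order, so a stable sort on "a" alone must return the
+ # same rows as "ORDER BY a, b".
+ #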
+ do_execsql_test sort-13.0 {
+   CREATE TABLE t10(a, b);
+ }
+ do_test sort-13.1 {
+   db transaction {
+     for {set i 0} {$i < 100000} {incr i} {
+       execsql { INSERT INTO t10 VALUES( $i/10, $i%10 ) }
+     }
+   }
+ } {}
+ do_execsql_test sort-13.2 {
+   SELECT a, b FROM t10 ORDER BY a;
+ } [db eval {SELECT a, b FROM t10 ORDER BY a, b}]
+ do_execsql_test sort-13.3 {
+   PRAGMA cache_size = 5;
+   SELECT a, b FROM t10 ORDER BY a;
+ } [db eval {SELECT a, b FROM t10 ORDER BY a, b}]
+ 
+ 
   finish_test
author	dan <dan@noemail.net>
	Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)
committer	dan <dan@noemail.net>
	Fri, 28 Mar 2014 19:18:16 +0000 (19:18 +0000)
		1	2
manifest	patch \|	diff1 \|	diff2 \|	blob \| history
manifest.uuid	patch \|	diff1 \|	diff2 \|	blob \| history
src/sqliteInt.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/vdbesort.c	patch \|	diff1 \|	diff2 \|	blob \| history
test/sort.test	patch \|	diff1 \|	diff2 \|	blob \| history