** May you share freely, never taking more than you give.
**
*************************************************************************
-** $Id: btree.c,v 1.145 2004/05/20 22:16:29 drh Exp $
+** $Id: btree.c,v 1.146 2004/05/22 02:55:23 drh Exp $
**
** This file implements an external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
** All of the integer values are big-endian (most significant byte first).
**
-** The file change counter is incremented every time the database is more
+** The file change counter is incremented when the database is changed more
** than once within the same second. This counter, together with the
** modification time of the file, allows other processes to know
** when the file has changed and thus when they need to flush their
** space in a page that can be consumed by a single cell for standard
** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
** is to limit the maximum cell size so that at least 4 cells will fit
-** on one pages. Thus the default max embedded payload fraction is 64.
+** on one page. Thus the default max embedded payload fraction is 64.
**
** If the payload for a cell is larger than the max payload, then extra
** payload is spilled to overflow pages. Once an overflow page is allocated,
** A variable-length integer is 1 to 9 bytes where the lower 7 bits of each
** byte are used. The integer consists of all bytes that have bit 8 set and
** the first byte with bit 8 clear. The most significant byte of the integer
-** appears first.
+** appears first. A variable-length integer may not be more than 9 bytes long.
+** As a special case, all 8 bits of the 9th byte are used as data. This
+** allows a 64-bit integer to be encoded in 9 bytes.
**
** 0x00 becomes 0x00000000
** 0x7f becomes 0x0000007f
** increasing order. Because a freeblock is 4 bytes in size, the minimum
** size allocation on a btree page is 4 bytes. Because a freeblock must be
** at least 4 bytes in size, any group of 3 or fewer unused bytes cannot
-** exist on the freeblock chain. The total number of such fragmented bytes
-** is recorded in the page header at offset 5.
+** exist on the freeblock chain. A group of 3 or fewer free bytes is called
+** a fragment. The total number of bytes in all fragments is recorded
+** in the page header at offset 5.
**
** SIZE DESCRIPTION
** 2 Byte offset of the next freeblock
/* The following value is the maximum cell size assuming a maximum page
** size given above.
*/
-#define MX_CELL_SIZE (MX_PAGE_SIZE-10)
+#define MX_CELL_SIZE (MX_PAGE_SIZE-6)
/* The maximum number of cells on a single page of the database. This
** assumes a minimum cell size of 3 bytes. Such small cells will be
** exceedingly rare, but they are possible.
*/
-#define MX_CELL ((MX_PAGE_SIZE-10)/3)
+#define MX_CELL ((MX_PAGE_SIZE-6)/3)
/* Forward declarations */
typedef struct MemPage MemPage;
** The pageDestructor() routine handles that chore.
*/
struct MemPage {
- u32 notUsed;
u8 isInit; /* True if previously initialized */
u8 idxShift; /* True if Cell indices have changed */
u8 isOverfull; /* Some aCell[] do not fit on page */
u8 leafData; /* True if tables stores data on leaves only */
u8 hasData; /* True if this page stores data */
u8 hdrOffset; /* 100 for page 1. 0 otherwise */
- u8 needRelink; /* True if need to run relinkCellList() */
+ u8 needRelink; /* True if cell not linked properly in aData */
int idxParent; /* Index in pParent->aCell[] of this node */
int nFree; /* Number of free bytes on the page */
int nCell; /* Number of entries on this page */
unsigned char **aCell; /* Pointer to start of each cell */
struct Btree *pBt; /* Pointer back to BTree structure */
+ /* When page content is moved from one page to the other (by the movePage()
+ ** subroutine) only the information above is moved. The information below
+ ** is fixed. */
unsigned char *aData; /* Pointer back to the start of the page */
Pgno pgno; /* Page number for this page */
MemPage *pParent; /* The parent of this page. NULL for root */
BtCursor *pCursor; /* A list of all open cursors */
MemPage *pPage1; /* First page of the database */
u8 inTrans; /* True if a transaction is in progress */
- u8 inStmt; /* True if there is a checkpoint on the transaction */
+ u8 inStmt; /* True if we are in a statement subtransaction */
u8 readOnly; /* True if the underlying file is readonly */
+ u8 maxEmbedFrac; /* Maximum payload as % of total page size */
+ u8 minEmbedFrac; /* Minimum payload as % of total page size */
+ u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
int pageSize; /* Total number of bytes on a page */
int usableSize; /* Number of usable bytes on each page */
int maxLocal; /* Maximum local payload in non-LEAFDATA tables */
int minLocal; /* Minimum local payload in non-LEAFDATA tables */
int maxLeaf; /* Maximum local payload in a LEAFDATA table */
int minLeaf; /* Minimum local payload in a LEAFDATA table */
- u8 maxEmbedFrac; /* Maximum payload as % of total page size */
- u8 minEmbedFrac; /* Minimum payload as % of total page size */
- u8 minLeafFrac; /* Minimum leaf payload as % of total page size */
};
typedef Btree Bt;
/*
** An instance of the following structure is used to hold information
-** about a cell. The parseCell() function fills the structure in.
+** about a cell. The parseCell() function fills in this structure
+** based on information extracted from the raw disk page.
*/
typedef struct CellInfo CellInfo;
struct CellInfo {
i64 nKey; /* The key for INTKEY tables, or number of bytes in key */
u32 nData; /* Number of bytes of data */
- u16 nHeader; /* Size of the header in bytes */
+ u16 nHeader; /* Size of the cell header in bytes */
u16 nLocal; /* Amount of payload held locally */
- u16 iOverflow; /* Offset to overflow page number. Zero if none */
- u16 nSize; /* Size of the cell */
+ u16 iOverflow; /* Offset to overflow page number. Zero if no overflow */
+ u16 nSize; /* Total size of the cell (on the main b-tree page) */
};
/*
CellInfo info; /* A parse of the cell we are pointing at */
u8 infoValid; /* True if information in BtCursor.info is valid */
u8 wrFlag; /* True if writable */
- u8 iMatch; /* compare result from last sqlite3BtreeMoveto() */
u8 isValid; /* TRUE if points to a valid entry */
u8 status; /* Set to SQLITE_ABORT if cursors is invalidated */
};
/*
-** Read or write a two-, four-, and eight-byte big-endian integer values.
+** Read or write two- and four-byte big-endian integer values.
*/
static u32 get2byte(unsigned char *p){
  /* Big-endian: p[0] is the most significant byte, p[1] the least. */
  int hi = p[0];
  int lo = p[1];
  return (hi<<8) | lo;
}
/*
-** Routines to read and write variable-length integers.
+** Routines to read and write variable-length integers. These used to
+** be defined locally, but now we use the varint routines in the util.c
+** file.
*/
#define getVarint sqlite3GetVarint
#define getVarint32 sqlite3GetVarint32
*/
static void parseCell(
MemPage *pPage, /* Page containing the cell */
- unsigned char *pCell, /* The cell */
+ unsigned char *pCell, /* Pointer to the first byte of the cell */
CellInfo *pInfo /* Fill in this structure */
){
int n;
int nPayload;
Btree *pBt;
int minLocal, maxLocal;
- if( pPage->leaf ){
- n = 2;
- }else{
- n = 6;
- }
+ assert( pPage->leaf==0 || pPage->leaf==1 );
+ n = 6 - 4*pPage->leaf;
if( pPage->hasData ){
n += getVarint32(&pCell[n], &pInfo->nData);
}else{
pBt = pPage->pBt;
if( pPage->leafData ){
minLocal = pBt->minLeaf;
- maxLocal = pBt->usableSize - 23;
+ maxLocal = pBt->maxLeaf;
}else{
minLocal = pBt->minLocal;
maxLocal = pBt->maxLocal;
** allocating the new chunk.
**
** Algorithm: Carve a piece off of the first freeblock that is
-** nByte in size or that larger.
+** nByte in size or larger.
*/
static int allocateSpace(MemPage *pPage, int nByte){
int addr, pc, hdr;
}
/*
-** Set the checkpoint for the current transaction. The checkpoint serves
-** as a sub-transaction that can be rolled back independently of the
-** main transaction. You must start a transaction before starting a
-** checkpoint. The checkpoint is ended automatically if the transaction
+** Start a statement subtransaction. The subtransaction
+** can be rolled back independently of the main transaction.
+** You must start a transaction before starting a subtransaction.
+** The subtransaction is ended automatically if the main transaction
** commits or rolls back.
**
-** Only one checkpoint may be active at a time. It is an error to try
-** to start a new checkpoint if another checkpoint is already active.
+** Only one subtransaction may be active at a time. It is an error to try
+** to start a new subtransaction if another subtransaction is already active.
+**
+** Statement subtransactions are used around individual SQL statements
+** that are contained within a BEGIN...COMMIT block. If a constraint
+** error occurs within the statement, the effect of that one statement
+** can be rolled back without having to roll back the entire transaction.
*/
int sqlite3BtreeBeginStmt(Btree *pBt){
int rc;
/*
-** Commit a checkpoint to transaction currently in progress. If no
-** checkpoint is active, this is a no-op.
+** Commit the statement subtransaction currently in progress. If no
+** subtransaction is active, this is a no-op.
*/
int sqlite3BtreeCommitStmt(Btree *pBt){
int rc;
}
/*
-** Rollback the checkpoint to the current transaction. If there
-** is no active checkpoint or transaction, this routine is a no-op.
+** Roll back the active statement subtransaction. If no subtransaction
+** is active this routine is a no-op.
**
-** All cursors will be invalided by this operation. Any attempt
+** All cursors will be invalidated by this operation. Any attempt
** to use a cursor that was open at the beginning of this operation
** will result in an error.
*/
/*
** Make sure the BtCursor.info field of the given cursor is valid.
+** If it is not already valid, call parseCell() to fill it in.
+**
+** BtCursor.info is a cache of the information in the current cell.
+** Using this cache reduces the number of calls to parseCell().
*/
static void getCellInfo(BtCursor *pCur){
MemPage *pPage = pCur->pPage;
** a total of "amt" bytes. Put the result in zBuf.
**
** This routine does not make a distinction between key and data.
-** It just reads bytes from the payload area.
+** It just reads bytes from the payload area. Data might appear
+** on the main page or be scattered out on multiple overflow pages.
*/
static int getPayload(
BtCursor *pCur, /* Cursor pointing to entry to read from */
/*
** Move the cursor down to a new child page. The newPgno argument is the
-** page number of the child page in the byte order of the disk image.
+** page number of the child page to move to.
*/
static int moveToChild(BtCursor *pCur, u32 newPgno){
int rc;
** before or after the key.
**
** The result of comparing the key with the entry to which the
-** cursor is left pointing is stored in pCur->iMatch. The same
-** value is also written to *pRes if pRes!=NULL. The meaning of
+** cursor is written to *pRes if pRes!=NULL. The meaning of
** this value is as follows:
**
** *pRes<0 The cursor is left pointing at an entry that
upr = lwr - 1;
break;
}else{
- pCur->iMatch = c;
if( pRes ) *pRes = 0;
return SQLITE_OK;
}
chldPg = get4byte(&pPage->aCell[lwr][2]);
}
if( chldPg==0 ){
- pCur->iMatch = c;
assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
if( pRes ) *pRes = c;
return SQLITE_OK;
put2byte(data+prevpc, 0);
}
+#if 0 /* Never Used */
/*
** Rebuild the linked list of cells on a page so that the cells
** occur in the order specified by the pPage->aCell[] array.
put2byte(&pPage->aData[idxFrom], 0);
pPage->needRelink = 0;
}
+#endif
/*
** GCC does not define the offsetof() macro so we'll have to do it
#define NB (NN*2+1) /* Total pages involved in the balance */
/*
-** This routine redistributes Cells on pPage and up to two siblings
+** This routine redistributes Cells on pPage and up to NN*2 siblings
** of pPage so that all pages have about the same amount of free space.
** Usually one sibling on either side of pPage is used in the balancing,
** though both siblings might come from one side if pPage is the first
-** or last child of its parent. If pPage has fewer than two siblings
+** or last child of its parent. If pPage has fewer than 2*NN siblings
** (something which can only happen if pPage is the root page or a
** child of root) then all available siblings participate in the balancing.
**
** The number of siblings of pPage might be increased or decreased by
-** one in an effort to keep pages between 66% and 100% full. The root page
-** is special and is allowed to be less than 66% full. If pPage is
+** one in an effort to keep pages nearly full but not over full. The root page
+** is special and is allowed to be nearly empty. If pPage is
** the root page, then the depth of the tree might be increased
** or decreased by one, as necessary, to keep the root page from being
-** overfull or empty.
-**
-** This routine alwyas calls relinkCellList() on its input page regardless of
-** whether or not it does any real balancing. Client routines will typically
-** invoke insertCell() or dropCell() before calling this routine, so we
-** need to call relinkCellList() to clean up the mess that those other
-** routines left behind.
+** overfull or completely empty.
**
** Note that when this routine is called, some of the Cells on pPage
** might not actually be stored in pPage->aData[]. This can happen
pBt = pPage->pBt;
if( !pPage->isOverfull && pPage->nFree<pBt->usableSize*2/3
&& pPage->nCell>=2){
- relinkCellList(pPage);
+ assert( pPage->needRelink==0 );
return SQLITE_OK;
}
if( pPage->nCell==0 ){
if( pPage->leaf ){
/* The table is completely empty */
- relinkCellList(pPage);
+ assert( pPage->needRelink==0 );
TRACE(("BALANCE: empty table %d\n", pPage->pgno));
}else{
/* The root page is empty but has one child. Transfer the
if( !pPage->isOverfull ){
/* It is OK for the root page to be less than half full.
*/
- relinkCellList(pPage);
+ assert( pPage->needRelink==0 );
TRACE(("BALANCE: root page %d is low - no changes\n", pPage->pgno));
return SQLITE_OK;
}
j = cntNew[i];
assert( pNew->nCell>0 );
assert( !pNew->isOverfull );
- relinkCellList(pNew);
+ assert( pNew->needRelink==0 );
if( i<nNew-1 && j<nCell ){
u8 *pCell;
u8 *pTemp;
** Create a new BTree table. Write into *piTable the page
** number for the root page of the new table.
**
-** In the current implementation, BTree tables and BTree indices are the
-** the same. In the future, we may change this so that BTree tables
-** are restricted to having a 4-byte integer key and arbitrary data and
-** BTree indices are restricted to having an arbitrary key and no data.
-** But for now, this routine also serves to create indices.
+** The type of table is determined by the flags parameter. Only the
+** following values of flags are currently in use. Other values for
+** flags might not work:
+**
+** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
+** BTREE_ZERODATA Used for SQL indices
*/
int sqlite3BtreeCreateTable(Btree *pBt, int *piTable, int flags){
MemPage *pRoot;
}
/*
-** Delete all information from a single table in the database.
+** Delete all information from a single table in the database. iTable is
+** the page number of the root of the table. After this routine returns,
+** the root page is empty, but still exists.
+**
+** This routine will fail with SQLITE_LOCKED if there are any open
+** read cursors on the table. Open write cursors are moved to the
+** root of the table.
*/
int sqlite3BtreeClearTable(Btree *pBt, int iTable){
int rc;
/*
** Erase all information in a table and add the root of the table to
** the freelist. Except, the root of the principal table (the one on
-** page 2) is never added to the freelist.
+** page 1) is never added to the freelist.
+**
+** This routine will fail with SQLITE_LOCKED if there are any open
+** cursors on the table.
*/
int sqlite3BtreeDropTable(Btree *pBt, int iTable){
int rc;