000001  /*
000002  ** 2004 April 6
000003  **
000004  ** The author disclaims copyright to this source code.  In place of
000005  ** a legal notice, here is a blessing:
000006  **
000007  **    May you do good and not evil.
000008  **    May you find forgiveness for yourself and forgive others.
000009  **    May you share freely, never taking more than you give.
000010  **
000011  *************************************************************************
000012  ** This file implements an external (disk-based) database using BTrees.
000013  ** See the header comment on "btreeInt.h" for additional information.
000014  ** Including a description of file format and an overview of operation.
000015  */
000016  #include "btreeInt.h"
000017  
/*
** The header string that appears at the beginning of every
** SQLite database.  The text comes from the SQLITE_FILE_HEADER macro,
** which is not defined in this part of the file.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
000023  
/*
** Tracing support.  TRACE() takes a single, fully parenthesized printf()
** argument list, for example:
**
**    TRACE(("format %d\n", value));
**
** Tracing is compiled out by default, in which case TRACE() expands to
** nothing.  To compile it in, change the "#if 0" below to "#if 1".  The
** sqlite3BtreeTrace global (initialized to 1) then decides at run-time
** whether or not the message is printed and flushed to stdout.
**
** NOTE: the enabled form expands to a bare "if(...){...}" statement, so
** be careful when TRACE() is the sole body of an if/else.
*/
#if 0
int sqlite3BtreeTrace=1;  /* True to enable tracing */
# define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
#else
# define TRACE(X)
#endif
000034  
/*
** Extract a 2-byte big-endian integer from an array of unsigned bytes.
** But if the value is zero, make it 65536.
**
** This routine is used to extract the "offset to cell content area" value
** from the header of a btree page.  If the page size is 65536 and the page
** is empty, the offset should be 65536, but the 2-byte value stores zero.
** This routine makes the necessary adjustment to 65536.
**
** How it works: the value returned by get2byte() is decremented, masked
** to 16 bits and incremented again.  Values 1..65535 come back unchanged,
** while 0 becomes ((0-1)&0xffff)+1 == 65536.  The result has type int.
*/
#define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
000045  
/*
** Values passed as the 5th argument to allocateBtreePage().  They select
** how the page number supplied by the caller constrains the allocation.
** (allocateBtreePage() itself is not in this part of the file.)
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
000052  
/*
** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 
** defined, or 0 if it is. For example:
**
**   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
**
** When SQLITE_OMIT_AUTOVACUUM is defined the argument is discarded by the
** preprocessor, so it is never evaluated and need not even compile.
*/
#ifndef SQLITE_OMIT_AUTOVACUUM
#define IfNotOmitAV(expr) (expr)
#else
#define IfNotOmitAV(expr) 0
#endif
000064  
#ifndef SQLITE_OMIT_SHARED_CACHE
/*
** A list of BtShared objects that are eligible for participation
** in shared cache.  This variable has file scope during normal builds,
** but the test harness needs to access it, so it is made global for
** test builds (SQLITE_TEST).  The list starts out empty.
**
** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
*/
#ifdef SQLITE_TEST
BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#else
static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
#endif
#endif /* SQLITE_OMIT_SHARED_CACHE */
000080  
000081  #ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features.
**
** The value of "enable" is stored in the global configuration
** (sqlite3GlobalConfig.sharedCacheEnabled).  SQLITE_OK is always returned.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
000093  #endif
000094  
000095  
000096  
000097  #ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary. 
  ** So define the lock related functions as no-ops:
  **
  **   *  the query/set routines always report success (SQLITE_OK),
  **   *  the clear/downgrade routines expand to nothing,
  **   *  hasSharedCacheTableLock() is always true (1), and
  **   *  hasReadConflicts() is always false (0).
  **
  ** None of these macros evaluates its arguments.
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
000113  #endif
000114  
/*
** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
** (MemPage*) as an argument. The (MemPage*) must not be NULL.
**
** If SQLITE_DEBUG is not defined, then this macro is equivalent to
** SQLITE_CORRUPT_PGNO() applied to the page number of the (MemPage*).
** Or, if SQLITE_DEBUG is set, then corruptPageError() is invoked. It
** reports an additional error message containing the page number and
** the database filename associated with the (MemPage*), and then returns
** SQLITE_CORRUPT_BKPT.
**
** The macro argument is parenthesized in both variants so that any
** pointer-valued expression (not just a plain identifier) may be passed.
*/
#ifdef SQLITE_DEBUG
/*
** lineno:  Source line to attribute the report to (callers pass __LINE__).
** p:       The page on which corruption was detected.  Must not be NULL.
**
** Returns SQLITE_CORRUPT_BKPT.
*/
int corruptPageError(int lineno, MemPage *p){
  char *zMsg;

  /* The message is purely diagnostic.  The allocation is bracketed by the
  ** benign-malloc markers, and if it fails the extra report is skipped. */
  sqlite3BeginBenignMalloc();
  zMsg = sqlite3_mprintf("database corruption page %d of %s",
      (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
  );
  sqlite3EndBenignMalloc();
  if( zMsg ){
    sqlite3ReportError(SQLITE_CORRUPT, lineno, zMsg);
  }
  sqlite3_free(zMsg);
  return SQLITE_CORRUPT_BKPT;
}
# define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, (pMemPage))
#else
# define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO((pMemPage)->pgno)
#endif
000142  
000143  #ifndef SQLITE_OMIT_SHARED_CACHE
000144  
000145  #ifdef SQLITE_DEBUG
000146  /*
000147  **** This function is only used as part of an assert() statement. ***
000148  **
000149  ** Check to see if pBtree holds the required locks to read or write to the 
000150  ** table with root page iRoot.   Return 1 if it does and 0 if not.
000151  **
000152  ** For example, when writing to a table with root-page iRoot via 
000153  ** Btree connection pBtree:
000154  **
000155  **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
000156  **
000157  ** When writing to an index that resides in a sharable database, the 
000158  ** caller should have first obtained a lock specifying the root page of
000159  ** the corresponding table. This makes things a bit more complicated,
000160  ** as this module treats each table as a separate structure. To determine
000161  ** the table corresponding to the index being written, this
000162  ** function has to search through the database schema.
000163  **
000164  ** Instead of a lock on the table/index rooted at page iRoot, the caller may
000165  ** hold a write-lock on the schema table (root page 1). This is also
000166  ** acceptable.
000167  */
000168  static int hasSharedCacheTableLock(
000169    Btree *pBtree,         /* Handle that must hold lock */
000170    Pgno iRoot,            /* Root page of b-tree */
000171    int isIndex,           /* True if iRoot is the root of an index b-tree */
000172    int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
000173  ){
000174    Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
000175    Pgno iTab = 0;
000176    BtLock *pLock;
000177  
000178    /* If this database is not shareable, or if the client is reading
000179    ** and has the read-uncommitted flag set, then no lock is required. 
000180    ** Return true immediately.
000181    */
000182    if( (pBtree->sharable==0)
000183     || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
000184    ){
000185      return 1;
000186    }
000187  
000188    /* If the client is reading  or writing an index and the schema is
000189    ** not loaded, then it is too difficult to actually check to see if
000190    ** the correct locks are held.  So do not bother - just return true.
000191    ** This case does not come up very often anyhow.
000192    */
000193    if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
000194      return 1;
000195    }
000196  
000197    /* Figure out the root-page that the lock should be held on. For table
000198    ** b-trees, this is just the root page of the b-tree being read or
000199    ** written. For index b-trees, it is the root page of the associated
000200    ** table.  */
000201    if( isIndex ){
000202      HashElem *p;
000203      for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
000204        Index *pIdx = (Index *)sqliteHashData(p);
000205        if( pIdx->tnum==(int)iRoot ){
000206          if( iTab ){
000207            /* Two or more indexes share the same root page.  There must
000208            ** be imposter tables.  So just return true.  The assert is not
000209            ** useful in that case. */
000210            return 1;
000211          }
000212          iTab = pIdx->pTable->tnum;
000213        }
000214      }
000215    }else{
000216      iTab = iRoot;
000217    }
000218  
000219    /* Search for the required lock. Either a write-lock on root-page iTab, a 
000220    ** write-lock on the schema table, or (if the client is reading) a
000221    ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
000222    for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
000223      if( pLock->pBtree==pBtree 
000224       && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
000225       && pLock->eLock>=eLockType 
000226      ){
000227        return 1;
000228      }
000229    }
000230  
000231    /* Failed to find the required lock. */
000232    return 0;
000233  }
000234  #endif /* SQLITE_DEBUG */
000235  
000236  #ifdef SQLITE_DEBUG
000237  /*
000238  **** This function may be used as part of assert() statements only. ****
000239  **
000240  ** Return true if it would be illegal for pBtree to write into the
000241  ** table or index rooted at iRoot because other shared connections are
000242  ** simultaneously reading that same table or index.
000243  **
000244  ** It is illegal for pBtree to write if some other Btree object that
000245  ** shares the same BtShared object is currently reading or writing
000246  ** the iRoot table.  Except, if the other Btree object has the
000247  ** read-uncommitted flag set, then it is OK for the other object to
000248  ** have a read cursor.
000249  **
000250  ** For example, before writing to any part of the table or index
000251  ** rooted at page iRoot, one should call:
000252  **
000253  **    assert( !hasReadConflicts(pBtree, iRoot) );
000254  */
000255  static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
000256    BtCursor *p;
000257    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000258      if( p->pgnoRoot==iRoot 
000259       && p->pBtree!=pBtree
000260       && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
000261      ){
000262        return 1;
000263      }
000264    }
000265    return 0;
000266  }
000267  #endif    /* #ifdef SQLITE_DEBUG */
000268  
000269  /*
000270  ** Query to see if Btree handle p may obtain a lock of type eLock 
000271  ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
000272  ** SQLITE_OK if the lock may be obtained (by calling
000273  ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
000274  */
000275  static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
000276    BtShared *pBt = p->pBt;
000277    BtLock *pIter;
000278  
000279    assert( sqlite3BtreeHoldsMutex(p) );
000280    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000281    assert( p->db!=0 );
000282    assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
000283    
000284    /* If requesting a write-lock, then the Btree must have an open write
000285    ** transaction on this file. And, obviously, for this to be so there 
000286    ** must be an open write transaction on the file itself.
000287    */
000288    assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
000289    assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
000290    
000291    /* This routine is a no-op if the shared-cache is not enabled */
000292    if( !p->sharable ){
000293      return SQLITE_OK;
000294    }
000295  
000296    /* If some other connection is holding an exclusive lock, the
000297    ** requested lock may not be obtained.
000298    */
000299    if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
000300      sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
000301      return SQLITE_LOCKED_SHAREDCACHE;
000302    }
000303  
000304    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000305      /* The condition (pIter->eLock!=eLock) in the following if(...) 
000306      ** statement is a simplification of:
000307      **
000308      **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
000309      **
000310      ** since we know that if eLock==WRITE_LOCK, then no other connection
000311      ** may hold a WRITE_LOCK on any table in this file (since there can
000312      ** only be a single writer).
000313      */
000314      assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
000315      assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
000316      if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
000317        sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
000318        if( eLock==WRITE_LOCK ){
000319          assert( p==pBt->pWriter );
000320          pBt->btsFlags |= BTS_PENDING;
000321        }
000322        return SQLITE_LOCKED_SHAREDCACHE;
000323      }
000324    }
000325    return SQLITE_OK;
000326  }
000327  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000328  
000329  #ifndef SQLITE_OMIT_SHARED_CACHE
000330  /*
000331  ** Add a lock on the table with root-page iTable to the shared-btree used
000332  ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
000333  ** WRITE_LOCK.
000334  **
000335  ** This function assumes the following:
000336  **
000337  **   (a) The specified Btree object p is connected to a sharable
000338  **       database (one with the BtShared.sharable flag set), and
000339  **
000340  **   (b) No other Btree objects hold a lock that conflicts
000341  **       with the requested lock (i.e. querySharedCacheTableLock() has
000342  **       already been called and returned SQLITE_OK).
000343  **
000344  ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 
000345  ** is returned if a malloc attempt fails.
000346  */
000347  static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
000348    BtShared *pBt = p->pBt;
000349    BtLock *pLock = 0;
000350    BtLock *pIter;
000351  
000352    assert( sqlite3BtreeHoldsMutex(p) );
000353    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000354    assert( p->db!=0 );
000355  
000356    /* A connection with the read-uncommitted flag set will never try to
000357    ** obtain a read-lock using this function. The only read-lock obtained
000358    ** by a connection in read-uncommitted mode is on the sqlite_master 
000359    ** table, and that lock is obtained in BtreeBeginTrans().  */
000360    assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
000361  
000362    /* This function should only be called on a sharable b-tree after it 
000363    ** has been determined that no other b-tree holds a conflicting lock.  */
000364    assert( p->sharable );
000365    assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
000366  
000367    /* First search the list for an existing lock on this table. */
000368    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000369      if( pIter->iTable==iTable && pIter->pBtree==p ){
000370        pLock = pIter;
000371        break;
000372      }
000373    }
000374  
000375    /* If the above search did not find a BtLock struct associating Btree p
000376    ** with table iTable, allocate one and link it into the list.
000377    */
000378    if( !pLock ){
000379      pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
000380      if( !pLock ){
000381        return SQLITE_NOMEM_BKPT;
000382      }
000383      pLock->iTable = iTable;
000384      pLock->pBtree = p;
000385      pLock->pNext = pBt->pLock;
000386      pBt->pLock = pLock;
000387    }
000388  
000389    /* Set the BtLock.eLock variable to the maximum of the current lock
000390    ** and the requested lock. This means if a write-lock was already held
000391    ** and a read-lock requested, we don't incorrectly downgrade the lock.
000392    */
000393    assert( WRITE_LOCK>READ_LOCK );
000394    if( eLock>pLock->eLock ){
000395      pLock->eLock = eLock;
000396    }
000397  
000398    return SQLITE_OK;
000399  }
000400  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000401  
000402  #ifndef SQLITE_OMIT_SHARED_CACHE
000403  /*
000404  ** Release all the table locks (locks obtained via calls to
000405  ** the setSharedCacheTableLock() procedure) held by Btree object p.
000406  **
000407  ** This function assumes that Btree p has an open read or write 
000408  ** transaction. If it does not, then the BTS_PENDING flag
000409  ** may be incorrectly cleared.
000410  */
000411  static void clearAllSharedCacheTableLocks(Btree *p){
000412    BtShared *pBt = p->pBt;
000413    BtLock **ppIter = &pBt->pLock;
000414  
000415    assert( sqlite3BtreeHoldsMutex(p) );
000416    assert( p->sharable || 0==*ppIter );
000417    assert( p->inTrans>0 );
000418  
000419    while( *ppIter ){
000420      BtLock *pLock = *ppIter;
000421      assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
000422      assert( pLock->pBtree->inTrans>=pLock->eLock );
000423      if( pLock->pBtree==p ){
000424        *ppIter = pLock->pNext;
000425        assert( pLock->iTable!=1 || pLock==&p->lock );
000426        if( pLock->iTable!=1 ){
000427          sqlite3_free(pLock);
000428        }
000429      }else{
000430        ppIter = &pLock->pNext;
000431      }
000432    }
000433  
000434    assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
000435    if( pBt->pWriter==p ){
000436      pBt->pWriter = 0;
000437      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000438    }else if( pBt->nTransaction==2 ){
000439      /* This function is called when Btree p is concluding its 
000440      ** transaction. If there currently exists a writer, and p is not
000441      ** that writer, then the number of locks held by connections other
000442      ** than the writer must be about to drop to zero. In this case
000443      ** set the BTS_PENDING flag to 0.
000444      **
000445      ** If there is not currently a writer, then BTS_PENDING must
000446      ** be zero already. So this next line is harmless in that case.
000447      */
000448      pBt->btsFlags &= ~BTS_PENDING;
000449    }
000450  }
000451  
000452  /*
000453  ** This function changes all write-locks held by Btree p into read-locks.
000454  */
000455  static void downgradeAllSharedCacheTableLocks(Btree *p){
000456    BtShared *pBt = p->pBt;
000457    if( pBt->pWriter==p ){
000458      BtLock *pLock;
000459      pBt->pWriter = 0;
000460      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000461      for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
000462        assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
000463        pLock->eLock = READ_LOCK;
000464      }
000465    }
000466  }
000467  
000468  #endif /* SQLITE_OMIT_SHARED_CACHE */
000469  
000470  static void releasePage(MemPage *pPage);         /* Forward reference */
000471  static void releasePageOne(MemPage *pPage);      /* Forward reference */
000472  static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
000473  
000474  /*
000475  ***** This routine is used inside of assert() only ****
000476  **
000477  ** Verify that the cursor holds the mutex on its BtShared
000478  */
000479  #ifdef SQLITE_DEBUG
000480  static int cursorHoldsMutex(BtCursor *p){
000481    return sqlite3_mutex_held(p->pBt->mutex);
000482  }
000483  
000484  /* Verify that the cursor and the BtShared agree about what is the current
000485  ** database connetion. This is important in shared-cache mode. If the database 
000486  ** connection pointers get out-of-sync, it is possible for routines like
000487  ** btreeInitPage() to reference an stale connection pointer that references a
000488  ** a connection that has already closed.  This routine is used inside assert()
000489  ** statements only and for the purpose of double-checking that the btree code
000490  ** does keep the database connection pointers up-to-date.
000491  */
000492  static int cursorOwnsBtShared(BtCursor *p){
000493    assert( cursorHoldsMutex(p) );
000494    return (p->pBtree->db==p->pBt->db);
000495  }
000496  #endif
000497  
/*
** Invalidate the overflow page-list cache of the cursor passed as the
** only argument by clearing its BTCF_ValidOvfl flag.
**
** The argument is parenthesized in the expansion so that any expression
** that yields a cursor pointer may be passed, not just a plain identifier.
** The value of the macro is the updated curFlags value.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
000503  
000504  /*
000505  ** Invalidate the overflow page-list cache for all cursors opened
000506  ** on the shared btree structure pBt.
000507  */
000508  static void invalidateAllOverflowCache(BtShared *pBt){
000509    BtCursor *p;
000510    assert( sqlite3_mutex_held(pBt->mutex) );
000511    for(p=pBt->pCursor; p; p=p->pNext){
000512      invalidateOverflowCache(p);
000513    }
000514  }
000515  
000516  #ifndef SQLITE_OMIT_INCRBLOB
000517  /*
000518  ** This function is called before modifying the contents of a table
000519  ** to invalidate any incrblob cursors that are open on the
000520  ** row or one of the rows being modified.
000521  **
000522  ** If argument isClearTable is true, then the entire contents of the
000523  ** table is about to be deleted. In this case invalidate all incrblob
000524  ** cursors open on any row within the table with root-page pgnoRoot.
000525  **
000526  ** Otherwise, if argument isClearTable is false, then the row with
000527  ** rowid iRow is being replaced or deleted. In this case invalidate
000528  ** only those incrblob cursors open on that specific row.
000529  */
000530  static void invalidateIncrblobCursors(
000531    Btree *pBtree,          /* The database file to check */
000532    Pgno pgnoRoot,          /* The table that might be changing */
000533    i64 iRow,               /* The rowid that might be changing */
000534    int isClearTable        /* True if all rows are being deleted */
000535  ){
000536    BtCursor *p;
000537    if( pBtree->hasIncrblobCur==0 ) return;
000538    assert( sqlite3BtreeHoldsMutex(pBtree) );
000539    pBtree->hasIncrblobCur = 0;
000540    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000541      if( (p->curFlags & BTCF_Incrblob)!=0 ){
000542        pBtree->hasIncrblobCur = 1;
000543        if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
000544          p->eState = CURSOR_INVALID;
000545        }
000546      }
000547    }
000548  }
000549  
000550  #else
000551    /* Stub function when INCRBLOB is omitted */
000552    #define invalidateIncrblobCursors(w,x,y,z)
000553  #endif /* SQLITE_OMIT_INCRBLOB */
000554  
000555  /*
000556  ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 
000557  ** when a page that previously contained data becomes a free-list leaf 
000558  ** page.
000559  **
000560  ** The BtShared.pHasContent bitvec exists to work around an obscure
000561  ** bug caused by the interaction of two useful IO optimizations surrounding
000562  ** free-list leaf pages:
000563  **
000564  **   1) When all data is deleted from a page and the page becomes
000565  **      a free-list leaf page, the page is not written to the database
000566  **      (as free-list leaf pages contain no meaningful data). Sometimes
000567  **      such a page is not even journalled (as it will not be modified,
000568  **      why bother journalling it?).
000569  **
000570  **   2) When a free-list leaf page is reused, its content is not read
000571  **      from the database or written to the journal file (why should it
000572  **      be, if it is not at all meaningful?).
000573  **
000574  ** By themselves, these optimizations work fine and provide a handy
000575  ** performance boost to bulk delete or insert operations. However, if
000576  ** a page is moved to the free-list and then reused within the same
000577  ** transaction, a problem comes up. If the page is not journalled when
000578  ** it is moved to the free-list and it is also not journalled when it
000579  ** is extracted from the free-list and reused, then the original data
000580  ** may be lost. In the event of a rollback, it may not be possible
000581  ** to restore the database to its original configuration.
000582  **
000583  ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 
000584  ** moved to become a free-list leaf page, the corresponding bit is
000585  ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
000586  ** optimization 2 above is omitted if the corresponding bit is already
000587  ** set in BtShared.pHasContent. The contents of the bitvec are cleared
000588  ** at the end of every transaction.
000589  */
000590  static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
000591    int rc = SQLITE_OK;
000592    if( !pBt->pHasContent ){
000593      assert( pgno<=pBt->nPage );
000594      pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
000595      if( !pBt->pHasContent ){
000596        rc = SQLITE_NOMEM_BKPT;
000597      }
000598    }
000599    if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
000600      rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
000601    }
000602    return rc;
000603  }
000604  
000605  /*
000606  ** Query the BtShared.pHasContent vector.
000607  **
000608  ** This function is called when a free-list leaf page is removed from the
000609  ** free-list for reuse. It returns false if it is safe to retrieve the
000610  ** page from the pager layer with the 'no-content' flag set. True otherwise.
000611  */
000612  static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
000613    Bitvec *p = pBt->pHasContent;
000614    return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
000615  }
000616  
/*
** Clear (destroy) the BtShared.pHasContent bitvec. This should be
** invoked at the conclusion of each write-transaction.
**
** The pointer is reset to 0 afterwards, so that the next call to
** btreeSetHasContent() creates a fresh bitvec.
*/
static void btreeClearHasContent(BtShared *pBt){
  sqlite3BitvecDestroy(pBt->pHasContent);
  pBt->pHasContent = 0;
}
000625  
000626  /*
000627  ** Release all of the apPage[] pages for a cursor.
000628  */
000629  static void btreeReleaseAllCursorPages(BtCursor *pCur){
000630    int i;
000631    if( pCur->iPage>=0 ){
000632      for(i=0; i<pCur->iPage; i++){
000633        releasePageNotNull(pCur->apPage[i]);
000634      }
000635      releasePageNotNull(pCur->pPage);
000636      pCur->iPage = -1;
000637    }
000638  }
000639  
000640  /*
000641  ** The cursor passed as the only argument must point to a valid entry
000642  ** when this function is called (i.e. have eState==CURSOR_VALID). This
000643  ** function saves the current cursor key in variables pCur->nKey and
000644  ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 
000645  ** code otherwise.
000646  **
000647  ** If the cursor is open on an intkey table, then the integer key
000648  ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
000649  ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 
000650  ** set to point to a malloced buffer pCur->nKey bytes in size containing 
000651  ** the key.
000652  */
000653  static int saveCursorKey(BtCursor *pCur){
000654    int rc = SQLITE_OK;
000655    assert( CURSOR_VALID==pCur->eState );
000656    assert( 0==pCur->pKey );
000657    assert( cursorHoldsMutex(pCur) );
000658  
000659    if( pCur->curIntKey ){
000660      /* Only the rowid is required for a table btree */
000661      pCur->nKey = sqlite3BtreeIntegerKey(pCur);
000662    }else{
000663      /* For an index btree, save the complete key content. It is possible
000664      ** that the current key is corrupt. In that case, it is possible that
000665      ** the sqlite3VdbeRecordUnpack() function may overread the buffer by
000666      ** up to the size of 1 varint plus 1 8-byte value when the cursor 
000667      ** position is restored. Hence the 17 bytes of padding allocated 
000668      ** below. */
000669      void *pKey;
000670      pCur->nKey = sqlite3BtreePayloadSize(pCur);
000671      pKey = sqlite3Malloc( pCur->nKey + 9 + 8 );
000672      if( pKey ){
000673        rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
000674        if( rc==SQLITE_OK ){
000675          memset(((u8*)pKey)+pCur->nKey, 0, 9+8);
000676          pCur->pKey = pKey;
000677        }else{
000678          sqlite3_free(pKey);
000679        }
000680      }else{
000681        rc = SQLITE_NOMEM_BKPT;
000682      }
000683    }
000684    assert( !pCur->curIntKey || !pCur->pKey );
000685    return rc;
000686  }
000687  
000688  /*
000689  ** Save the current cursor position in the variables BtCursor.nKey 
000690  ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
000691  **
000692  ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
000693  ** prior to calling this routine.  
000694  */
000695  static int saveCursorPosition(BtCursor *pCur){
000696    int rc;
000697  
000698    assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
000699    assert( 0==pCur->pKey );
000700    assert( cursorHoldsMutex(pCur) );
000701  
000702    if( pCur->eState==CURSOR_SKIPNEXT ){
000703      pCur->eState = CURSOR_VALID;
000704    }else{
000705      pCur->skipNext = 0;
000706    }
000707  
000708    rc = saveCursorKey(pCur);
000709    if( rc==SQLITE_OK ){
000710      btreeReleaseAllCursorPages(pCur);
000711      pCur->eState = CURSOR_REQUIRESEEK;
000712    }
000713  
000714    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
000715    return rc;
000716  }
000717  
000718  /* Forward reference */
000719  static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
000720  
000721  /*
000722  ** Save the positions of all cursors (except pExcept) that are open on
000723  ** the table with root-page iRoot.  "Saving the cursor position" means that
000724  ** the location in the btree is remembered in such a way that it can be
000725  ** moved back to the same spot after the btree has been modified.  This
000726  ** routine is called just before cursor pExcept is used to modify the
000727  ** table, for example in BtreeDelete() or BtreeInsert().
000728  **
000729  ** If there are two or more cursors on the same btree, then all such 
000730  ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
000731  ** routine enforces that rule.  This routine only needs to be called in
000732  ** the uncommon case when pExpect has the BTCF_Multiple flag set.
000733  **
000734  ** If pExpect!=NULL and if no other cursors are found on the same root-page,
000735  ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
000736  ** pointless call to this routine.
000737  **
000738  ** Implementation note:  This routine merely checks to see if any cursors
000739  ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
000740  ** event that cursors are in need to being saved.
000741  */
000742  static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
000743    BtCursor *p;
000744    assert( sqlite3_mutex_held(pBt->mutex) );
000745    assert( pExcept==0 || pExcept->pBt==pBt );
000746    for(p=pBt->pCursor; p; p=p->pNext){
000747      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
000748    }
000749    if( p ) return saveCursorsOnList(p, iRoot, pExcept);
000750    if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
000751    return SQLITE_OK;
000752  }
000753  
000754  /* This helper routine to saveAllCursors does the actual work of saving
000755  ** the cursors if and when a cursor is found that actually requires saving.
000756  ** The common case is that no cursors need to be saved, so this routine is
000757  ** broken out from its caller to avoid unnecessary stack pointer movement.
000758  */
000759  static int SQLITE_NOINLINE saveCursorsOnList(
000760    BtCursor *p,         /* The first cursor that needs saving */
000761    Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
000762    BtCursor *pExcept    /* Do not save this cursor */
000763  ){
000764    do{
000765      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
000766        if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
000767          int rc = saveCursorPosition(p);
000768          if( SQLITE_OK!=rc ){
000769            return rc;
000770          }
000771        }else{
000772          testcase( p->iPage>=0 );
000773          btreeReleaseAllCursorPages(p);
000774        }
000775      }
000776      p = p->pNext;
000777    }while( p );
000778    return SQLITE_OK;
000779  }
000780  
000781  /*
000782  ** Clear the current cursor position.
000783  */
000784  void sqlite3BtreeClearCursor(BtCursor *pCur){
000785    assert( cursorHoldsMutex(pCur) );
000786    sqlite3_free(pCur->pKey);
000787    pCur->pKey = 0;
000788    pCur->eState = CURSOR_INVALID;
000789  }
000790  
000791  /*
000792  ** In this version of BtreeMoveto, pKey is a packed index record
000793  ** such as is generated by the OP_MakeRecord opcode.  Unpack the
000794  ** record and then call BtreeMovetoUnpacked() to do the work.
000795  */
000796  static int btreeMoveto(
000797    BtCursor *pCur,     /* Cursor open on the btree to be searched */
000798    const void *pKey,   /* Packed key if the btree is an index */
000799    i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
000800    int bias,           /* Bias search to the high end */
000801    int *pRes           /* Write search results here */
000802  ){
000803    int rc;                    /* Status code */
000804    UnpackedRecord *pIdxKey;   /* Unpacked index key */
000805  
000806    if( pKey ){
000807      KeyInfo *pKeyInfo = pCur->pKeyInfo;
000808      assert( nKey==(i64)(int)nKey );
000809      pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
000810      if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
000811      sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
000812      if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
000813        rc = SQLITE_CORRUPT_BKPT;
000814        goto moveto_done;
000815      }
000816    }else{
000817      pIdxKey = 0;
000818    }
000819    rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
000820  moveto_done:
000821    if( pIdxKey ){
000822      sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
000823    }
000824    return rc;
000825  }
000826  
000827  /*
000828  ** Restore the cursor to the position it was in (or as close to as possible)
000829  ** when saveCursorPosition() was called. Note that this call deletes the 
000830  ** saved position info stored by saveCursorPosition(), so there can be
000831  ** at most one effective restoreCursorPosition() call after each 
000832  ** saveCursorPosition().
000833  */
000834  static int btreeRestoreCursorPosition(BtCursor *pCur){
000835    int rc;
000836    int skipNext = 0;
000837    assert( cursorOwnsBtShared(pCur) );
000838    assert( pCur->eState>=CURSOR_REQUIRESEEK );
000839    if( pCur->eState==CURSOR_FAULT ){
000840      return pCur->skipNext;
000841    }
000842    pCur->eState = CURSOR_INVALID;
000843    if( sqlite3FaultSim(410) ){
000844      rc = SQLITE_IOERR;
000845    }else{
000846      rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
000847    }
000848    if( rc==SQLITE_OK ){
000849      sqlite3_free(pCur->pKey);
000850      pCur->pKey = 0;
000851      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
000852      if( skipNext ) pCur->skipNext = skipNext;
000853      if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
000854        pCur->eState = CURSOR_SKIPNEXT;
000855      }
000856    }
000857    return rc;
000858  }
000859  
/*
** Restore cursor p via btreeRestoreCursorPosition() when its state is
** CURSOR_REQUIRESEEK or greater; otherwise evaluate to SQLITE_OK without
** touching the cursor.  The argument is parenthesized so that any pointer
** expression may be passed, but note that it is evaluated twice.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
000864  
000865  /*
000866  ** Determine whether or not a cursor has moved from the position where
000867  ** it was last placed, or has been invalidated for any other reason.
000868  ** Cursors can move when the row they are pointing at is deleted out
000869  ** from under them, for example.  Cursor might also move if a btree
000870  ** is rebalanced.
000871  **
000872  ** Calling this routine with a NULL cursor pointer returns false.
000873  **
000874  ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
000875  ** back to where it ought to be if this routine returns true.
000876  */
000877  int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
000878    assert( EIGHT_BYTE_ALIGNMENT(pCur)
000879         || pCur==sqlite3BtreeFakeValidCursor() );
000880    assert( offsetof(BtCursor, eState)==0 );
000881    assert( sizeof(pCur->eState)==1 );
000882    return CURSOR_VALID != *(u8*)pCur;
000883  }
000884  
000885  /*
000886  ** Return a pointer to a fake BtCursor object that will always answer
000887  ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
000888  ** cursor returned must not be used with any other Btree interface.
000889  */
000890  BtCursor *sqlite3BtreeFakeValidCursor(void){
000891    static u8 fakeCursor = CURSOR_VALID;
000892    assert( offsetof(BtCursor, eState)==0 );
000893    return (BtCursor*)&fakeCursor;
000894  }
000895  
000896  /*
000897  ** This routine restores a cursor back to its original position after it
000898  ** has been moved by some outside activity (such as a btree rebalance or
000899  ** a row having been deleted out from under the cursor).  
000900  **
000901  ** On success, the *pDifferentRow parameter is false if the cursor is left
000902  ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
000903  ** was pointing to has been deleted, forcing the cursor to point to some
000904  ** nearby row.
000905  **
000906  ** This routine should only be called for a cursor that just returned
000907  ** TRUE from sqlite3BtreeCursorHasMoved().
000908  */
000909  int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
000910    int rc;
000911  
000912    assert( pCur!=0 );
000913    assert( pCur->eState!=CURSOR_VALID );
000914    rc = restoreCursorPosition(pCur);
000915    if( rc ){
000916      *pDifferentRow = 1;
000917      return rc;
000918    }
000919    if( pCur->eState!=CURSOR_VALID ){
000920      *pDifferentRow = 1;
000921    }else{
000922      *pDifferentRow = 0;
000923    }
000924    return SQLITE_OK;
000925  }
000926  
000927  #ifdef SQLITE_ENABLE_CURSOR_HINTS
000928  /*
000929  ** Provide hints to the cursor.  The particular hint given (and the type
000930  ** and number of the varargs parameters) is determined by the eHintType
000931  ** parameter.  See the definitions of the BTREE_HINT_* macros for details.
000932  */
000933  void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
000934    /* Used only by system that substitute their own storage engine */
000935  }
000936  #endif
000937  
000938  /*
000939  ** Provide flag hints to the cursor.
000940  */
000941  void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
000942    assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
000943    pCur->hints = x;
000944  }
000945  
000946  
000947  #ifndef SQLITE_OMIT_AUTOVACUUM
000948  /*
000949  ** Given a page number of a regular database page, return the page
000950  ** number for the pointer-map page that contains the entry for the
000951  ** input page number.
000952  **
000953  ** Return 0 (not a valid page) for pgno==1 since there is
000954  ** no pointer map associated with page 1.  The integrity_check logic
000955  ** requires that ptrmapPageno(*,1)!=1.
000956  */
000957  static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
000958    int nPagesPerMapPage;
000959    Pgno iPtrMap, ret;
000960    assert( sqlite3_mutex_held(pBt->mutex) );
000961    if( pgno<2 ) return 0;
000962    nPagesPerMapPage = (pBt->usableSize/5)+1;
000963    iPtrMap = (pgno-2)/nPagesPerMapPage;
000964    ret = (iPtrMap*nPagesPerMapPage) + 2; 
000965    if( ret==PENDING_BYTE_PAGE(pBt) ){
000966      ret++;
000967    }
000968    return ret;
000969  }
000970  
000971  /*
000972  ** Write an entry into the pointer map.
000973  **
000974  ** This routine updates the pointer map entry for page number 'key'
000975  ** so that it maps to type 'eType' and parent page number 'pgno'.
000976  **
000977  ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
000978  ** a no-op.  If an error occurs, the appropriate error code is written
000979  ** into *pRC.
000980  */
000981  static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
000982    DbPage *pDbPage;  /* The pointer map page */
000983    u8 *pPtrmap;      /* The pointer map data */
000984    Pgno iPtrmap;     /* The pointer map page number */
000985    int offset;       /* Offset in pointer map page */
000986    int rc;           /* Return code from subfunctions */
000987  
000988    if( *pRC ) return;
000989  
000990    assert( sqlite3_mutex_held(pBt->mutex) );
000991    /* The master-journal page number must never be used as a pointer map page */
000992    assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
000993  
000994    assert( pBt->autoVacuum );
000995    if( key==0 ){
000996      *pRC = SQLITE_CORRUPT_BKPT;
000997      return;
000998    }
000999    iPtrmap = PTRMAP_PAGENO(pBt, key);
001000    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
001001    if( rc!=SQLITE_OK ){
001002      *pRC = rc;
001003      return;
001004    }
001005    if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
001006      /* The first byte of the extra data is the MemPage.isInit byte.
001007      ** If that byte is set, it means this page is also being used
001008      ** as a btree page. */
001009      *pRC = SQLITE_CORRUPT_BKPT;
001010      goto ptrmap_exit;
001011    }
001012    offset = PTRMAP_PTROFFSET(iPtrmap, key);
001013    if( offset<0 ){
001014      *pRC = SQLITE_CORRUPT_BKPT;
001015      goto ptrmap_exit;
001016    }
001017    assert( offset <= (int)pBt->usableSize-5 );
001018    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
001019  
001020    if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
001021      TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
001022      *pRC= rc = sqlite3PagerWrite(pDbPage);
001023      if( rc==SQLITE_OK ){
001024        pPtrmap[offset] = eType;
001025        put4byte(&pPtrmap[offset+1], parent);
001026      }
001027    }
001028  
001029  ptrmap_exit:
001030    sqlite3PagerUnref(pDbPage);
001031  }
001032  
001033  /*
001034  ** Read an entry from the pointer map.
001035  **
001036  ** This routine retrieves the pointer map entry for page 'key', writing
001037  ** the type and parent page number to *pEType and *pPgno respectively.
001038  ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
001039  */
001040  static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
001041    DbPage *pDbPage;   /* The pointer map page */
001042    int iPtrmap;       /* Pointer map page index */
001043    u8 *pPtrmap;       /* Pointer map page data */
001044    int offset;        /* Offset of entry in pointer map */
001045    int rc;
001046  
001047    assert( sqlite3_mutex_held(pBt->mutex) );
001048  
001049    iPtrmap = PTRMAP_PAGENO(pBt, key);
001050    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
001051    if( rc!=0 ){
001052      return rc;
001053    }
001054    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
001055  
001056    offset = PTRMAP_PTROFFSET(iPtrmap, key);
001057    if( offset<0 ){
001058      sqlite3PagerUnref(pDbPage);
001059      return SQLITE_CORRUPT_BKPT;
001060    }
001061    assert( offset <= (int)pBt->usableSize-5 );
001062    assert( pEType!=0 );
001063    *pEType = pPtrmap[offset];
001064    if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
001065  
001066    sqlite3PagerUnref(pDbPage);
001067    if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
001068    return SQLITE_OK;
001069  }
001070  
001071  #else /* if defined SQLITE_OMIT_AUTOVACUUM */
001072    #define ptrmapPut(w,x,y,z,rc)
001073    #define ptrmapGet(w,x,y,z) SQLITE_OK
001074    #define ptrmapPutOvflPtr(x, y, z, rc)
001075  #endif
001076  
/*
** Given a btree page P and a cell index I (0 means the first cell on
** the page, 1 means the second cell, and so forth) yield a pointer
** to the cell content.  The 2-byte entry I of the cell pointer array
** P->aCellIdx is read, masked with P->maskPage, and used as an index
** into P->aData.
**
** findCellPastPtr() does the same except that it indexes P->aDataOfst,
** which skips past the initial 4-byte child pointer found on interior
** pages, if there is one.
**
** These macros work only for pages that do not contain overflow cells.
** P is evaluated more than once.
*/
#define findCell(P,I) \
  (&(P)->aData[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[(I)*2])])
#define findCellPastPtr(P,I) \
  (&(P)->aDataOfst[(P)->maskPage & get2byteAligned(&(P)->aCellIdx[(I)*2])])
001091  
001092  
001093  /*
001094  ** This is common tail processing for btreeParseCellPtr() and
001095  ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
001096  ** on a single B-tree page.  Make necessary adjustments to the CellInfo
001097  ** structure.
001098  */
001099  static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
001100    MemPage *pPage,         /* Page containing the cell */
001101    u8 *pCell,              /* Pointer to the cell text. */
001102    CellInfo *pInfo         /* Fill in this structure */
001103  ){
001104    /* If the payload will not fit completely on the local page, we have
001105    ** to decide how much to store locally and how much to spill onto
001106    ** overflow pages.  The strategy is to minimize the amount of unused
001107    ** space on overflow pages while keeping the amount of local storage
001108    ** in between minLocal and maxLocal.
001109    **
001110    ** Warning:  changing the way overflow payload is distributed in any
001111    ** way will result in an incompatible file format.
001112    */
001113    int minLocal;  /* Minimum amount of payload held locally */
001114    int maxLocal;  /* Maximum amount of payload held locally */
001115    int surplus;   /* Overflow payload available for local storage */
001116  
001117    minLocal = pPage->minLocal;
001118    maxLocal = pPage->maxLocal;
001119    surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
001120    testcase( surplus==maxLocal );
001121    testcase( surplus==maxLocal+1 );
001122    if( surplus <= maxLocal ){
001123      pInfo->nLocal = (u16)surplus;
001124    }else{
001125      pInfo->nLocal = (u16)minLocal;
001126    }
001127    pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
001128  }
001129  
001130  /*
001131  ** The following routines are implementations of the MemPage.xParseCell()
001132  ** method.
001133  **
001134  ** Parse a cell content block and fill in the CellInfo structure.
001135  **
001136  ** btreeParseCellPtr()        =>   table btree leaf nodes
001137  ** btreeParseCellNoPayload()  =>   table btree internal nodes
001138  ** btreeParseCellPtrIndex()   =>   index btree nodes
001139  **
001140  ** There is also a wrapper function btreeParseCell() that works for
001141  ** all MemPage types and that references the cell by index rather than
001142  ** by pointer.
001143  */
001144  static void btreeParseCellPtrNoPayload(
001145    MemPage *pPage,         /* Page containing the cell */
001146    u8 *pCell,              /* Pointer to the cell text. */
001147    CellInfo *pInfo         /* Fill in this structure */
001148  ){
001149    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001150    assert( pPage->leaf==0 );
001151    assert( pPage->childPtrSize==4 );
001152  #ifndef SQLITE_DEBUG
001153    UNUSED_PARAMETER(pPage);
001154  #endif
001155    pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
001156    pInfo->nPayload = 0;
001157    pInfo->nLocal = 0;
001158    pInfo->pPayload = 0;
001159    return;
001160  }
001161  static void btreeParseCellPtr(
001162    MemPage *pPage,         /* Page containing the cell */
001163    u8 *pCell,              /* Pointer to the cell text. */
001164    CellInfo *pInfo         /* Fill in this structure */
001165  ){
001166    u8 *pIter;              /* For scanning through pCell */
001167    u32 nPayload;           /* Number of bytes of cell payload */
001168    u64 iKey;               /* Extracted Key value */
001169  
001170    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001171    assert( pPage->leaf==0 || pPage->leaf==1 );
001172    assert( pPage->intKeyLeaf );
001173    assert( pPage->childPtrSize==0 );
001174    pIter = pCell;
001175  
001176    /* The next block of code is equivalent to:
001177    **
001178    **     pIter += getVarint32(pIter, nPayload);
001179    **
001180    ** The code is inlined to avoid a function call.
001181    */
001182    nPayload = *pIter;
001183    if( nPayload>=0x80 ){
001184      u8 *pEnd = &pIter[8];
001185      nPayload &= 0x7f;
001186      do{
001187        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
001188      }while( (*pIter)>=0x80 && pIter<pEnd );
001189    }
001190    pIter++;
001191  
001192    /* The next block of code is equivalent to:
001193    **
001194    **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
001195    **
001196    ** The code is inlined to avoid a function call.
001197    */
001198    iKey = *pIter;
001199    if( iKey>=0x80 ){
001200      u8 *pEnd = &pIter[7];
001201      iKey &= 0x7f;
001202      while(1){
001203        iKey = (iKey<<7) | (*++pIter & 0x7f);
001204        if( (*pIter)<0x80 ) break;
001205        if( pIter>=pEnd ){
001206          iKey = (iKey<<8) | *++pIter;
001207          break;
001208        }
001209      }
001210    }
001211    pIter++;
001212  
001213    pInfo->nKey = *(i64*)&iKey;
001214    pInfo->nPayload = nPayload;
001215    pInfo->pPayload = pIter;
001216    testcase( nPayload==pPage->maxLocal );
001217    testcase( nPayload==pPage->maxLocal+1 );
001218    if( nPayload<=pPage->maxLocal ){
001219      /* This is the (easy) common case where the entire payload fits
001220      ** on the local page.  No overflow is required.
001221      */
001222      pInfo->nSize = nPayload + (u16)(pIter - pCell);
001223      if( pInfo->nSize<4 ) pInfo->nSize = 4;
001224      pInfo->nLocal = (u16)nPayload;
001225    }else{
001226      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
001227    }
001228  }
001229  static void btreeParseCellPtrIndex(
001230    MemPage *pPage,         /* Page containing the cell */
001231    u8 *pCell,              /* Pointer to the cell text. */
001232    CellInfo *pInfo         /* Fill in this structure */
001233  ){
001234    u8 *pIter;              /* For scanning through pCell */
001235    u32 nPayload;           /* Number of bytes of cell payload */
001236  
001237    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001238    assert( pPage->leaf==0 || pPage->leaf==1 );
001239    assert( pPage->intKeyLeaf==0 );
001240    pIter = pCell + pPage->childPtrSize;
001241    nPayload = *pIter;
001242    if( nPayload>=0x80 ){
001243      u8 *pEnd = &pIter[8];
001244      nPayload &= 0x7f;
001245      do{
001246        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
001247      }while( *(pIter)>=0x80 && pIter<pEnd );
001248    }
001249    pIter++;
001250    pInfo->nKey = nPayload;
001251    pInfo->nPayload = nPayload;
001252    pInfo->pPayload = pIter;
001253    testcase( nPayload==pPage->maxLocal );
001254    testcase( nPayload==pPage->maxLocal+1 );
001255    if( nPayload<=pPage->maxLocal ){
001256      /* This is the (easy) common case where the entire payload fits
001257      ** on the local page.  No overflow is required.
001258      */
001259      pInfo->nSize = nPayload + (u16)(pIter - pCell);
001260      if( pInfo->nSize<4 ) pInfo->nSize = 4;
001261      pInfo->nLocal = (u16)nPayload;
001262    }else{
001263      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
001264    }
001265  }
001266  static void btreeParseCell(
001267    MemPage *pPage,         /* Page containing the cell */
001268    int iCell,              /* The cell index.  First cell is 0 */
001269    CellInfo *pInfo         /* Fill in this structure */
001270  ){
001271    pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
001272  }
001273  
001274  /*
001275  ** The following routines are implementations of the MemPage.xCellSize
001276  ** method.
001277  **
001278  ** Compute the total number of bytes that a Cell needs in the cell
001279  ** data area of the btree-page.  The return number includes the cell
001280  ** data header and the local payload, but not any overflow page or
001281  ** the space used by the cell pointer.
001282  **
001283  ** cellSizePtrNoPayload()    =>   table internal nodes
001284  ** cellSizePtr()             =>   all index nodes & table leaf nodes
001285  */
001286  static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
001287    u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
001288    u8 *pEnd;                                /* End mark for a varint */
001289    u32 nSize;                               /* Size value to return */
001290  
001291  #ifdef SQLITE_DEBUG
001292    /* The value returned by this function should always be the same as
001293    ** the (CellInfo.nSize) value found by doing a full parse of the
001294    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001295    ** this function verifies that this invariant is not violated. */
001296    CellInfo debuginfo;
001297    pPage->xParseCell(pPage, pCell, &debuginfo);
001298  #endif
001299  
001300    nSize = *pIter;
001301    if( nSize>=0x80 ){
001302      pEnd = &pIter[8];
001303      nSize &= 0x7f;
001304      do{
001305        nSize = (nSize<<7) | (*++pIter & 0x7f);
001306      }while( *(pIter)>=0x80 && pIter<pEnd );
001307    }
001308    pIter++;
001309    if( pPage->intKey ){
001310      /* pIter now points at the 64-bit integer key value, a variable length 
001311      ** integer. The following block moves pIter to point at the first byte
001312      ** past the end of the key value. */
001313      pEnd = &pIter[9];
001314      while( (*pIter++)&0x80 && pIter<pEnd );
001315    }
001316    testcase( nSize==pPage->maxLocal );
001317    testcase( nSize==pPage->maxLocal+1 );
001318    if( nSize<=pPage->maxLocal ){
001319      nSize += (u32)(pIter - pCell);
001320      if( nSize<4 ) nSize = 4;
001321    }else{
001322      int minLocal = pPage->minLocal;
001323      nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
001324      testcase( nSize==pPage->maxLocal );
001325      testcase( nSize==pPage->maxLocal+1 );
001326      if( nSize>pPage->maxLocal ){
001327        nSize = minLocal;
001328      }
001329      nSize += 4 + (u16)(pIter - pCell);
001330    }
001331    assert( nSize==debuginfo.nSize || CORRUPT_DB );
001332    return (u16)nSize;
001333  }
001334  static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
001335    u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
001336    u8 *pEnd;              /* End mark for a varint */
001337  
001338  #ifdef SQLITE_DEBUG
001339    /* The value returned by this function should always be the same as
001340    ** the (CellInfo.nSize) value found by doing a full parse of the
001341    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001342    ** this function verifies that this invariant is not violated. */
001343    CellInfo debuginfo;
001344    pPage->xParseCell(pPage, pCell, &debuginfo);
001345  #else
001346    UNUSED_PARAMETER(pPage);
001347  #endif
001348  
001349    assert( pPage->childPtrSize==4 );
001350    pEnd = pIter + 9;
001351    while( (*pIter++)&0x80 && pIter<pEnd );
001352    assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
001353    return (u16)(pIter - pCell);
001354  }
001355  
001356  
001357  #ifdef SQLITE_DEBUG
001358  /* This variation on cellSizePtr() is used inside of assert() statements
001359  ** only. */
001360  static u16 cellSize(MemPage *pPage, int iCell){
001361    return pPage->xCellSize(pPage, findCell(pPage, iCell));
001362  }
001363  #endif
001364  
001365  #ifndef SQLITE_OMIT_AUTOVACUUM
001366  /*
001367  ** The cell pCell is currently part of page pSrc but will ultimately be part
001368  ** of pPage.  (pSrc and pPager are often the same.)  If pCell contains a
001369  ** pointer to an overflow page, insert an entry into the pointer-map for
001370  ** the overflow page that will be valid after pCell has been moved to pPage.
001371  */
001372  static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
001373    CellInfo info;
001374    if( *pRC ) return;
001375    assert( pCell!=0 );
001376    pPage->xParseCell(pPage, pCell, &info);
001377    if( info.nLocal<info.nPayload ){
001378      Pgno ovfl;
001379      if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
001380        testcase( pSrc!=pPage );
001381        *pRC = SQLITE_CORRUPT_BKPT;
001382        return;
001383      }
001384      ovfl = get4byte(&pCell[info.nSize-4]);
001385      ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
001386    }
001387  }
001388  #endif
001389  
001390  
001391  /*
001392  ** Defragment the page given. This routine reorganizes cells within the
001393  ** page so that there are no free-blocks on the free-block list.
001394  **
001395  ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
001396  ** present in the page after this routine returns.
001397  **
001398  ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
001399  ** b-tree page so that there are no freeblocks or fragment bytes, all
001400  ** unused bytes are contained in the unallocated space region, and all
001401  ** cells are packed tightly at the end of the page.
001402  */
001403  static int defragmentPage(MemPage *pPage, int nMaxFrag){
001404    int i;                     /* Loop counter */
001405    int pc;                    /* Address of the i-th cell */
001406    int hdr;                   /* Offset to the page header */
001407    int size;                  /* Size of a cell */
001408    int usableSize;            /* Number of usable bytes on a page */
001409    int cellOffset;            /* Offset to the cell pointer array */
001410    int cbrk;                  /* Offset to the cell content area */
001411    int nCell;                 /* Number of cells on the page */
001412    unsigned char *data;       /* The page data */
001413    unsigned char *temp;       /* Temp area for cell content */
001414    unsigned char *src;        /* Source of content */
001415    int iCellFirst;            /* First allowable cell index */
001416    int iCellLast;             /* Last possible cell index */
001417  
001418    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001419    assert( pPage->pBt!=0 );
001420    assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
001421    assert( pPage->nOverflow==0 );
001422    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001423    temp = 0;
001424    src = data = pPage->aData;
001425    hdr = pPage->hdrOffset;
001426    cellOffset = pPage->cellOffset;
001427    nCell = pPage->nCell;
001428    assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
001429    iCellFirst = cellOffset + 2*nCell;
001430    usableSize = pPage->pBt->usableSize;
001431  
001432    /* This block handles pages with two or fewer free blocks and nMaxFrag
001433    ** or fewer fragmented bytes. In this case it is faster to move the
001434    ** two (or one) blocks of cells using memmove() and add the required
001435    ** offsets to each pointer in the cell-pointer array than it is to 
001436    ** reconstruct the entire page.  */
001437    if( (int)data[hdr+7]<=nMaxFrag ){
001438      int iFree = get2byte(&data[hdr+1]);
001439      if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
001440      if( iFree ){
001441        int iFree2 = get2byte(&data[iFree]);
001442        if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
001443        if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
001444          u8 *pEnd = &data[cellOffset + nCell*2];
001445          u8 *pAddr;
001446          int sz2 = 0;
001447          int sz = get2byte(&data[iFree+2]);
001448          int top = get2byte(&data[hdr+5]);
001449          if( top>=iFree ){
001450            return SQLITE_CORRUPT_PAGE(pPage);
001451          }
001452          if( iFree2 ){
001453            if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
001454            sz2 = get2byte(&data[iFree2+2]);
001455            if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
001456            memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
001457            sz += sz2;
001458          }else if( iFree+sz>usableSize ){
001459            return SQLITE_CORRUPT_PAGE(pPage);
001460          }
001461  
001462          cbrk = top+sz;
001463          assert( cbrk+(iFree-top) <= usableSize );
001464          memmove(&data[cbrk], &data[top], iFree-top);
001465          for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
001466            pc = get2byte(pAddr);
001467            if( pc<iFree ){ put2byte(pAddr, pc+sz); }
001468            else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
001469          }
001470          goto defragment_out;
001471        }
001472      }
001473    }
001474  
001475    cbrk = usableSize;
001476    iCellLast = usableSize - 4;
001477    for(i=0; i<nCell; i++){
001478      u8 *pAddr;     /* The i-th cell pointer */
001479      pAddr = &data[cellOffset + i*2];
001480      pc = get2byte(pAddr);
001481      testcase( pc==iCellFirst );
001482      testcase( pc==iCellLast );
001483      /* These conditions have already been verified in btreeInitPage()
001484      ** if PRAGMA cell_size_check=ON.
001485      */
001486      if( pc<iCellFirst || pc>iCellLast ){
001487        return SQLITE_CORRUPT_PAGE(pPage);
001488      }
001489      assert( pc>=iCellFirst && pc<=iCellLast );
001490      size = pPage->xCellSize(pPage, &src[pc]);
001491      cbrk -= size;
001492      if( cbrk<iCellFirst || pc+size>usableSize ){
001493        return SQLITE_CORRUPT_PAGE(pPage);
001494      }
001495      assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
001496      testcase( cbrk+size==usableSize );
001497      testcase( pc+size==usableSize );
001498      put2byte(pAddr, cbrk);
001499      if( temp==0 ){
001500        int x;
001501        if( cbrk==pc ) continue;
001502        temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
001503        x = get2byte(&data[hdr+5]);
001504        memcpy(&temp[x], &data[x], (cbrk+size) - x);
001505        src = temp;
001506      }
001507      memcpy(&data[cbrk], &src[pc], size);
001508    }
001509    data[hdr+7] = 0;
001510  
001511   defragment_out:
001512    assert( pPage->nFree>=0 );
001513    if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
001514      return SQLITE_CORRUPT_PAGE(pPage);
001515    }
001516    assert( cbrk>=iCellFirst );
001517    put2byte(&data[hdr+5], cbrk);
001518    data[hdr+1] = 0;
001519    data[hdr+2] = 0;
001520    memset(&data[iCellFirst], 0, cbrk-iCellFirst);
001521    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001522    return SQLITE_OK;
001523  }
001524  
001525  /*
001526  ** Search the free-list on page pPg for space to store a cell nByte bytes in
001527  ** size. If one can be found, return a pointer to the space and remove it
001528  ** from the free-list.
001529  **
001530  ** If no suitable space can be found on the free-list, return NULL.
001531  **
001532  ** This function may detect corruption within pPg.  If corruption is
001533  ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
001534  **
001535  ** Slots on the free list that are between 1 and 3 bytes larger than nByte
001536  ** will be ignored if adding the extra space to the fragmentation count
001537  ** causes the fragmentation count to exceed 60.
001538  */
001539  static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
001540    const int hdr = pPg->hdrOffset;            /* Offset to page header */
001541    u8 * const aData = pPg->aData;             /* Page data */
001542    int iAddr = hdr + 1;                       /* Address of ptr to pc */
001543    int pc = get2byte(&aData[iAddr]);          /* Address of a free slot */
001544    int x;                                     /* Excess size of the slot */
001545    int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
001546    int size;                                  /* Size of the free slot */
001547  
001548    assert( pc>0 );
001549    while( pc<=maxPC ){
001550      /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
001551      ** freeblock form a big-endian integer which is the size of the freeblock
001552      ** in bytes, including the 4-byte header. */
001553      size = get2byte(&aData[pc+2]);
001554      if( (x = size - nByte)>=0 ){
001555        testcase( x==4 );
001556        testcase( x==3 );
001557        if( x<4 ){
001558          /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
001559          ** number of bytes in fragments may not exceed 60. */
001560          if( aData[hdr+7]>57 ) return 0;
001561  
001562          /* Remove the slot from the free-list. Update the number of
001563          ** fragmented bytes within the page. */
001564          memcpy(&aData[iAddr], &aData[pc], 2);
001565          aData[hdr+7] += (u8)x;
001566        }else if( x+pc > maxPC ){
001567          /* This slot extends off the end of the usable part of the page */
001568          *pRc = SQLITE_CORRUPT_PAGE(pPg);
001569          return 0;
001570        }else{
001571          /* The slot remains on the free-list. Reduce its size to account
001572          ** for the portion used by the new allocation. */
001573          put2byte(&aData[pc+2], x);
001574        }
001575        return &aData[pc + x];
001576      }
001577      iAddr = pc;
001578      pc = get2byte(&aData[pc]);
001579      if( pc<=iAddr+size ){
001580        if( pc ){
001581          /* The next slot in the chain is not past the end of the current slot */
001582          *pRc = SQLITE_CORRUPT_PAGE(pPg);
001583        }
001584        return 0;
001585      }
001586    }
001587    if( pc>maxPC+nByte-4 ){
001588      /* The free slot chain extends off the end of the page */
001589      *pRc = SQLITE_CORRUPT_PAGE(pPg);
001590    }
001591    return 0;
001592  }
001593  
001594  /*
001595  ** Allocate nByte bytes of space from within the B-Tree page passed
001596  ** as the first argument. Write into *pIdx the index into pPage->aData[]
001597  ** of the first byte of allocated space. Return either SQLITE_OK or
001598  ** an error code (usually SQLITE_CORRUPT).
001599  **
001600  ** The caller guarantees that there is sufficient space to make the
001601  ** allocation.  This routine might need to defragment in order to bring
001602  ** all the space together, however.  This routine will avoid using
001603  ** the first two bytes past the cell pointer area since presumably this
001604  ** allocation is being made in order to insert a new cell, so we will
001605  ** also end up needing a new cell pointer.
001606  */
001607  static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
001608    const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
001609    u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
001610    int top;                             /* First byte of cell content area */
001611    int rc = SQLITE_OK;                  /* Integer return code */
001612    int gap;        /* First byte of gap between cell pointers and cell content */
001613    
001614    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001615    assert( pPage->pBt );
001616    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001617    assert( nByte>=0 );  /* Minimum cell size is 4 */
001618    assert( pPage->nFree>=nByte );
001619    assert( pPage->nOverflow==0 );
001620    assert( nByte < (int)(pPage->pBt->usableSize-8) );
001621  
001622    assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
001623    gap = pPage->cellOffset + 2*pPage->nCell;
001624    assert( gap<=65536 );
001625    /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
001626    ** and the reserved space is zero (the usual value for reserved space)
001627    ** then the cell content offset of an empty page wants to be 65536.
001628    ** However, that integer is too large to be stored in a 2-byte unsigned
001629    ** integer, so a value of 0 is used in its place. */
001630    top = get2byte(&data[hdr+5]);
001631    assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */
001632    if( gap>top ){
001633      if( top==0 && pPage->pBt->usableSize==65536 ){
001634        top = 65536;
001635      }else{
001636        return SQLITE_CORRUPT_PAGE(pPage);
001637      }
001638    }
001639  
001640    /* If there is enough space between gap and top for one more cell pointer,
001641    ** and if the freelist is not empty, then search the
001642    ** freelist looking for a slot big enough to satisfy the request.
001643    */
001644    testcase( gap+2==top );
001645    testcase( gap+1==top );
001646    testcase( gap==top );
001647    if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
001648      u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
001649      if( pSpace ){
001650        assert( pSpace+nByte<=data+pPage->pBt->usableSize );
001651        if( (*pIdx = (int)(pSpace-data))<=gap ){
001652          return SQLITE_CORRUPT_PAGE(pPage);
001653        }else{
001654          return SQLITE_OK;
001655        }
001656      }else if( rc ){
001657        return rc;
001658      }
001659    }
001660  
001661    /* The request could not be fulfilled using a freelist slot.  Check
001662    ** to see if defragmentation is necessary.
001663    */
001664    testcase( gap+2+nByte==top );
001665    if( gap+2+nByte>top ){
001666      assert( pPage->nCell>0 || CORRUPT_DB );
001667      assert( pPage->nFree>=0 );
001668      rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
001669      if( rc ) return rc;
001670      top = get2byteNotZero(&data[hdr+5]);
001671      assert( gap+2+nByte<=top );
001672    }
001673  
001674  
001675    /* Allocate memory from the gap in between the cell pointer array
001676    ** and the cell content area.  The btreeComputeFreeSpace() call has already
001677    ** validated the freelist.  Given that the freelist is valid, there
001678    ** is no way that the allocation can extend off the end of the page.
001679    ** The assert() below verifies the previous sentence.
001680    */
001681    top -= nByte;
001682    put2byte(&data[hdr+5], top);
001683    assert( top+nByte <= (int)pPage->pBt->usableSize );
001684    *pIdx = top;
001685    return SQLITE_OK;
001686  }
001687  
001688  /*
001689  ** Return a section of the pPage->aData to the freelist.
001690  ** The first byte of the new free block is pPage->aData[iStart]
001691  ** and the size of the block is iSize bytes.
001692  **
001693  ** Adjacent freeblocks are coalesced.
001694  **
001695  ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
001696  ** that routine will not detect overlap between cells or freeblocks.  Nor
001697  ** does it detect cells or freeblocks that encrouch into the reserved bytes
001698  ** at the end of the page.  So do additional corruption checks inside this
001699  ** routine and return SQLITE_CORRUPT if any problems are found.
001700  */
001701  static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
001702    u16 iPtr;                             /* Address of ptr to next freeblock */
001703    u16 iFreeBlk;                         /* Address of the next freeblock */
001704    u8 hdr;                               /* Page header size.  0 or 100 */
001705    u8 nFrag = 0;                         /* Reduction in fragmentation */
001706    u16 iOrigSize = iSize;                /* Original value of iSize */
001707    u16 x;                                /* Offset to cell content area */
001708    u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
001709    unsigned char *data = pPage->aData;   /* Page content */
001710  
001711    assert( pPage->pBt!=0 );
001712    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001713    assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
001714    assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
001715    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001716    assert( iSize>=4 );   /* Minimum cell size is 4 */
001717    assert( iStart<=pPage->pBt->usableSize-4 );
001718  
001719    /* The list of freeblocks must be in ascending order.  Find the 
001720    ** spot on the list where iStart should be inserted.
001721    */
001722    hdr = pPage->hdrOffset;
001723    iPtr = hdr + 1;
001724    if( data[iPtr+1]==0 && data[iPtr]==0 ){
001725      iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
001726    }else{
001727      while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
001728        if( iFreeBlk<iPtr+4 ){
001729          if( iFreeBlk==0 ) break;
001730          return SQLITE_CORRUPT_PAGE(pPage);
001731        }
001732        iPtr = iFreeBlk;
001733      }
001734      if( iFreeBlk>pPage->pBt->usableSize-4 ){
001735        return SQLITE_CORRUPT_PAGE(pPage);
001736      }
001737      assert( iFreeBlk>iPtr || iFreeBlk==0 );
001738    
001739      /* At this point:
001740      **    iFreeBlk:   First freeblock after iStart, or zero if none
001741      **    iPtr:       The address of a pointer to iFreeBlk
001742      **
001743      ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
001744      */
001745      if( iFreeBlk && iEnd+3>=iFreeBlk ){
001746        nFrag = iFreeBlk - iEnd;
001747        if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
001748        iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
001749        if( iEnd > pPage->pBt->usableSize ){
001750          return SQLITE_CORRUPT_PAGE(pPage);
001751        }
001752        iSize = iEnd - iStart;
001753        iFreeBlk = get2byte(&data[iFreeBlk]);
001754      }
001755    
001756      /* If iPtr is another freeblock (that is, if iPtr is not the freelist
001757      ** pointer in the page header) then check to see if iStart should be
001758      ** coalesced onto the end of iPtr.
001759      */
001760      if( iPtr>hdr+1 ){
001761        int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
001762        if( iPtrEnd+3>=iStart ){
001763          if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
001764          nFrag += iStart - iPtrEnd;
001765          iSize = iEnd - iPtr;
001766          iStart = iPtr;
001767        }
001768      }
001769      if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
001770      data[hdr+7] -= nFrag;
001771    }
001772    x = get2byte(&data[hdr+5]);
001773    if( iStart<=x ){
001774      /* The new freeblock is at the beginning of the cell content area,
001775      ** so just extend the cell content area rather than create another
001776      ** freelist entry */
001777      if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
001778      put2byte(&data[hdr+1], iFreeBlk);
001779      put2byte(&data[hdr+5], iEnd);
001780    }else{
001781      /* Insert the new freeblock into the freelist */
001782      put2byte(&data[iPtr], iStart);
001783    }
001784    if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
001785      /* Overwrite deleted information with zeros when the secure_delete
001786      ** option is enabled */
001787      memset(&data[iStart], 0, iSize);
001788    }
001789    put2byte(&data[iStart], iFreeBlk);
001790    put2byte(&data[iStart+2], iSize);
001791    pPage->nFree += iOrigSize;
001792    return SQLITE_OK;
001793  }
001794  
001795  /*
001796  ** Decode the flags byte (the first byte of the header) for a page
001797  ** and initialize fields of the MemPage structure accordingly.
001798  **
001799  ** Only the following combinations are supported.  Anything different
001800  ** indicates a corrupt database files:
001801  **
001802  **         PTF_ZERODATA
001803  **         PTF_ZERODATA | PTF_LEAF
001804  **         PTF_LEAFDATA | PTF_INTKEY
001805  **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
001806  */
001807  static int decodeFlags(MemPage *pPage, int flagByte){
001808    BtShared *pBt;     /* A copy of pPage->pBt */
001809  
001810    assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
001811    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001812    pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
001813    flagByte &= ~PTF_LEAF;
001814    pPage->childPtrSize = 4-4*pPage->leaf;
001815    pPage->xCellSize = cellSizePtr;
001816    pBt = pPage->pBt;
001817    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
001818      /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
001819      ** interior table b-tree page. */
001820      assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
001821      /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
001822      ** leaf table b-tree page. */
001823      assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
001824      pPage->intKey = 1;
001825      if( pPage->leaf ){
001826        pPage->intKeyLeaf = 1;
001827        pPage->xParseCell = btreeParseCellPtr;
001828      }else{
001829        pPage->intKeyLeaf = 0;
001830        pPage->xCellSize = cellSizePtrNoPayload;
001831        pPage->xParseCell = btreeParseCellPtrNoPayload;
001832      }
001833      pPage->maxLocal = pBt->maxLeaf;
001834      pPage->minLocal = pBt->minLeaf;
001835    }else if( flagByte==PTF_ZERODATA ){
001836      /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
001837      ** interior index b-tree page. */
001838      assert( (PTF_ZERODATA)==2 );
001839      /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
001840      ** leaf index b-tree page. */
001841      assert( (PTF_ZERODATA|PTF_LEAF)==10 );
001842      pPage->intKey = 0;
001843      pPage->intKeyLeaf = 0;
001844      pPage->xParseCell = btreeParseCellPtrIndex;
001845      pPage->maxLocal = pBt->maxLocal;
001846      pPage->minLocal = pBt->minLocal;
001847    }else{
001848      /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
001849      ** an error. */
001850      return SQLITE_CORRUPT_PAGE(pPage);
001851    }
001852    pPage->max1bytePayload = pBt->max1bytePayload;
001853    return SQLITE_OK;
001854  }
001855  
001856  /*
001857  ** Compute the amount of freespace on the page.  In other words, fill
001858  ** in the pPage->nFree field.
001859  */
001860  static int btreeComputeFreeSpace(MemPage *pPage){
001861    int pc;            /* Address of a freeblock within pPage->aData[] */
001862    u8 hdr;            /* Offset to beginning of page header */
001863    u8 *data;          /* Equal to pPage->aData */
001864    int usableSize;    /* Amount of usable space on each page */
001865    int nFree;         /* Number of unused bytes on the page */
001866    int top;           /* First byte of the cell content area */
001867    int iCellFirst;    /* First allowable cell or freeblock offset */
001868    int iCellLast;     /* Last possible cell or freeblock offset */
001869  
001870    assert( pPage->pBt!=0 );
001871    assert( pPage->pBt->db!=0 );
001872    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001873    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
001874    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
001875    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
001876    assert( pPage->isInit==1 );
001877    assert( pPage->nFree<0 );
001878  
001879    usableSize = pPage->pBt->usableSize;
001880    hdr = pPage->hdrOffset;
001881    data = pPage->aData;
001882    /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
001883    ** the start of the cell content area. A zero value for this integer is
001884    ** interpreted as 65536. */
001885    top = get2byteNotZero(&data[hdr+5]);
001886    iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
001887    iCellLast = usableSize - 4;
001888  
001889    /* Compute the total free space on the page
001890    ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
001891    ** start of the first freeblock on the page, or is zero if there are no
001892    ** freeblocks. */
001893    pc = get2byte(&data[hdr+1]);
001894    nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
001895    if( pc>0 ){
001896      u32 next, size;
001897      if( pc<iCellFirst ){
001898        /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
001899        ** always be at least one cell before the first freeblock.
001900        */
001901        return SQLITE_CORRUPT_PAGE(pPage); 
001902      }
001903      while( 1 ){
001904        if( pc>iCellLast ){
001905          /* Freeblock off the end of the page */
001906          return SQLITE_CORRUPT_PAGE(pPage);
001907        }
001908        next = get2byte(&data[pc]);
001909        size = get2byte(&data[pc+2]);
001910        nFree = nFree + size;
001911        if( next<=pc+size+3 ) break;
001912        pc = next;
001913      }
001914      if( next>0 ){
001915        /* Freeblock not in ascending order */
001916        return SQLITE_CORRUPT_PAGE(pPage);
001917      }
001918      if( pc+size>(unsigned int)usableSize ){
001919        /* Last freeblock extends past page end */
001920        return SQLITE_CORRUPT_PAGE(pPage);
001921      }
001922    }
001923  
001924    /* At this point, nFree contains the sum of the offset to the start
001925    ** of the cell-content area plus the number of free bytes within
001926    ** the cell-content area. If this is greater than the usable-size
001927    ** of the page, then the page must be corrupted. This check also
001928    ** serves to verify that the offset to the start of the cell-content
001929    ** area, according to the page header, lies within the page.
001930    */
001931    if( nFree>usableSize || nFree<iCellFirst ){
001932      return SQLITE_CORRUPT_PAGE(pPage);
001933    }
001934    pPage->nFree = (u16)(nFree - iCellFirst);
001935    return SQLITE_OK;
001936  }
001937  
001938  /*
001939  ** Do additional sanity check after btreeInitPage() if
001940  ** PRAGMA cell_size_check=ON 
001941  */
001942  static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
001943    int iCellFirst;    /* First allowable cell or freeblock offset */
001944    int iCellLast;     /* Last possible cell or freeblock offset */
001945    int i;             /* Index into the cell pointer array */
001946    int sz;            /* Size of a cell */
001947    int pc;            /* Address of a freeblock within pPage->aData[] */
001948    u8 *data;          /* Equal to pPage->aData */
001949    int usableSize;    /* Maximum usable space on the page */
001950    int cellOffset;    /* Start of cell content area */
001951  
001952    iCellFirst = pPage->cellOffset + 2*pPage->nCell;
001953    usableSize = pPage->pBt->usableSize;
001954    iCellLast = usableSize - 4;
001955    data = pPage->aData;
001956    cellOffset = pPage->cellOffset;
001957    if( !pPage->leaf ) iCellLast--;
001958    for(i=0; i<pPage->nCell; i++){
001959      pc = get2byteAligned(&data[cellOffset+i*2]);
001960      testcase( pc==iCellFirst );
001961      testcase( pc==iCellLast );
001962      if( pc<iCellFirst || pc>iCellLast ){
001963        return SQLITE_CORRUPT_PAGE(pPage);
001964      }
001965      sz = pPage->xCellSize(pPage, &data[pc]);
001966      testcase( pc+sz==usableSize );
001967      if( pc+sz>usableSize ){
001968        return SQLITE_CORRUPT_PAGE(pPage);
001969      }
001970    }
001971    return SQLITE_OK;
001972  }
001973  
001974  /*
001975  ** Initialize the auxiliary information for a disk block.
001976  **
001977  ** Return SQLITE_OK on success.  If we see that the page does
001978  ** not contain a well-formed database page, then return 
001979  ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
001980  ** guarantee that the page is well-formed.  It only shows that
001981  ** we failed to detect any corruption.
001982  */
001983  static int btreeInitPage(MemPage *pPage){
001984    u8 *data;          /* Equal to pPage->aData */
001985    BtShared *pBt;        /* The main btree structure */
001986  
001987    assert( pPage->pBt!=0 );
001988    assert( pPage->pBt->db!=0 );
001989    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001990    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
001991    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
001992    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
001993    assert( pPage->isInit==0 );
001994  
001995    pBt = pPage->pBt;
001996    data = pPage->aData + pPage->hdrOffset;
001997    /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
001998    ** the b-tree page type. */
001999    if( decodeFlags(pPage, data[0]) ){
002000      return SQLITE_CORRUPT_PAGE(pPage);
002001    }
002002    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
002003    pPage->maskPage = (u16)(pBt->pageSize - 1);
002004    pPage->nOverflow = 0;
002005    pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
002006    pPage->aCellIdx = data + pPage->childPtrSize + 8;
002007    pPage->aDataEnd = pPage->aData + pBt->usableSize;
002008    pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
002009    /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
002010    ** number of cells on the page. */
002011    pPage->nCell = get2byte(&data[3]);
002012    if( pPage->nCell>MX_CELL(pBt) ){
002013      /* To many cells for a single page.  The page must be corrupt */
002014      return SQLITE_CORRUPT_PAGE(pPage);
002015    }
002016    testcase( pPage->nCell==MX_CELL(pBt) );
002017    /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
002018    ** possible for a root page of a table that contains no rows) then the
002019    ** offset to the cell content area will equal the page size minus the
002020    ** bytes of reserved space. */
002021    assert( pPage->nCell>0
002022         || get2byteNotZero(&data[5])==(int)pBt->usableSize
002023         || CORRUPT_DB );
002024    pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
002025    pPage->isInit = 1;
002026    if( pBt->db->flags & SQLITE_CellSizeCk ){
002027      return btreeCellSizeCheck(pPage);
002028    }
002029    return SQLITE_OK;
002030  }
002031  
002032  /*
002033  ** Set up a raw page so that it looks like a database page holding
002034  ** no entries.
002035  */
002036  static void zeroPage(MemPage *pPage, int flags){
002037    unsigned char *data = pPage->aData;
002038    BtShared *pBt = pPage->pBt;
002039    u8 hdr = pPage->hdrOffset;
002040    u16 first;
002041  
002042    assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
002043    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002044    assert( sqlite3PagerGetData(pPage->pDbPage) == data );
002045    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
002046    assert( sqlite3_mutex_held(pBt->mutex) );
002047    if( pBt->btsFlags & BTS_FAST_SECURE ){
002048      memset(&data[hdr], 0, pBt->usableSize - hdr);
002049    }
002050    data[hdr] = (char)flags;
002051    first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
002052    memset(&data[hdr+1], 0, 4);
002053    data[hdr+7] = 0;
002054    put2byte(&data[hdr+5], pBt->usableSize);
002055    pPage->nFree = (u16)(pBt->usableSize - first);
002056    decodeFlags(pPage, flags);
002057    pPage->cellOffset = first;
002058    pPage->aDataEnd = &data[pBt->usableSize];
002059    pPage->aCellIdx = &data[first];
002060    pPage->aDataOfst = &data[pPage->childPtrSize];
002061    pPage->nOverflow = 0;
002062    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
002063    pPage->maskPage = (u16)(pBt->pageSize - 1);
002064    pPage->nCell = 0;
002065    pPage->isInit = 1;
002066  }
002067  
002068  
002069  /*
002070  ** Convert a DbPage obtained from the pager into a MemPage used by
002071  ** the btree layer.
002072  */
002073  static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
002074    MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002075    if( pgno!=pPage->pgno ){
002076      pPage->aData = sqlite3PagerGetData(pDbPage);
002077      pPage->pDbPage = pDbPage;
002078      pPage->pBt = pBt;
002079      pPage->pgno = pgno;
002080      pPage->hdrOffset = pgno==1 ? 100 : 0;
002081    }
002082    assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
002083    return pPage; 
002084  }
002085  
002086  /*
002087  ** Get a page from the pager.  Initialize the MemPage.pBt and
002088  ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
002089  **
002090  ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
002091  ** about the content of the page at this time.  So do not go to the disk
002092  ** to fetch the content.  Just fill in the content with zeros for now.
002093  ** If in the future we call sqlite3PagerWrite() on this page, that
002094  ** means we have started to be concerned about content and the disk
002095  ** read should occur at that point.
002096  */
002097  static int btreeGetPage(
002098    BtShared *pBt,       /* The btree */
002099    Pgno pgno,           /* Number of the page to fetch */
002100    MemPage **ppPage,    /* Return the page in this parameter */
002101    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002102  ){
002103    int rc;
002104    DbPage *pDbPage;
002105  
002106    assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
002107    assert( sqlite3_mutex_held(pBt->mutex) );
002108    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
002109    if( rc ) return rc;
002110    *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
002111    return SQLITE_OK;
002112  }
002113  
002114  /*
002115  ** Retrieve a page from the pager cache. If the requested page is not
002116  ** already in the pager cache return NULL. Initialize the MemPage.pBt and
002117  ** MemPage.aData elements if needed.
002118  */
002119  static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
002120    DbPage *pDbPage;
002121    assert( sqlite3_mutex_held(pBt->mutex) );
002122    pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
002123    if( pDbPage ){
002124      return btreePageFromDbPage(pDbPage, pgno, pBt);
002125    }
002126    return 0;
002127  }
002128  
002129  /*
002130  ** Return the size of the database file in pages. If there is any kind of
002131  ** error, return ((unsigned int)-1).
002132  */
002133  static Pgno btreePagecount(BtShared *pBt){
002134    return pBt->nPage;
002135  }
002136  u32 sqlite3BtreeLastPage(Btree *p){
002137    assert( sqlite3BtreeHoldsMutex(p) );
002138    assert( ((p->pBt->nPage)&0x80000000)==0 );
002139    return btreePagecount(p->pBt);
002140  }
002141  
002142  /*
002143  ** Get a page from the pager and initialize it.
002144  **
002145  ** If pCur!=0 then the page is being fetched as part of a moveToChild()
002146  ** call.  Do additional sanity checking on the page in this case.
002147  ** And if the fetch fails, this routine must decrement pCur->iPage.
002148  **
002149  ** The page is fetched as read-write unless pCur is not NULL and is
002150  ** a read-only cursor.
002151  **
002152  ** If an error occurs, then *ppPage is undefined. It
002153  ** may remain unchanged, or it may be set to an invalid value.
002154  */
002155  static int getAndInitPage(
002156    BtShared *pBt,                  /* The database file */
002157    Pgno pgno,                      /* Number of the page to get */
002158    MemPage **ppPage,               /* Write the page pointer here */
002159    BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
002160    int bReadOnly                   /* True for a read-only page */
002161  ){
002162    int rc;
002163    DbPage *pDbPage;
002164    assert( sqlite3_mutex_held(pBt->mutex) );
002165    assert( pCur==0 || ppPage==&pCur->pPage );
002166    assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
002167    assert( pCur==0 || pCur->iPage>0 );
002168  
002169    if( pgno>btreePagecount(pBt) ){
002170      rc = SQLITE_CORRUPT_BKPT;
002171      goto getAndInitPage_error1;
002172    }
002173    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
002174    if( rc ){
002175      goto getAndInitPage_error1;
002176    }
002177    *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002178    if( (*ppPage)->isInit==0 ){
002179      btreePageFromDbPage(pDbPage, pgno, pBt);
002180      rc = btreeInitPage(*ppPage);
002181      if( rc!=SQLITE_OK ){
002182        goto getAndInitPage_error2;
002183      }
002184    }
002185    assert( (*ppPage)->pgno==pgno );
002186    assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
002187  
002188    /* If obtaining a child page for a cursor, we must verify that the page is
002189    ** compatible with the root page. */
002190    if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
002191      rc = SQLITE_CORRUPT_PGNO(pgno);
002192      goto getAndInitPage_error2;
002193    }
002194    return SQLITE_OK;
002195  
002196  getAndInitPage_error2:
002197    releasePage(*ppPage);
002198  getAndInitPage_error1:
002199    if( pCur ){
002200      pCur->iPage--;
002201      pCur->pPage = pCur->apPage[pCur->iPage];
002202    }
002203    testcase( pgno==0 );
002204    assert( pgno!=0 || rc==SQLITE_CORRUPT );
002205    return rc;
002206  }
002207  
002208  /*
002209  ** Release a MemPage.  This should be called once for each prior
002210  ** call to btreeGetPage.
002211  **
002212  ** Page1 is a special case and must be released using releasePageOne().
002213  */
002214  static void releasePageNotNull(MemPage *pPage){
002215    assert( pPage->aData );
002216    assert( pPage->pBt );
002217    assert( pPage->pDbPage!=0 );
002218    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002219    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002220    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002221    sqlite3PagerUnrefNotNull(pPage->pDbPage);
002222  }
002223  static void releasePage(MemPage *pPage){
002224    if( pPage ) releasePageNotNull(pPage);
002225  }
002226  static void releasePageOne(MemPage *pPage){
002227    assert( pPage!=0 );
002228    assert( pPage->aData );
002229    assert( pPage->pBt );
002230    assert( pPage->pDbPage!=0 );
002231    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002232    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002233    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002234    sqlite3PagerUnrefPageOne(pPage->pDbPage);
002235  }
002236  
002237  /*
002238  ** Get an unused page.
002239  **
002240  ** This works just like btreeGetPage() with the addition:
002241  **
002242  **   *  If the page is already in use for some other purpose, immediately
002243  **      release it and return an SQLITE_CURRUPT error.
002244  **   *  Make sure the isInit flag is clear
002245  */
002246  static int btreeGetUnusedPage(
002247    BtShared *pBt,       /* The btree */
002248    Pgno pgno,           /* Number of the page to fetch */
002249    MemPage **ppPage,    /* Return the page in this parameter */
002250    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002251  ){
002252    int rc = btreeGetPage(pBt, pgno, ppPage, flags);
002253    if( rc==SQLITE_OK ){
002254      if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
002255        releasePage(*ppPage);
002256        *ppPage = 0;
002257        return SQLITE_CORRUPT_BKPT;
002258      }
002259      (*ppPage)->isInit = 0;
002260    }else{
002261      *ppPage = 0;
002262    }
002263    return rc;
002264  }
002265  
002266  
002267  /*
002268  ** During a rollback, when the pager reloads information into the cache
002269  ** so that the cache is restored to its original state at the start of
002270  ** the transaction, for each page restored this routine is called.
002271  **
002272  ** This routine needs to reset the extra data section at the end of the
002273  ** page to agree with the restored data.
002274  */
002275  static void pageReinit(DbPage *pData){
002276    MemPage *pPage;
002277    pPage = (MemPage *)sqlite3PagerGetExtra(pData);
002278    assert( sqlite3PagerPageRefcount(pData)>0 );
002279    if( pPage->isInit ){
002280      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002281      pPage->isInit = 0;
002282      if( sqlite3PagerPageRefcount(pData)>1 ){
002283        /* pPage might not be a btree page;  it might be an overflow page
002284        ** or ptrmap page or a free page.  In those cases, the following
002285        ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
002286        ** But no harm is done by this.  And it is very important that
002287        ** btreeInitPage() be called on every btree page so we make
002288        ** the call for every page that comes in for re-initing. */
002289        btreeInitPage(pPage);
002290      }
002291    }
002292  }
002293  
002294  /*
002295  ** Invoke the busy handler for a btree.
002296  */
002297  static int btreeInvokeBusyHandler(void *pArg){
002298    BtShared *pBt = (BtShared*)pArg;
002299    assert( pBt->db );
002300    assert( sqlite3_mutex_held(pBt->db->mutex) );
002301    return sqlite3InvokeBusyHandler(&pBt->db->busyHandler,
002302                                    sqlite3PagerFile(pBt->pPager));
002303  }
002304  
002305  /*
002306  ** Open a database file.
002307  ** 
002308  ** zFilename is the name of the database file.  If zFilename is NULL
002309  ** then an ephemeral database is created.  The ephemeral database might
002310  ** be exclusively in memory, or it might use a disk-based memory cache.
002311  ** Either way, the ephemeral database will be automatically deleted 
002312  ** when sqlite3BtreeClose() is called.
002313  **
002314  ** If zFilename is ":memory:" then an in-memory database is created
002315  ** that is automatically destroyed when it is closed.
002316  **
002317  ** The "flags" parameter is a bitmask that might contain bits like
002318  ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
002319  **
002320  ** If the database is already opened in the same database connection
002321  ** and we are in shared cache mode, then the open will fail with an
002322  ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
002323  ** objects in the same database connection since doing so will lead
002324  ** to problems with locking.
002325  */
002326  int sqlite3BtreeOpen(
002327    sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
002328    const char *zFilename,  /* Name of the file containing the BTree database */
002329    sqlite3 *db,            /* Associated database handle */
002330    Btree **ppBtree,        /* Pointer to new Btree object written here */
002331    int flags,              /* Options */
002332    int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
002333  ){
002334    BtShared *pBt = 0;             /* Shared part of btree structure */
002335    Btree *p;                      /* Handle to return */
002336    sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
002337    int rc = SQLITE_OK;            /* Result code from this function */
002338    u8 nReserve;                   /* Byte of unused space on each page */
002339    unsigned char zDbHeader[100];  /* Database header content */
002340  
002341    /* True if opening an ephemeral, temporary database */
002342    const int isTempDb = zFilename==0 || zFilename[0]==0;
002343  
002344    /* Set the variable isMemdb to true for an in-memory database, or 
002345    ** false for a file-based database.
002346    */
002347  #ifdef SQLITE_OMIT_MEMORYDB
002348    const int isMemdb = 0;
002349  #else
002350    const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
002351                         || (isTempDb && sqlite3TempInMemory(db))
002352                         || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
002353  #endif
002354  
002355    assert( db!=0 );
002356    assert( pVfs!=0 );
002357    assert( sqlite3_mutex_held(db->mutex) );
002358    assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
002359  
002360    /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
002361    assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
002362  
002363    /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
002364    assert( (flags & BTREE_SINGLE)==0 || isTempDb );
002365  
002366    if( isMemdb ){
002367      flags |= BTREE_MEMORY;
002368    }
002369    if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
002370      vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
002371    }
002372    p = sqlite3MallocZero(sizeof(Btree));
002373    if( !p ){
002374      return SQLITE_NOMEM_BKPT;
002375    }
002376    p->inTrans = TRANS_NONE;
002377    p->db = db;
002378  #ifndef SQLITE_OMIT_SHARED_CACHE
002379    p->lock.pBtree = p;
002380    p->lock.iTable = 1;
002381  #endif
002382  
002383  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002384    /*
002385    ** If this Btree is a candidate for shared cache, try to find an
002386    ** existing BtShared object that we can share with
002387    */
002388    if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
002389      if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
002390        int nFilename = sqlite3Strlen30(zFilename)+1;
002391        int nFullPathname = pVfs->mxPathname+1;
002392        char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
002393        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002394  
002395        p->sharable = 1;
002396        if( !zFullPathname ){
002397          sqlite3_free(p);
002398          return SQLITE_NOMEM_BKPT;
002399        }
002400        if( isMemdb ){
002401          memcpy(zFullPathname, zFilename, nFilename);
002402        }else{
002403          rc = sqlite3OsFullPathname(pVfs, zFilename,
002404                                     nFullPathname, zFullPathname);
002405          if( rc ){
002406            if( rc==SQLITE_OK_SYMLINK ){
002407              rc = SQLITE_OK;
002408            }else{
002409              sqlite3_free(zFullPathname);
002410              sqlite3_free(p);
002411              return rc;
002412            }
002413          }
002414        }
002415  #if SQLITE_THREADSAFE
002416        mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
002417        sqlite3_mutex_enter(mutexOpen);
002418        mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
002419        sqlite3_mutex_enter(mutexShared);
002420  #endif
002421        for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
002422          assert( pBt->nRef>0 );
002423          if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
002424                   && sqlite3PagerVfs(pBt->pPager)==pVfs ){
002425            int iDb;
002426            for(iDb=db->nDb-1; iDb>=0; iDb--){
002427              Btree *pExisting = db->aDb[iDb].pBt;
002428              if( pExisting && pExisting->pBt==pBt ){
002429                sqlite3_mutex_leave(mutexShared);
002430                sqlite3_mutex_leave(mutexOpen);
002431                sqlite3_free(zFullPathname);
002432                sqlite3_free(p);
002433                return SQLITE_CONSTRAINT;
002434              }
002435            }
002436            p->pBt = pBt;
002437            pBt->nRef++;
002438            break;
002439          }
002440        }
002441        sqlite3_mutex_leave(mutexShared);
002442        sqlite3_free(zFullPathname);
002443      }
002444  #ifdef SQLITE_DEBUG
002445      else{
002446        /* In debug mode, we mark all persistent databases as sharable
002447        ** even when they are not.  This exercises the locking code and
002448        ** gives more opportunity for asserts(sqlite3_mutex_held())
002449        ** statements to find locking problems.
002450        */
002451        p->sharable = 1;
002452      }
002453  #endif
002454    }
002455  #endif
002456    if( pBt==0 ){
002457      /*
002458      ** The following asserts make sure that structures used by the btree are
002459      ** the right size.  This is to guard against size changes that result
002460      ** when compiling on a different architecture.
002461      */
002462      assert( sizeof(i64)==8 );
002463      assert( sizeof(u64)==8 );
002464      assert( sizeof(u32)==4 );
002465      assert( sizeof(u16)==2 );
002466      assert( sizeof(Pgno)==4 );
002467    
002468      pBt = sqlite3MallocZero( sizeof(*pBt) );
002469      if( pBt==0 ){
002470        rc = SQLITE_NOMEM_BKPT;
002471        goto btree_open_out;
002472      }
002473      rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
002474                            sizeof(MemPage), flags, vfsFlags, pageReinit);
002475      if( rc==SQLITE_OK ){
002476        sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
002477        rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
002478      }
002479      if( rc!=SQLITE_OK ){
002480        goto btree_open_out;
002481      }
002482      pBt->openFlags = (u8)flags;
002483      pBt->db = db;
002484      sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
002485      p->pBt = pBt;
002486    
002487      pBt->pCursor = 0;
002488      pBt->pPage1 = 0;
002489      if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
002490  #if defined(SQLITE_SECURE_DELETE)
002491      pBt->btsFlags |= BTS_SECURE_DELETE;
002492  #elif defined(SQLITE_FAST_SECURE_DELETE)
002493      pBt->btsFlags |= BTS_OVERWRITE;
002494  #endif
002495      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
002496      ** determined by the 2-byte integer located at an offset of 16 bytes from
002497      ** the beginning of the database file. */
002498      pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
002499      if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
002500           || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
002501        pBt->pageSize = 0;
002502  #ifndef SQLITE_OMIT_AUTOVACUUM
002503        /* If the magic name ":memory:" will create an in-memory database, then
002504        ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
002505        ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
002506        ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
002507        ** regular file-name. In this case the auto-vacuum applies as per normal.
002508        */
002509        if( zFilename && !isMemdb ){
002510          pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
002511          pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
002512        }
002513  #endif
002514        nReserve = 0;
002515      }else{
002516        /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
002517        ** determined by the one-byte unsigned integer found at an offset of 20
002518        ** into the database file header. */
002519        nReserve = zDbHeader[20];
002520        pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002521  #ifndef SQLITE_OMIT_AUTOVACUUM
002522        pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
002523        pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
002524  #endif
002525      }
002526      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002527      if( rc ) goto btree_open_out;
002528      pBt->usableSize = pBt->pageSize - nReserve;
002529      assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
002530     
002531  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002532      /* Add the new BtShared object to the linked list sharable BtShareds.
002533      */
002534      pBt->nRef = 1;
002535      if( p->sharable ){
002536        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002537        MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
002538        if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
002539          pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
002540          if( pBt->mutex==0 ){
002541            rc = SQLITE_NOMEM_BKPT;
002542            goto btree_open_out;
002543          }
002544        }
002545        sqlite3_mutex_enter(mutexShared);
002546        pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
002547        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
002548        sqlite3_mutex_leave(mutexShared);
002549      }
002550  #endif
002551    }
002552  
002553  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002554    /* If the new Btree uses a sharable pBtShared, then link the new
002555    ** Btree into the list of all sharable Btrees for the same connection.
002556    ** The list is kept in ascending order by pBt address.
002557    */
002558    if( p->sharable ){
002559      int i;
002560      Btree *pSib;
002561      for(i=0; i<db->nDb; i++){
002562        if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
002563          while( pSib->pPrev ){ pSib = pSib->pPrev; }
002564          if( (uptr)p->pBt<(uptr)pSib->pBt ){
002565            p->pNext = pSib;
002566            p->pPrev = 0;
002567            pSib->pPrev = p;
002568          }else{
002569            while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
002570              pSib = pSib->pNext;
002571            }
002572            p->pNext = pSib->pNext;
002573            p->pPrev = pSib;
002574            if( p->pNext ){
002575              p->pNext->pPrev = p;
002576            }
002577            pSib->pNext = p;
002578          }
002579          break;
002580        }
002581      }
002582    }
002583  #endif
002584    *ppBtree = p;
002585  
002586  btree_open_out:
002587    if( rc!=SQLITE_OK ){
002588      if( pBt && pBt->pPager ){
002589        sqlite3PagerClose(pBt->pPager, 0);
002590      }
002591      sqlite3_free(pBt);
002592      sqlite3_free(p);
002593      *ppBtree = 0;
002594    }else{
002595      sqlite3_file *pFile;
002596  
002597      /* If the B-Tree was successfully opened, set the pager-cache size to the
002598      ** default value. Except, when opening on an existing shared pager-cache,
002599      ** do not change the pager-cache size.
002600      */
002601      if( sqlite3BtreeSchema(p, 0, 0)==0 ){
002602        sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
002603      }
002604  
002605      pFile = sqlite3PagerFile(pBt->pPager);
002606      if( pFile->pMethods ){
002607        sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
002608      }
002609    }
002610    if( mutexOpen ){
002611      assert( sqlite3_mutex_held(mutexOpen) );
002612      sqlite3_mutex_leave(mutexOpen);
002613    }
002614    assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
002615    return rc;
002616  }
002617  
002618  /*
002619  ** Decrement the BtShared.nRef counter.  When it reaches zero,
002620  ** remove the BtShared structure from the sharing list.  Return
002621  ** true if the BtShared.nRef counter reaches zero and return
002622  ** false if it is still positive.
002623  */
002624  static int removeFromSharingList(BtShared *pBt){
002625  #ifndef SQLITE_OMIT_SHARED_CACHE
002626    MUTEX_LOGIC( sqlite3_mutex *pMaster; )
002627    BtShared *pList;
002628    int removed = 0;
002629  
002630    assert( sqlite3_mutex_notheld(pBt->mutex) );
002631    MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
002632    sqlite3_mutex_enter(pMaster);
002633    pBt->nRef--;
002634    if( pBt->nRef<=0 ){
002635      if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
002636        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
002637      }else{
002638        pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
002639        while( ALWAYS(pList) && pList->pNext!=pBt ){
002640          pList=pList->pNext;
002641        }
002642        if( ALWAYS(pList) ){
002643          pList->pNext = pBt->pNext;
002644        }
002645      }
002646      if( SQLITE_THREADSAFE ){
002647        sqlite3_mutex_free(pBt->mutex);
002648      }
002649      removed = 1;
002650    }
002651    sqlite3_mutex_leave(pMaster);
002652    return removed;
002653  #else
002654    return 1;
002655  #endif
002656  }
002657  
002658  /*
002659  ** Make sure pBt->pTmpSpace points to an allocation of 
002660  ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
002661  ** pointer.
002662  */
002663  static void allocateTempSpace(BtShared *pBt){
002664    if( !pBt->pTmpSpace ){
002665      pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
002666  
002667      /* One of the uses of pBt->pTmpSpace is to format cells before
002668      ** inserting them into a leaf page (function fillInCell()). If
002669      ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
002670      ** by the various routines that manipulate binary cells. Which
002671      ** can mean that fillInCell() only initializes the first 2 or 3
002672      ** bytes of pTmpSpace, but that the first 4 bytes are copied from
002673      ** it into a database page. This is not actually a problem, but it
002674      ** does cause a valgrind error when the 1 or 2 bytes of unitialized 
002675      ** data is passed to system call write(). So to avoid this error,
002676      ** zero the first 4 bytes of temp space here.
002677      **
002678      ** Also:  Provide four bytes of initialized space before the
002679      ** beginning of pTmpSpace as an area available to prepend the
002680      ** left-child pointer to the beginning of a cell.
002681      */
002682      if( pBt->pTmpSpace ){
002683        memset(pBt->pTmpSpace, 0, 8);
002684        pBt->pTmpSpace += 4;
002685      }
002686    }
002687  }
002688  
002689  /*
002690  ** Free the pBt->pTmpSpace allocation
002691  */
002692  static void freeTempSpace(BtShared *pBt){
002693    if( pBt->pTmpSpace ){
002694      pBt->pTmpSpace -= 4;
002695      sqlite3PageFree(pBt->pTmpSpace);
002696      pBt->pTmpSpace = 0;
002697    }
002698  }
002699  
002700  /*
002701  ** Close an open database and invalidate all cursors.
002702  */
002703  int sqlite3BtreeClose(Btree *p){
002704    BtShared *pBt = p->pBt;
002705    BtCursor *pCur;
002706  
002707    /* Close all cursors opened via this handle.  */
002708    assert( sqlite3_mutex_held(p->db->mutex) );
002709    sqlite3BtreeEnter(p);
002710    pCur = pBt->pCursor;
002711    while( pCur ){
002712      BtCursor *pTmp = pCur;
002713      pCur = pCur->pNext;
002714      if( pTmp->pBtree==p ){
002715        sqlite3BtreeCloseCursor(pTmp);
002716      }
002717    }
002718  
002719    /* Rollback any active transaction and free the handle structure.
002720    ** The call to sqlite3BtreeRollback() drops any table-locks held by
002721    ** this handle.
002722    */
002723    sqlite3BtreeRollback(p, SQLITE_OK, 0);
002724    sqlite3BtreeLeave(p);
002725  
002726    /* If there are still other outstanding references to the shared-btree
002727    ** structure, return now. The remainder of this procedure cleans 
002728    ** up the shared-btree.
002729    */
002730    assert( p->wantToLock==0 && p->locked==0 );
002731    if( !p->sharable || removeFromSharingList(pBt) ){
002732      /* The pBt is no longer on the sharing list, so we can access
002733      ** it without having to hold the mutex.
002734      **
002735      ** Clean out and delete the BtShared object.
002736      */
002737      assert( !pBt->pCursor );
002738      sqlite3PagerClose(pBt->pPager, p->db);
002739      if( pBt->xFreeSchema && pBt->pSchema ){
002740        pBt->xFreeSchema(pBt->pSchema);
002741      }
002742      sqlite3DbFree(0, pBt->pSchema);
002743      freeTempSpace(pBt);
002744      sqlite3_free(pBt);
002745    }
002746  
002747  #ifndef SQLITE_OMIT_SHARED_CACHE
002748    assert( p->wantToLock==0 );
002749    assert( p->locked==0 );
002750    if( p->pPrev ) p->pPrev->pNext = p->pNext;
002751    if( p->pNext ) p->pNext->pPrev = p->pPrev;
002752  #endif
002753  
002754    sqlite3_free(p);
002755    return SQLITE_OK;
002756  }
002757  
002758  /*
002759  ** Change the "soft" limit on the number of pages in the cache.
002760  ** Unused and unmodified pages will be recycled when the number of
002761  ** pages in the cache exceeds this soft limit.  But the size of the
002762  ** cache is allowed to grow larger than this limit if it contains
002763  ** dirty pages or pages still in active use.
002764  */
002765  int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
002766    BtShared *pBt = p->pBt;
002767    assert( sqlite3_mutex_held(p->db->mutex) );
002768    sqlite3BtreeEnter(p);
002769    sqlite3PagerSetCachesize(pBt->pPager, mxPage);
002770    sqlite3BtreeLeave(p);
002771    return SQLITE_OK;
002772  }
002773  
002774  /*
002775  ** Change the "spill" limit on the number of pages in the cache.
002776  ** If the number of pages exceeds this limit during a write transaction,
002777  ** the pager might attempt to "spill" pages to the journal early in
002778  ** order to free up memory.
002779  **
002780  ** The value returned is the current spill size.  If zero is passed
002781  ** as an argument, no changes are made to the spill size setting, so
002782  ** using mxPage of 0 is a way to query the current spill size.
002783  */
002784  int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
002785    BtShared *pBt = p->pBt;
002786    int res;
002787    assert( sqlite3_mutex_held(p->db->mutex) );
002788    sqlite3BtreeEnter(p);
002789    res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
002790    sqlite3BtreeLeave(p);
002791    return res;
002792  }
002793  
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the limit on how much of the database file may be memory mapped.
** The new limit szMmap is forwarded to the pager while the btree mutex
** is held.  Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
002808  
002809  /*
002810  ** Change the way data is synced to disk in order to increase or decrease
002811  ** how well the database resists damage due to OS crashes and power
002812  ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
002813  ** there is a high probability of damage)  Level 2 is the default.  There
002814  ** is a very low but non-zero probability of damage.  Level 3 reduces the
002815  ** probability of damage to near zero but with a write performance reduction.
002816  */
002817  #ifndef SQLITE_OMIT_PAGER_PRAGMAS
002818  int sqlite3BtreeSetPagerFlags(
002819    Btree *p,              /* The btree to set the safety level on */
002820    unsigned pgFlags       /* Various PAGER_* flags */
002821  ){
002822    BtShared *pBt = p->pBt;
002823    assert( sqlite3_mutex_held(p->db->mutex) );
002824    sqlite3BtreeEnter(p);
002825    sqlite3PagerSetFlags(pBt->pPager, pgFlags);
002826    sqlite3BtreeLeave(p);
002827    return SQLITE_OK;
002828  }
002829  #endif
002830  
002831  /*
002832  ** Change the default pages size and the number of reserved bytes per page.
002833  ** Or, if the page size has already been fixed, return SQLITE_READONLY 
002834  ** without changing anything.
002835  **
002836  ** The page size must be a power of 2 between 512 and 65536.  If the page
002837  ** size supplied does not meet this constraint then the page size is not
002838  ** changed.
002839  **
002840  ** Page sizes are constrained to be a power of two so that the region
002841  ** of the database file used for locking (beginning at PENDING_BYTE,
002842  ** the first byte past the 1GB boundary, 0x40000000) needs to occur
002843  ** at the beginning of a page.
002844  **
002845  ** If parameter nReserve is less than zero, then the number of reserved
002846  ** bytes per page is left unchanged.
002847  **
002848  ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
002849  ** and autovacuum mode can no longer be changed.
002850  */
002851  int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
002852    int rc = SQLITE_OK;
002853    BtShared *pBt = p->pBt;
002854    assert( nReserve>=-1 && nReserve<=255 );
002855    sqlite3BtreeEnter(p);
002856  #if SQLITE_HAS_CODEC
002857    if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
002858  #endif
002859    if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
002860      sqlite3BtreeLeave(p);
002861      return SQLITE_READONLY;
002862    }
002863    if( nReserve<0 ){
002864      nReserve = pBt->pageSize - pBt->usableSize;
002865    }
002866    assert( nReserve>=0 && nReserve<=255 );
002867    if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
002868          ((pageSize-1)&pageSize)==0 ){
002869      assert( (pageSize & 7)==0 );
002870      assert( !pBt->pCursor );
002871      pBt->pageSize = (u32)pageSize;
002872      freeTempSpace(pBt);
002873    }
002874    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002875    pBt->usableSize = pBt->pageSize - (u16)nReserve;
002876    if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002877    sqlite3BtreeLeave(p);
002878    return rc;
002879  }
002880  
002881  /*
002882  ** Return the currently defined page size
002883  */
002884  int sqlite3BtreeGetPageSize(Btree *p){
002885    return p->pBt->pageSize;
002886  }
002887  
002888  /*
002889  ** This function is similar to sqlite3BtreeGetReserve(), except that it
002890  ** may only be called if it is guaranteed that the b-tree mutex is already
002891  ** held.
002892  **
002893  ** This is useful in one special case in the backup API code where it is
002894  ** known that the shared b-tree mutex is held, but the mutex on the 
002895  ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
002896  ** were to be called, it might collide with some other operation on the
002897  ** database handle that owns *p, causing undefined behavior.
002898  */
002899  int sqlite3BtreeGetReserveNoMutex(Btree *p){
002900    int n;
002901    assert( sqlite3_mutex_held(p->pBt->mutex) );
002902    n = p->pBt->pageSize - p->pBt->usableSize;
002903    return n;
002904  }
002905  
002906  /*
002907  ** Return the number of bytes of space at the end of every page that
002908  ** are intentually left unused.  This is the "reserved" space that is
002909  ** sometimes used by extensions.
002910  **
002911  ** If SQLITE_HAS_MUTEX is defined then the number returned is the
002912  ** greater of the current reserved space and the maximum requested
002913  ** reserve space.
002914  */
002915  int sqlite3BtreeGetOptimalReserve(Btree *p){
002916    int n;
002917    sqlite3BtreeEnter(p);
002918    n = sqlite3BtreeGetReserveNoMutex(p);
002919  #ifdef SQLITE_HAS_CODEC
002920    if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
002921  #endif
002922    sqlite3BtreeLeave(p);
002923    return n;
002924  }
002925  
002926  
002927  /*
002928  ** Set the maximum page count for a database if mxPage is positive.
002929  ** No changes are made if mxPage is 0 or negative.
002930  ** Regardless of the value of mxPage, return the maximum page count.
002931  */
002932  int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
002933    int n;
002934    sqlite3BtreeEnter(p);
002935    n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
002936    sqlite3BtreeLeave(p);
002937    return n;
002938  }
002939  
002940  /*
002941  ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
002942  **
002943  **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
002944  **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
002945  **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
002946  **    newFlag==(-1)    No changes
002947  **
002948  ** This routine acts as a query if newFlag is less than zero
002949  **
002950  ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
002951  ** freelist leaf pages are not written back to the database.  Thus in-page
002952  ** deleted content is cleared, but freelist deleted content is not.
002953  **
002954  ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
002955  ** that freelist leaf pages are written back into the database, increasing
002956  ** the amount of disk I/O.
002957  */
002958  int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
002959    int b;
002960    if( p==0 ) return 0;
002961    sqlite3BtreeEnter(p);
002962    assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
002963    assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
002964    if( newFlag>=0 ){
002965      p->pBt->btsFlags &= ~BTS_FAST_SECURE;
002966      p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
002967    }
002968    b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
002969    sqlite3BtreeLeave(p);
002970    return b;
002971  }
002972  
002973  /*
002974  ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
002975  ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
002976  ** is disabled. The default value for the auto-vacuum property is 
002977  ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
002978  */
002979  int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
002980  #ifdef SQLITE_OMIT_AUTOVACUUM
002981    return SQLITE_READONLY;
002982  #else
002983    BtShared *pBt = p->pBt;
002984    int rc = SQLITE_OK;
002985    u8 av = (u8)autoVacuum;
002986  
002987    sqlite3BtreeEnter(p);
002988    if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
002989      rc = SQLITE_READONLY;
002990    }else{
002991      pBt->autoVacuum = av ?1:0;
002992      pBt->incrVacuum = av==2 ?1:0;
002993    }
002994    sqlite3BtreeLeave(p);
002995    return rc;
002996  #endif
002997  }
002998  
002999  /*
003000  ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
003001  ** enabled 1 is returned. Otherwise 0.
003002  */
003003  int sqlite3BtreeGetAutoVacuum(Btree *p){
003004  #ifdef SQLITE_OMIT_AUTOVACUUM
003005    return BTREE_AUTOVACUUM_NONE;
003006  #else
003007    int rc;
003008    sqlite3BtreeEnter(p);
003009    rc = (
003010      (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
003011      (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
003012      BTREE_AUTOVACUUM_INCR
003013    );
003014    sqlite3BtreeLeave(p);
003015    return rc;
003016  #endif
003017  }
003018  
/*
** Apply safety_level as the default synchronous setting for the database
** that uses pBt, unless one of the following holds, in which case this
** routine does nothing:
**
**   *  pBt has no database connection, or the connection has no aDb[]
**   *  the Db entry already has bSyncSet set (the safety-level was
**      chosen explicitly, e.g. via "PRAGMA synchronous")
**   *  the Db entry already has the requested safety-level
**   *  the Db entry is aDb[1]
**
** When the default-synchronous and default-WAL-synchronous settings are
** identical, or WAL is omitted, this compiles to an empty macro.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pEntry;

  if( db==0 ) return;
  pEntry = db->aDb;
  if( pEntry==0 ) return;

  /* Scan aDb[] for the entry whose Btree is backed by pBt.  The loop has
  ** no upper bound of its own; it relies on such an entry existing. */
  while( pEntry->pBt==0 || pEntry->pBt->pBt!=pBt ){
    pEntry++;
  }

  if( pEntry->bSyncSet!=0 ) return;
  if( pEntry->safety_level==safety_level ) return;
  if( pEntry==&db->aDb[1] ) return;

  pEntry->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pEntry->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
003045  
/* Forward declaration.  newDatabase() is defined later in this file; it
** initializes page 1 of an empty database file. */
static int newDatabase(BtShared*);
003048  
003049  
/*
** Get a reference to pPage1 of the database file.  This will
** also acquire a readlock on that file.
**
** On full success SQLITE_OK is returned, pBt->pPage1 and pBt->nPage are
** set, and the payload limits (maxLocal, minLocal, maxLeaf, minLeaf and
** max1bytePayload) are recomputed from pBt->usableSize.
**
** SQLITE_OK can also be returned with pBt->pPage1 still zero.  This
** happens when:
**
**   (a) the header indicates WAL mode and the WAL was opened by this
**       call, or
**   (b) the page size recorded in the header differs from pBt->pageSize
**       and the pager accepted the new page size.
**
** In both cases the caller is expected to invoke this routine again.
**
** If the header of page 1 fails validation, SQLITE_NOTADB is returned.
** If the page count taken from the header exceeds the number of pages
** in the file (and writable-schema is off), SQLITE_CORRUPT is returned.
** Error codes from sqlite3PagerSharedLock(), btreeGetPage(),
** sqlite3PagerOpenWal() and sqlite3PagerSetPagesize() are passed back
** unchanged (for example SQLITE_BUSY if the database is locked or
** SQLITE_NOMEM if we run out of memory).
*/
static int lockBtree(BtShared *pBt){
  int rc;              /* Result code from subfunctions */
  MemPage *pPage1;     /* Page 1 of the database file */
  u32 nPage;           /* Number of pages in the database */
  u32 nPageFile = 0;   /* Number of pages in the database file */
  u32 nPageHeader;     /* Number of pages in the database according to hdr */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pBt->pPage1==0 );
  rc = sqlite3PagerSharedLock(pBt->pPager);
  if( rc!=SQLITE_OK ) return rc;
  rc = btreeGetPage(pBt, 1, &pPage1, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Do some checking to help ensure the file we opened really is
  ** a valid database file.
  **
  ** The page count stored at offset 28 of the header is only trusted if
  ** it is non-zero and the 4 bytes at offset 24 match the 4 bytes at
  ** offset 92.  Otherwise the page count reported by the pager is used.
  */
  nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
  sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
  if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
    nPage = nPageFile;
  }
  /* With SQLITE_ResetDatabase set, treat the file as empty so that none
  ** of the header validation below is performed. */
  if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
    nPage = 0;
  }
  if( nPage>0 ){
    u32 pageSize;
    u32 usableSize;
    u8 *page1 = pPage1->aData;
    /* Default result for every "goto page1_init_failed" below that does
    ** not set rc itself. */
    rc = SQLITE_NOTADB;
    /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
    ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
    ** 61 74 20 33 00. */
    if( memcmp(page1, zMagicHeader, 16)!=0 ){
      goto page1_init_failed;
    }

    /* Header bytes 18 and 19 are version bytes.  A value at offset 18
    ** larger than this build understands makes the database read-only; a
    ** value at offset 19 larger than this build understands means the
    ** file cannot be read at all.  The limit is 1 without WAL support
    ** and 2 with it. */
#ifdef SQLITE_OMIT_WAL
    if( page1[18]>1 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>1 ){
      goto page1_init_failed;
    }
#else
    if( page1[18]>2 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>2 ){
      goto page1_init_failed;
    }

    /* If the version byte at offset 19 is set to 2, this database should
    ** be accessed in WAL mode. If the log is not already open, open it now.
    ** Then return SQLITE_OK and return without populating BtShared.pPage1.
    ** The caller detects this and calls this function again. This is
    ** required as the version of page 1 currently in the page1 buffer
    ** may not be the latest version - there may be a newer one in the log
    ** file.
    */
    if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
      int isOpen = 0;
      rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
      if( rc!=SQLITE_OK ){
        goto page1_init_failed;
      }else{
        setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
        if( isOpen==0 ){
          releasePageOne(pPage1);
          return SQLITE_OK;
        }
      }
      /* sqlite3PagerOpenWal() overwrote rc; restore the default failure
      ** code for the remaining header checks. */
      rc = SQLITE_NOTADB;
    }else{
      setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
    }
#endif

    /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
    ** fractions and the leaf payload fraction values must be 64, 32, and 32.
    **
    ** The original design allowed these amounts to vary, but as of
    ** version 3.6.0, we require them to be fixed.
    */
    if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file.
    **
    ** Byte 16 is the high-order byte.  Byte 17 is shifted by 16 rather
    ** than used as the low-order byte, so a stored value of 00 01 decodes
    ** to 65536. */
    pageSize = (page1[16]<<8) | (page1[17]<<16);
    /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
    ** between 512 and 65536 inclusive. */
    if( ((pageSize-1)&pageSize)!=0
     || pageSize>SQLITE_MAX_PAGE_SIZE 
     || pageSize<=256 
    ){
      goto page1_init_failed;
    }
    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
    assert( (pageSize & 7)==0 );
    /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
    ** integer at offset 20 is the number of bytes of space at the end of
    ** each page to reserve for extensions. 
    **
    ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
    ** determined by the one-byte unsigned integer found at an offset of 20
    ** into the database file header. */
    usableSize = pageSize - page1[20];
    if( (u32)pageSize!=pBt->pageSize ){
      /* After reading the first page of the database assuming a page size
      ** of BtShared.pageSize, we have discovered that the page-size is
      ** actually pageSize. Unlock the database, leave pBt->pPage1 at
      ** zero and return SQLITE_OK. The caller will call this function
      ** again with the correct page-size.
      */
      releasePageOne(pPage1);
      pBt->usableSize = usableSize;
      pBt->pageSize = pageSize;
      freeTempSpace(pBt);
      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
                                   pageSize-usableSize);
      return rc;
    }
    /* The database cannot contain more pages than the file holds.  The
    ** check is skipped when writable-schema is enabled. */
    if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){
      rc = SQLITE_CORRUPT_BKPT;
      goto page1_init_failed;
    }
    /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
    ** be less than 480. In other words, if the page size is 512, then the
    ** reserved space size cannot exceed 32. */
    if( usableSize<480 ){
      goto page1_init_failed;
    }
    pBt->pageSize = pageSize;
    pBt->usableSize = usableSize;
#ifndef SQLITE_OMIT_AUTOVACUUM
    /* The auto-vacuum and incremental-vacuum settings are taken from the
    ** 4-byte header fields at offsets 36+4*4 and 36+7*4; any non-zero
    ** value means "on". */
    pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
    pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
#endif
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
  ** Besides the payload, the cell must store:
  **     2-byte pointer to the cell
  **     4-byte child pointer
  **     9-byte nKey value
  **     4-byte nData value
  **     4-byte overflow page pointer
  ** So a cell consists of a 2-byte pointer, a header which is as much as
  ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  ** page pointer.
  */
  pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  /* max1bytePayload is maxLocal capped at 127. */
  if( pBt->maxLocal>127 ){
    pBt->max1bytePayload = 127;
  }else{
    pBt->max1bytePayload = (u8)pBt->maxLocal;
  }
  assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  pBt->pPage1 = pPage1;
  pBt->nPage = nPage;
  return SQLITE_OK;

  /* Common failure exit: drop the reference to page 1 and report rc. */
page1_init_failed:
  releasePageOne(pPage1);
  pBt->pPage1 = 0;
  return rc;
}
003233  
003234  #ifndef NDEBUG
003235  /*
003236  ** Return the number of cursors open on pBt. This is for use
003237  ** in assert() expressions, so it is only compiled if NDEBUG is not
003238  ** defined.
003239  **
003240  ** Only write cursors are counted if wrOnly is true.  If wrOnly is
003241  ** false then all cursors are counted.
003242  **
003243  ** For the purposes of this routine, a cursor is any cursor that
003244  ** is capable of reading or writing to the database.  Cursors that
003245  ** have been tripped into the CURSOR_FAULT state are not counted.
003246  */
003247  static int countValidCursors(BtShared *pBt, int wrOnly){
003248    BtCursor *pCur;
003249    int r = 0;
003250    for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
003251      if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
003252       && pCur->eState!=CURSOR_FAULT ) r++; 
003253    }
003254    return r;
003255  }
003256  #endif
003257  
003258  /*
003259  ** If there are no outstanding cursors and we are not in the middle
003260  ** of a transaction but there is a read lock on the database, then
003261  ** this routine unrefs the first page of the database file which 
003262  ** has the effect of releasing the read lock.
003263  **
003264  ** If there is a transaction in progress, this routine is a no-op.
003265  */
003266  static void unlockBtreeIfUnused(BtShared *pBt){
003267    assert( sqlite3_mutex_held(pBt->mutex) );
003268    assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
003269    if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
003270      MemPage *pPage1 = pBt->pPage1;
003271      assert( pPage1->aData );
003272      assert( sqlite3PagerRefcount(pBt->pPager)==1 );
003273      pBt->pPage1 = 0;
003274      releasePageOne(pPage1);
003275    }
003276  }
003277  
003278  /*
003279  ** If pBt points to an empty file then convert that empty file
003280  ** into a new empty database by initializing the first page of
003281  ** the database.
003282  */
003283  static int newDatabase(BtShared *pBt){
003284    MemPage *pP1;
003285    unsigned char *data;
003286    int rc;
003287  
003288    assert( sqlite3_mutex_held(pBt->mutex) );
003289    if( pBt->nPage>0 ){
003290      return SQLITE_OK;
003291    }
003292    pP1 = pBt->pPage1;
003293    assert( pP1!=0 );
003294    data = pP1->aData;
003295    rc = sqlite3PagerWrite(pP1->pDbPage);
003296    if( rc ) return rc;
003297    memcpy(data, zMagicHeader, sizeof(zMagicHeader));
003298    assert( sizeof(zMagicHeader)==16 );
003299    data[16] = (u8)((pBt->pageSize>>8)&0xff);
003300    data[17] = (u8)((pBt->pageSize>>16)&0xff);
003301    data[18] = 1;
003302    data[19] = 1;
003303    assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
003304    data[20] = (u8)(pBt->pageSize - pBt->usableSize);
003305    data[21] = 64;
003306    data[22] = 32;
003307    data[23] = 32;
003308    memset(&data[24], 0, 100-24);
003309    zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
003310    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003311  #ifndef SQLITE_OMIT_AUTOVACUUM
003312    assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
003313    assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
003314    put4byte(&data[36 + 4*4], pBt->autoVacuum);
003315    put4byte(&data[36 + 7*4], pBt->incrVacuum);
003316  #endif
003317    pBt->nPage = 1;
003318    data[31] = 1;
003319    return SQLITE_OK;
003320  }
003321  
003322  /*
003323  ** Initialize the first page of the database file (creating a database
003324  ** consisting of a single page and no schema objects). Return SQLITE_OK
003325  ** if successful, or an SQLite error code otherwise.
003326  */
003327  int sqlite3BtreeNewDb(Btree *p){
003328    int rc;
003329    sqlite3BtreeEnter(p);
003330    p->pBt->nPage = 0;
003331    rc = newDatabase(p->pBt);
003332    sqlite3BtreeLeave(p);
003333    return rc;
003334  }
003335  
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** If pSchemaVersion is not NULL and the routine succeeds in reaching
** the trans_begun label with SQLITE_OK, *pSchemaVersion is set to the
** 4-byte big-endian integer stored at offset 40 of page 1.
**
** A write-transaction must be started before attempting any 
** changes to the database.  None of the following routines 
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Other return values visible in this routine: SQLITE_READONLY when a
** write transaction is requested on a read-only database, and
** SQLITE_LOCKED_SHAREDCACHE when another shared-cache connection blocks
** the request.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
  BtShared *pBt = p->pBt;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* When the connection has SQLITE_ResetDatabase set and the pager itself
  ** is writable, clear any BTS_READ_ONLY flag previously set on pBt. */
  if( (p->db->flags & SQLITE_ResetDatabase) 
   && sqlite3PagerIsreadonly(pBt->pPager)==0 
  ){
    pBt->btsFlags &= ~BTS_READ_ONLY;
  }

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;   /* Connection that blocks this request, if any */
    /* If another database handle has already opened a write transaction 
    ** on this shared-btree structure and a second write transaction is
    ** requested, return SQLITE_LOCKED.  The same applies whenever the
    ** BTS_PENDING flag is set.  For an exclusive request (wrflag>1), any
    ** lock held by a different Btree handle is also blocking.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on 
  ** page 1. So if some other shared-cache client already has a write-lock 
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Record whether the database had zero pages when this attempt to open
  ** a transaction started. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database 
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* lockBtree() may itself have set BTS_READ_ONLY, so the flag is
      ** checked again here. */
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          /* A no-op unless the database currently has zero pages. */
          rc = newDatabase(pBt);
        }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
          /* if there was no transaction opened when this function was
          ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
          ** code to SQLITE_BUSY. */
          rc = SQLITE_BUSY;
        }
      }
    }
  
    if( rc!=SQLITE_OK ){
      unlockBtreeIfUnused(pBt);
    }
  /* Retry only for SQLITE_BUSY (primary code), only if no transaction is
  ** open on pBt, and only for as long as the busy handler returns
  ** non-zero. */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );
  sqlite3PagerResetLockTimeout(pBt->pPager);

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        /* Link this handle's page-1 read lock into the shared lock list. */
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared transaction state only ever moves upward here. */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely 
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }

  /* Common exit.  Reached directly by the early "goto" paths above as
  ** well as by falling through from the code that opens the transaction. */
trans_begun:
  if( rc==SQLITE_OK ){
    if( pSchemaVersion ){
      *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
    }
    if( wrflag ){
      /* This call makes sure that the pager has the correct number of
      ** open savepoints. If the second parameter is greater than 0 and
      ** the sub-journal is not already open, then it will be opened here.
      */
      rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
    }
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
003526  
003527  #ifndef SQLITE_OMIT_AUTOVACUUM
003528  
003529  /*
003530  ** Set the pointer-map entries for all children of page pPage. Also, if
003531  ** pPage contains cells that point to overflow pages, set the pointer
003532  ** map entries for the overflow pages as well.
003533  */
003534  static int setChildPtrmaps(MemPage *pPage){
003535    int i;                             /* Counter variable */
003536    int nCell;                         /* Number of cells in page pPage */
003537    int rc;                            /* Return code */
003538    BtShared *pBt = pPage->pBt;
003539    Pgno pgno = pPage->pgno;
003540  
003541    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003542    rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003543    if( rc!=SQLITE_OK ) return rc;
003544    nCell = pPage->nCell;
003545  
003546    for(i=0; i<nCell; i++){
003547      u8 *pCell = findCell(pPage, i);
003548  
003549      ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
003550  
003551      if( !pPage->leaf ){
003552        Pgno childPgno = get4byte(pCell);
003553        ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003554      }
003555    }
003556  
003557    if( !pPage->leaf ){
003558      Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
003559      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003560    }
003561  
003562    return rc;
003563  }
003564  
003565  /*
003566  ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
003567  ** that it points to iTo. Parameter eType describes the type of pointer to
003568  ** be modified, as  follows:
003569  **
003570  ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
003571  **                   page of pPage.
003572  **
003573  ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
003574  **                   page pointed to by one of the cells on pPage.
003575  **
003576  ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
003577  **                   overflow page in the list.
003578  */
003579  static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
003580    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003581    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
003582    if( eType==PTRMAP_OVERFLOW2 ){
003583      /* The pointer is always the first 4 bytes of the page in this case.  */
003584      if( get4byte(pPage->aData)!=iFrom ){
003585        return SQLITE_CORRUPT_PAGE(pPage);
003586      }
003587      put4byte(pPage->aData, iTo);
003588    }else{
003589      int i;
003590      int nCell;
003591      int rc;
003592  
003593      rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003594      if( rc ) return rc;
003595      nCell = pPage->nCell;
003596  
003597      for(i=0; i<nCell; i++){
003598        u8 *pCell = findCell(pPage, i);
003599        if( eType==PTRMAP_OVERFLOW1 ){
003600          CellInfo info;
003601          pPage->xParseCell(pPage, pCell, &info);
003602          if( info.nLocal<info.nPayload ){
003603            if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
003604              return SQLITE_CORRUPT_PAGE(pPage);
003605            }
003606            if( iFrom==get4byte(pCell+info.nSize-4) ){
003607              put4byte(pCell+info.nSize-4, iTo);
003608              break;
003609            }
003610          }
003611        }else{
003612          if( get4byte(pCell)==iFrom ){
003613            put4byte(pCell, iTo);
003614            break;
003615          }
003616        }
003617      }
003618    
003619      if( i==nCell ){
003620        if( eType!=PTRMAP_BTREE || 
003621            get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
003622          return SQLITE_CORRUPT_PAGE(pPage);
003623        }
003624        put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
003625      }
003626    }
003627    return SQLITE_OK;
003628  }
003629  
003630  
003631  /*
003632  ** Move the open database page pDbPage to location iFreePage in the 
003633  ** database. The pDbPage reference remains valid.
003634  **
003635  ** The isCommit flag indicates that there is no need to remember that
003636  ** the journal needs to be sync()ed before database page pDbPage->pgno 
003637  ** can be written to. The caller has already promised not to write to that
003638  ** page.
003639  */
003640  static int relocatePage(
003641    BtShared *pBt,           /* Btree */
003642    MemPage *pDbPage,        /* Open page to move */
003643    u8 eType,                /* Pointer map 'type' entry for pDbPage */
003644    Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
003645    Pgno iFreePage,          /* The location to move pDbPage to */
003646    int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
003647  ){
003648    MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
003649    Pgno iDbPage = pDbPage->pgno;
003650    Pager *pPager = pBt->pPager;
003651    int rc;
003652  
003653    assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
003654        eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
003655    assert( sqlite3_mutex_held(pBt->mutex) );
003656    assert( pDbPage->pBt==pBt );
003657    if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
003658  
003659    /* Move page iDbPage from its current location to page number iFreePage */
003660    TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
003661        iDbPage, iFreePage, iPtrPage, eType));
003662    rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
003663    if( rc!=SQLITE_OK ){
003664      return rc;
003665    }
003666    pDbPage->pgno = iFreePage;
003667  
003668    /* If pDbPage was a btree-page, then it may have child pages and/or cells
003669    ** that point to overflow pages. The pointer map entries for all these
003670    ** pages need to be changed.
003671    **
003672    ** If pDbPage is an overflow page, then the first 4 bytes may store a
003673    ** pointer to a subsequent overflow page. If this is the case, then
003674    ** the pointer map needs to be updated for the subsequent overflow page.
003675    */
003676    if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
003677      rc = setChildPtrmaps(pDbPage);
003678      if( rc!=SQLITE_OK ){
003679        return rc;
003680      }
003681    }else{
003682      Pgno nextOvfl = get4byte(pDbPage->aData);
003683      if( nextOvfl!=0 ){
003684        ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
003685        if( rc!=SQLITE_OK ){
003686          return rc;
003687        }
003688      }
003689    }
003690  
003691    /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
003692    ** that it points at iFreePage. Also fix the pointer map entry for
003693    ** iPtrPage.
003694    */
003695    if( eType!=PTRMAP_ROOTPAGE ){
003696      rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
003697      if( rc!=SQLITE_OK ){
003698        return rc;
003699      }
003700      rc = sqlite3PagerWrite(pPtrPage->pDbPage);
003701      if( rc!=SQLITE_OK ){
003702        releasePage(pPtrPage);
003703        return rc;
003704      }
003705      rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
003706      releasePage(pPtrPage);
003707      if( rc==SQLITE_OK ){
003708        ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
003709      }
003710    }
003711    return rc;
003712  }
003713  
/* Forward declaration required by incrVacuumStep().  The 4th argument is
** the "nearby" page number and the 5th is one of the BTALLOC_* modes. */
static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
003716  
/*
** Perform a single step of an incremental-vacuum. If successful, return
** SQLITE_OK. If there is no work to do (and therefore no point in 
** calling this function again), return SQLITE_DONE. Or, if an error 
** occurs, return some other error code.
**
** More specifically, this function attempts to re-organize the database so 
** that the last page of the file currently in use is no longer in use.
**
** Parameter nFin is the number of pages that this database would contain
** were this function called until it returns SQLITE_DONE.
**
** Parameter iLastPg is the page that this step tries to vacate.  It must
** be greater than nFin.
**
** If the bCommit parameter is non-zero, this function assumes that the 
** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 
** or an error. bCommit is passed true for an auto-vacuum-on-commit 
** operation, or false for an incremental vacuum.
**
** When bCommit is zero and the step succeeds, pBt->nPage is reduced to
** the nearest lower page that is neither a pointer-map page nor the
** pending-byte page, and pBt->bDoTruncate is set.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  /* Pointer-map pages and the pending-byte page hold no relocatable
  ** content, so there is nothing to move for them. */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;
    Pgno iPtrPage;

    /* The free-page count is the 4-byte integer at offset 36 of page 1.
    ** With no free pages there is nowhere to move anything. */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    /* A root page is never expected here; treat it as corruption. */
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( bCommit==0 ){
        /* Remove the page from the files free-list. This is not required
        ** if bCommit is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't 
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;
      u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
      Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If bCommit is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if bCommit is greater than zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      if( bCommit==0 ){
        eMode = BTALLOC_LE;
        iNear = nFin;
      }
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
        if( rc!=SQLITE_OK ){
          releasePage(pLastPg);
          return rc;
        }
        /* Only the page number is needed; drop the page reference. */
        releasePage(pFreePg);
      }while( bCommit && iFreePg>nFin );
      assert( iFreePg<iLastPg );
      
      rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  /* For an incremental vacuum, shrink the logical database size now,
  ** stepping over any pointer-map page and the pending-byte page. */
  if( bCommit==0 ){
    do {
      iLastPg--;
    }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
    pBt->bDoTruncate = 1;
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
003824  
003825  /*
003826  ** The database opened by the first argument is an auto-vacuum database
003827  ** nOrig pages in size containing nFree free pages. Return the expected 
003828  ** size of the database in pages following an auto-vacuum operation.
003829  */
003830  static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
003831    int nEntry;                     /* Number of entries on one ptrmap page */
003832    Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
003833    Pgno nFin;                      /* Return value */
003834  
003835    nEntry = pBt->usableSize/5;
003836    nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
003837    nFin = nOrig - nFree - nPtrmap;
003838    if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
003839      nFin--;
003840    }
003841    while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
003842      nFin--;
003843    }
003844  
003845    return nFin;
003846  }
003847  
003848  /*
003849  ** A write-transaction must be opened before calling this function.
003850  ** It performs a single unit of work towards an incremental vacuum.
003851  **
003852  ** If the incremental vacuum is finished after this function has run,
003853  ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
003854  ** SQLITE_OK is returned. Otherwise an SQLite error code. 
003855  */
003856  int sqlite3BtreeIncrVacuum(Btree *p){
003857    int rc;
003858    BtShared *pBt = p->pBt;
003859  
003860    sqlite3BtreeEnter(p);
003861    assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
003862    if( !pBt->autoVacuum ){
003863      rc = SQLITE_DONE;
003864    }else{
003865      Pgno nOrig = btreePagecount(pBt);
003866      Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
003867      Pgno nFin = finalDbSize(pBt, nOrig, nFree);
003868  
003869      if( nOrig<nFin ){
003870        rc = SQLITE_CORRUPT_BKPT;
003871      }else if( nFree>0 ){
003872        rc = saveAllCursors(pBt, 0, 0);
003873        if( rc==SQLITE_OK ){
003874          invalidateAllOverflowCache(pBt);
003875          rc = incrVacuumStep(pBt, nFin, nOrig, 0);
003876        }
003877        if( rc==SQLITE_OK ){
003878          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
003879          put4byte(&pBt->pPage1->aData[28], pBt->nPage);
003880        }
003881      }else{
003882        rc = SQLITE_DONE;
003883      }
003884    }
003885    sqlite3BtreeLeave(p);
003886    return rc;
003887  }
003888  
003889  /*
003890  ** This routine is called prior to sqlite3PagerCommit when a transaction
003891  ** is committed for an auto-vacuum database.
003892  **
003893  ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
003894  ** the database file should be truncated to during the commit process. 
003895  ** i.e. the database has been reorganized so that only the first *pnTrunc
003896  ** pages are in use.
003897  */
003898  static int autoVacuumCommit(BtShared *pBt){
003899    int rc = SQLITE_OK;
003900    Pager *pPager = pBt->pPager;
003901    VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
003902  
003903    assert( sqlite3_mutex_held(pBt->mutex) );
003904    invalidateAllOverflowCache(pBt);
003905    assert(pBt->autoVacuum);
003906    if( !pBt->incrVacuum ){
003907      Pgno nFin;         /* Number of pages in database after autovacuuming */
003908      Pgno nFree;        /* Number of pages on the freelist initially */
003909      Pgno iFree;        /* The next page to be freed */
003910      Pgno nOrig;        /* Database size before freeing */
003911  
003912      nOrig = btreePagecount(pBt);
003913      if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
003914        /* It is not possible to create a database for which the final page
003915        ** is either a pointer-map page or the pending-byte page. If one
003916        ** is encountered, this indicates corruption.
003917        */
003918        return SQLITE_CORRUPT_BKPT;
003919      }
003920  
003921      nFree = get4byte(&pBt->pPage1->aData[36]);
003922      nFin = finalDbSize(pBt, nOrig, nFree);
003923      if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
003924      if( nFin<nOrig ){
003925        rc = saveAllCursors(pBt, 0, 0);
003926      }
003927      for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
003928        rc = incrVacuumStep(pBt, nFin, iFree, 1);
003929      }
003930      if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
003931        rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
003932        put4byte(&pBt->pPage1->aData[32], 0);
003933        put4byte(&pBt->pPage1->aData[36], 0);
003934        put4byte(&pBt->pPage1->aData[28], nFin);
003935        pBt->bDoTruncate = 1;
003936        pBt->nPage = nFin;
003937      }
003938      if( rc!=SQLITE_OK ){
003939        sqlite3PagerRollback(pPager);
003940      }
003941    }
003942  
003943    assert( nRef>=sqlite3PagerRefcount(pPager) );
003944    return rc;
003945  }
003946  
003947  #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
003948  # define setChildPtrmaps(x) SQLITE_OK
003949  #endif
003950  
003951  /*
003952  ** This routine does the first phase of a two-phase commit.  This routine
003953  ** causes a rollback journal to be created (if it does not already exist)
003954  ** and populated with enough information so that if a power loss occurs
003955  ** the database can be restored to its original state by playing back
003956  ** the journal.  Then the contents of the journal are flushed out to
003957  ** the disk.  After the journal is safely on oxide, the changes to the
003958  ** database are written into the database file and flushed to oxide.
003959  ** At the end of this call, the rollback journal still exists on the
003960  ** disk and we are still holding all locks, so the transaction has not
003961  ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
003962  ** commit process.
003963  **
003964  ** This call is a no-op if no write-transaction is currently active on pBt.
003965  **
003966  ** Otherwise, sync the database file for the btree pBt. zMaster points to
003967  ** the name of a master journal file that should be written into the
003968  ** individual journal file, or is NULL, indicating no master journal file 
003969  ** (single database transaction).
003970  **
003971  ** When this is called, the master journal should already have been
003972  ** created, populated with this journal pointer and synced to disk.
003973  **
003974  ** Once this is routine has returned, the only thing required to commit
003975  ** the write-transaction for this database file is to delete the journal.
003976  */
003977  int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
003978    int rc = SQLITE_OK;
003979    if( p->inTrans==TRANS_WRITE ){
003980      BtShared *pBt = p->pBt;
003981      sqlite3BtreeEnter(p);
003982  #ifndef SQLITE_OMIT_AUTOVACUUM
003983      if( pBt->autoVacuum ){
003984        rc = autoVacuumCommit(pBt);
003985        if( rc!=SQLITE_OK ){
003986          sqlite3BtreeLeave(p);
003987          return rc;
003988        }
003989      }
003990      if( pBt->bDoTruncate ){
003991        sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
003992      }
003993  #endif
003994      rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
003995      sqlite3BtreeLeave(p);
003996    }
003997    return rc;
003998  }
003999  
004000  /*
004001  ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
004002  ** at the conclusion of a transaction.
004003  */
004004  static void btreeEndTransaction(Btree *p){
004005    BtShared *pBt = p->pBt;
004006    sqlite3 *db = p->db;
004007    assert( sqlite3BtreeHoldsMutex(p) );
004008  
004009  #ifndef SQLITE_OMIT_AUTOVACUUM
004010    pBt->bDoTruncate = 0;
004011  #endif
004012    if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
004013      /* If there are other active statements that belong to this database
004014      ** handle, downgrade to a read-only transaction. The other statements
004015      ** may still be reading from the database.  */
004016      downgradeAllSharedCacheTableLocks(p);
004017      p->inTrans = TRANS_READ;
004018    }else{
004019      /* If the handle had any kind of transaction open, decrement the 
004020      ** transaction count of the shared btree. If the transaction count 
004021      ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
004022      ** call below will unlock the pager.  */
004023      if( p->inTrans!=TRANS_NONE ){
004024        clearAllSharedCacheTableLocks(p);
004025        pBt->nTransaction--;
004026        if( 0==pBt->nTransaction ){
004027          pBt->inTransaction = TRANS_NONE;
004028        }
004029      }
004030  
004031      /* Set the current transaction state to TRANS_NONE and unlock the 
004032      ** pager if this call closed the only read or write transaction.  */
004033      p->inTrans = TRANS_NONE;
004034      unlockBtreeIfUnused(pBt);
004035    }
004036  
004037    btreeIntegrity(p);
004038  }
004039  
004040  /*
004041  ** Commit the transaction currently in progress.
004042  **
004043  ** This routine implements the second phase of a 2-phase commit.  The
004044  ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
004045  ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
004046  ** routine did all the work of writing information out to disk and flushing the
004047  ** contents so that they are written onto the disk platter.  All this
004048  ** routine has to do is delete or truncate or zero the header in the
004049  ** the rollback journal (which causes the transaction to commit) and
004050  ** drop locks.
004051  **
004052  ** Normally, if an error occurs while the pager layer is attempting to 
004053  ** finalize the underlying journal file, this function returns an error and
004054  ** the upper layer will attempt a rollback. However, if the second argument
004055  ** is non-zero then this b-tree transaction is part of a multi-file 
004056  ** transaction. In this case, the transaction has already been committed 
004057  ** (by deleting a master journal file) and the caller will ignore this 
004058  ** functions return code. So, even if an error occurs in the pager layer,
004059  ** reset the b-tree objects internal state to indicate that the write
004060  ** transaction has been closed. This is quite safe, as the pager will have
004061  ** transitioned to the error state.
004062  **
004063  ** This will release the write lock on the database file.  If there
004064  ** are no active cursors, it also releases the read lock.
004065  */
004066  int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
004067  
004068    if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
004069    sqlite3BtreeEnter(p);
004070    btreeIntegrity(p);
004071  
004072    /* If the handle has a write-transaction open, commit the shared-btrees 
004073    ** transaction and set the shared state to TRANS_READ.
004074    */
004075    if( p->inTrans==TRANS_WRITE ){
004076      int rc;
004077      BtShared *pBt = p->pBt;
004078      assert( pBt->inTransaction==TRANS_WRITE );
004079      assert( pBt->nTransaction>0 );
004080      rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
004081      if( rc!=SQLITE_OK && bCleanup==0 ){
004082        sqlite3BtreeLeave(p);
004083        return rc;
004084      }
004085      p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
004086      pBt->inTransaction = TRANS_READ;
004087      btreeClearHasContent(pBt);
004088    }
004089  
004090    btreeEndTransaction(p);
004091    sqlite3BtreeLeave(p);
004092    return SQLITE_OK;
004093  }
004094  
004095  /*
004096  ** Do both phases of a commit.
004097  */
004098  int sqlite3BtreeCommit(Btree *p){
004099    int rc;
004100    sqlite3BtreeEnter(p);
004101    rc = sqlite3BtreeCommitPhaseOne(p, 0);
004102    if( rc==SQLITE_OK ){
004103      rc = sqlite3BtreeCommitPhaseTwo(p, 0);
004104    }
004105    sqlite3BtreeLeave(p);
004106    return rc;
004107  }
004108  
004109  /*
004110  ** This routine sets the state to CURSOR_FAULT and the error
004111  ** code to errCode for every cursor on any BtShared that pBtree
004112  ** references.  Or if the writeOnly flag is set to 1, then only
004113  ** trip write cursors and leave read cursors unchanged.
004114  **
004115  ** Every cursor is a candidate to be tripped, including cursors
004116  ** that belong to other database connections that happen to be
004117  ** sharing the cache with pBtree.
004118  **
004119  ** This routine gets called when a rollback occurs. If the writeOnly
004120  ** flag is true, then only write-cursors need be tripped - read-only
004121  ** cursors save their current positions so that they may continue 
004122  ** following the rollback. Or, if writeOnly is false, all cursors are 
004123  ** tripped. In general, writeOnly is false if the transaction being
004124  ** rolled back modified the database schema. In this case b-tree root
004125  ** pages may be moved or deleted from the database altogether, making
004126  ** it unsafe for read cursors to continue.
004127  **
004128  ** If the writeOnly flag is true and an error is encountered while 
004129  ** saving the current position of a read-only cursor, all cursors, 
004130  ** including all read-cursors are tripped.
004131  **
004132  ** SQLITE_OK is returned if successful, or if an error occurs while
004133  ** saving a cursor position, an SQLite error code.
004134  */
004135  int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
004136    BtCursor *p;
004137    int rc = SQLITE_OK;
004138  
004139    assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
004140    if( pBtree ){
004141      sqlite3BtreeEnter(pBtree);
004142      for(p=pBtree->pBt->pCursor; p; p=p->pNext){
004143        if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
004144          if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
004145            rc = saveCursorPosition(p);
004146            if( rc!=SQLITE_OK ){
004147              (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
004148              break;
004149            }
004150          }
004151        }else{
004152          sqlite3BtreeClearCursor(p);
004153          p->eState = CURSOR_FAULT;
004154          p->skipNext = errCode;
004155        }
004156        btreeReleaseAllCursorPages(p);
004157      }
004158      sqlite3BtreeLeave(pBtree);
004159    }
004160    return rc;
004161  }
004162  
004163  /*
004164  ** Set the pBt->nPage field correctly, according to the current
004165  ** state of the database.  Assume pBt->pPage1 is valid.
004166  */
004167  static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
004168    int nPage = get4byte(&pPage1->aData[28]);
004169    testcase( nPage==0 );
004170    if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
004171    testcase( pBt->nPage!=nPage );
004172    pBt->nPage = nPage;
004173  }
004174  
004175  /*
004176  ** Rollback the transaction in progress.
004177  **
004178  ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
004179  ** Only write cursors are tripped if writeOnly is true but all cursors are
004180  ** tripped if writeOnly is false.  Any attempt to use
004181  ** a tripped cursor will result in an error.
004182  **
004183  ** This will release the write lock on the database file.  If there
004184  ** are no active cursors, it also releases the read lock.
004185  */
004186  int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
004187    int rc;
004188    BtShared *pBt = p->pBt;
004189    MemPage *pPage1;
004190  
004191    assert( writeOnly==1 || writeOnly==0 );
004192    assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
004193    sqlite3BtreeEnter(p);
004194    if( tripCode==SQLITE_OK ){
004195      rc = tripCode = saveAllCursors(pBt, 0, 0);
004196      if( rc ) writeOnly = 0;
004197    }else{
004198      rc = SQLITE_OK;
004199    }
004200    if( tripCode ){
004201      int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
004202      assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
004203      if( rc2!=SQLITE_OK ) rc = rc2;
004204    }
004205    btreeIntegrity(p);
004206  
004207    if( p->inTrans==TRANS_WRITE ){
004208      int rc2;
004209  
004210      assert( TRANS_WRITE==pBt->inTransaction );
004211      rc2 = sqlite3PagerRollback(pBt->pPager);
004212      if( rc2!=SQLITE_OK ){
004213        rc = rc2;
004214      }
004215  
004216      /* The rollback may have destroyed the pPage1->aData value.  So
004217      ** call btreeGetPage() on page 1 again to make
004218      ** sure pPage1->aData is set correctly. */
004219      if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
004220        btreeSetNPage(pBt, pPage1);
004221        releasePageOne(pPage1);
004222      }
004223      assert( countValidCursors(pBt, 1)==0 );
004224      pBt->inTransaction = TRANS_READ;
004225      btreeClearHasContent(pBt);
004226    }
004227  
004228    btreeEndTransaction(p);
004229    sqlite3BtreeLeave(p);
004230    return rc;
004231  }
004232  
004233  /*
004234  ** Start a statement subtransaction. The subtransaction can be rolled
004235  ** back independently of the main transaction. You must start a transaction 
004236  ** before starting a subtransaction. The subtransaction is ended automatically 
004237  ** if the main transaction commits or rolls back.
004238  **
004239  ** Statement subtransactions are used around individual SQL statements
004240  ** that are contained within a BEGIN...COMMIT block.  If a constraint
004241  ** error occurs within the statement, the effect of that one statement
004242  ** can be rolled back without having to rollback the entire transaction.
004243  **
004244  ** A statement sub-transaction is implemented as an anonymous savepoint. The
004245  ** value passed as the second parameter is the total number of savepoints,
004246  ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
004247  ** are no active savepoints and no other statement-transactions open,
004248  ** iStatement is 1. This anonymous savepoint can be released or rolled back
004249  ** using the sqlite3BtreeSavepoint() function.
004250  */
004251  int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
004252    int rc;
004253    BtShared *pBt = p->pBt;
004254    sqlite3BtreeEnter(p);
004255    assert( p->inTrans==TRANS_WRITE );
004256    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
004257    assert( iStatement>0 );
004258    assert( iStatement>p->db->nSavepoint );
004259    assert( pBt->inTransaction==TRANS_WRITE );
004260    /* At the pager level, a statement transaction is a savepoint with
004261    ** an index greater than all savepoints created explicitly using
004262    ** SQL statements. It is illegal to open, release or rollback any
004263    ** such savepoints while the statement transaction savepoint is active.
004264    */
004265    rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
004266    sqlite3BtreeLeave(p);
004267    return rc;
004268  }
004269  
004270  /*
004271  ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
004272  ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
004273  ** savepoint identified by parameter iSavepoint, depending on the value 
004274  ** of op.
004275  **
004276  ** Normally, iSavepoint is greater than or equal to zero. However, if op is
004277  ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 
004278  ** contents of the entire transaction are rolled back. This is different
004279  ** from a normal transaction rollback, as no locks are released and the
004280  ** transaction remains open.
004281  */
004282  int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
004283    int rc = SQLITE_OK;
004284    if( p && p->inTrans==TRANS_WRITE ){
004285      BtShared *pBt = p->pBt;
004286      assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
004287      assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
004288      sqlite3BtreeEnter(p);
004289      if( op==SAVEPOINT_ROLLBACK ){
004290        rc = saveAllCursors(pBt, 0, 0);
004291      }
004292      if( rc==SQLITE_OK ){
004293        rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
004294      }
004295      if( rc==SQLITE_OK ){
004296        if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
004297          pBt->nPage = 0;
004298        }
004299        rc = newDatabase(pBt);
004300        btreeSetNPage(pBt, pBt->pPage1);
004301  
004302        /* pBt->nPage might be zero if the database was corrupt when 
004303        ** the transaction was started. Otherwise, it must be at least 1.  */
004304        assert( CORRUPT_DB || pBt->nPage>0 );
004305      }
004306      sqlite3BtreeLeave(p);
004307    }
004308    return rc;
004309  }
004310  
004311  /*
004312  ** Create a new cursor for the BTree whose root is on the page
004313  ** iTable. If a read-only cursor is requested, it is assumed that
004314  ** the caller already has at least a read-only transaction open
004315  ** on the database already. If a write-cursor is requested, then
004316  ** the caller is assumed to have an open write transaction.
004317  **
004318  ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
004319  ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
004320  ** can be used for reading or for writing if other conditions for writing
004321  ** are also met.  These are the conditions that must be met in order
004322  ** for writing to be allowed:
004323  **
004324  ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
004325  **
004326  ** 2:  Other database connections that share the same pager cache
004327  **     but which are not in the READ_UNCOMMITTED state may not have
004328  **     cursors open with wrFlag==0 on the same table.  Otherwise
004329  **     the changes made by this write cursor would be visible to
004330  **     the read cursors in the other database connection.
004331  **
004332  ** 3:  The database must be writable (not on read-only media)
004333  **
004334  ** 4:  There must be an active transaction.
004335  **
004336  ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
004337  ** is set.  If FORDELETE is set, that is a hint to the implementation that
004338  ** this cursor will only be used to seek to and delete entries of an index
004339  ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
004340  ** this implementation.  But in a hypothetical alternative storage engine 
004341  ** in which index entries are automatically deleted when corresponding table
004342  ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
004343  ** operations on this cursor can be no-ops and all READ operations can 
004344  ** return a null row (2-bytes: 0x01 0x00).
004345  **
004346  ** No checking is done to make sure that page iTable really is the
004347  ** root page of a b-tree.  If it is not, then the cursor acquired
004348  ** will not work correctly.
004349  **
004350  ** It is assumed that the sqlite3BtreeCursorZero() has been called
004351  ** on pCur to initialize the memory space prior to invoking this routine.
004352  */
004353  static int btreeCursor(
004354    Btree *p,                              /* The btree */
004355    int iTable,                            /* Root page of table to open */
004356    int wrFlag,                            /* 1 to write. 0 read-only */
004357    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
004358    BtCursor *pCur                         /* Space for new cursor */
004359  ){
004360    BtShared *pBt = p->pBt;                /* Shared b-tree handle */
004361    BtCursor *pX;                          /* Looping over other all cursors */
004362  
004363    assert( sqlite3BtreeHoldsMutex(p) );
004364    assert( wrFlag==0 
004365         || wrFlag==BTREE_WRCSR 
004366         || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 
004367    );
004368  
004369    /* The following assert statements verify that if this is a sharable 
004370    ** b-tree database, the connection is holding the required table locks, 
004371    ** and that no other connection has any open cursor that conflicts with 
004372    ** this lock.  The iTable<1 term disables the check for corrupt schemas. */
004373    assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
004374            || iTable<1 );
004375    assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
004376  
004377    /* Assert that the caller has opened the required transaction. */
004378    assert( p->inTrans>TRANS_NONE );
004379    assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
004380    assert( pBt->pPage1 && pBt->pPage1->aData );
004381    assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
004382  
004383    if( wrFlag ){
004384      allocateTempSpace(pBt);
004385      if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
004386    }
004387    if( iTable<=1 ){
004388      if( iTable<1 ){
004389        return SQLITE_CORRUPT_BKPT;
004390      }else if( btreePagecount(pBt)==0 ){
004391        assert( wrFlag==0 );
004392        iTable = 0;
004393      }
004394    }
004395  
004396    /* Now that no other errors can occur, finish filling in the BtCursor
004397    ** variables and link the cursor into the BtShared list.  */
004398    pCur->pgnoRoot = (Pgno)iTable;
004399    pCur->iPage = -1;
004400    pCur->pKeyInfo = pKeyInfo;
004401    pCur->pBtree = p;
004402    pCur->pBt = pBt;
004403    pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
004404    pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
004405    /* If there are two or more cursors on the same btree, then all such
004406    ** cursors *must* have the BTCF_Multiple flag set. */
004407    for(pX=pBt->pCursor; pX; pX=pX->pNext){
004408      if( pX->pgnoRoot==(Pgno)iTable ){
004409        pX->curFlags |= BTCF_Multiple;
004410        pCur->curFlags |= BTCF_Multiple;
004411      }
004412    }
004413    pCur->pNext = pBt->pCursor;
004414    pBt->pCursor = pCur;
004415    pCur->eState = CURSOR_INVALID;
004416    return SQLITE_OK;
004417  }
004418  static int btreeCursorWithLock(
004419    Btree *p,                              /* The btree */
004420    int iTable,                            /* Root page of table to open */
004421    int wrFlag,                            /* 1 to write. 0 read-only */
004422    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
004423    BtCursor *pCur                         /* Space for new cursor */
004424  ){
004425    int rc;
004426    sqlite3BtreeEnter(p);
004427    rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004428    sqlite3BtreeLeave(p);
004429    return rc;
004430  }
004431  int sqlite3BtreeCursor(
004432    Btree *p,                                   /* The btree */
004433    int iTable,                                 /* Root page of table to open */
004434    int wrFlag,                                 /* 1 to write. 0 read-only */
004435    struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
004436    BtCursor *pCur                              /* Write new cursor here */
004437  ){
004438    if( p->sharable ){
004439      return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
004440    }else{
004441      return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004442    }
004443  }
004444  
004445  /*
004446  ** Return the size of a BtCursor object in bytes.
004447  **
004448  ** This interfaces is needed so that users of cursors can preallocate
004449  ** sufficient storage to hold a cursor.  The BtCursor object is opaque
004450  ** to users so they cannot do the sizeof() themselves - they must call
004451  ** this routine.
004452  */
004453  int sqlite3BtreeCursorSize(void){
004454    return ROUND8(sizeof(BtCursor));
004455  }
004456  
004457  /*
004458  ** Initialize memory that will be converted into a BtCursor object.
004459  **
004460  ** The simple approach here would be to memset() the entire object
004461  ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
004462  ** do not need to be zeroed and they are large, so we can save a lot
004463  ** of run-time by skipping the initialization of those elements.
004464  */
004465  void sqlite3BtreeCursorZero(BtCursor *p){
004466    memset(p, 0, offsetof(BtCursor, BTCURSOR_FIRST_UNINIT));
004467  }
004468  
004469  /*
004470  ** Close a cursor.  The read lock on the database file is released
004471  ** when the last cursor is closed.
004472  */
004473  int sqlite3BtreeCloseCursor(BtCursor *pCur){
004474    Btree *pBtree = pCur->pBtree;
004475    if( pBtree ){
004476      BtShared *pBt = pCur->pBt;
004477      sqlite3BtreeEnter(pBtree);
004478      assert( pBt->pCursor!=0 );
004479      if( pBt->pCursor==pCur ){
004480        pBt->pCursor = pCur->pNext;
004481      }else{
004482        BtCursor *pPrev = pBt->pCursor;
004483        do{
004484          if( pPrev->pNext==pCur ){
004485            pPrev->pNext = pCur->pNext;
004486            break;
004487          }
004488          pPrev = pPrev->pNext;
004489        }while( ALWAYS(pPrev) );
004490      }
004491      btreeReleaseAllCursorPages(pCur);
004492      unlockBtreeIfUnused(pBt);
004493      sqlite3_free(pCur->aOverflow);
004494      sqlite3_free(pCur->pKey);
004495      sqlite3BtreeLeave(pBtree);
004496      pCur->pBtree = 0;
004497    }
004498    return SQLITE_OK;
004499  }
004500  
004501  /*
004502  ** Make sure the BtCursor* given in the argument has a valid
004503  ** BtCursor.info structure.  If it is not already valid, call
004504  ** btreeParseCell() to fill it in.
004505  **
004506  ** BtCursor.info is a cache of the information in the current cell.
004507  ** Using this cache reduces the number of calls to btreeParseCell().
004508  */
004509  #ifndef NDEBUG
004510    static int cellInfoEqual(CellInfo *a, CellInfo *b){
004511      if( a->nKey!=b->nKey ) return 0;
004512      if( a->pPayload!=b->pPayload ) return 0;
004513      if( a->nPayload!=b->nPayload ) return 0;
004514      if( a->nLocal!=b->nLocal ) return 0;
004515      if( a->nSize!=b->nSize ) return 0;
004516      return 1;
004517    }
004518    static void assertCellInfo(BtCursor *pCur){
004519      CellInfo info;
004520      memset(&info, 0, sizeof(info));
004521      btreeParseCell(pCur->pPage, pCur->ix, &info);
004522      assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
004523    }
004524  #else
004525    #define assertCellInfo(x)
004526  #endif
004527  static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
004528    if( pCur->info.nSize==0 ){
004529      pCur->curFlags |= BTCF_ValidNKey;
004530      btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
004531    }else{
004532      assertCellInfo(pCur);
004533    }
004534  }
004535  
004536  #ifndef NDEBUG  /* The next routine used only within assert() statements */
004537  /*
004538  ** Return true if the given BtCursor is valid.  A valid cursor is one
004539  ** that is currently pointing to a row in a (non-empty) table.
004540  ** This is a verification routine is used only within assert() statements.
004541  */
004542  int sqlite3BtreeCursorIsValid(BtCursor *pCur){
004543    return pCur && pCur->eState==CURSOR_VALID;
004544  }
004545  #endif /* NDEBUG */
004546  int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
004547    assert( pCur!=0 );
004548    return pCur->eState==CURSOR_VALID;
004549  }
004550  
004551  /*
004552  ** Return the value of the integer key or "rowid" for a table btree.
** This routine is only valid for a cursor that is pointing into an
** ordinary table btree.  If the cursor points to an index btree or
004555  ** is invalid, the result of this routine is undefined.
004556  */
004557  i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
004558    assert( cursorHoldsMutex(pCur) );
004559    assert( pCur->eState==CURSOR_VALID );
004560    assert( pCur->curIntKey );
004561    getCellInfo(pCur);
004562    return pCur->info.nKey;
004563  }
004564  
#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
/*
** Return the offset into the database file for the start of the
** payload to which the cursor is pointing.
**
** The result is the sum of two terms: the file offset of the page the
** cursor is on (page numbers start at 1, hence the "- 1") and the
** distance of the payload from the start of that page's data.
*/
i64 sqlite3BtreeOffset(BtCursor *pCur){
  i64 iPageStart;                 /* File offset of the cursor's page */
  i64 iWithinPage;                /* Payload offset inside that page */

  assert( cursorHoldsMutex(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  getCellInfo(pCur);
  iPageStart = (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1);
  iWithinPage = (i64)(pCur->info.pPayload - pCur->pPage->aData);
  return iPageStart + iWithinPage;
}
#endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
004578  
004579  /*
004580  ** Return the number of bytes of payload for the entry that pCur is
004581  ** currently pointing to.  For table btrees, this will be the amount
004582  ** of data.  For index btrees, this will be the size of the key.
004583  **
004584  ** The caller must guarantee that the cursor is pointing to a non-NULL
004585  ** valid entry.  In other words, the calling procedure must guarantee
004586  ** that the cursor has Cursor.eState==CURSOR_VALID.
004587  */
004588  u32 sqlite3BtreePayloadSize(BtCursor *pCur){
004589    assert( cursorHoldsMutex(pCur) );
004590    assert( pCur->eState==CURSOR_VALID );
004591    getCellInfo(pCur);
004592    return pCur->info.nPayload;
004593  }
004594  
004595  /*
004596  ** Return an upper bound on the size of any record for the table
004597  ** that the cursor is pointing into.
004598  **
004599  ** This is an optimization.  Everything will still work if this
004600  ** routine always returns 2147483647 (which is the largest record
004601  ** that SQLite can handle) or more.  But returning a smaller value might
004602  ** prevent large memory allocations when trying to interpret a
** corrupt database.
004604  **
004605  ** The current implementation merely returns the size of the underlying
004606  ** database file.
004607  */
004608  sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
004609    assert( cursorHoldsMutex(pCur) );
004610    assert( pCur->eState==CURSOR_VALID );
004611    return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
004612  }
004613  
004614  /*
004615  ** Given the page number of an overflow page in the database (parameter
004616  ** ovfl), this function finds the page number of the next page in the 
004617  ** linked list of overflow pages. If possible, it uses the auto-vacuum
004618  ** pointer-map data instead of reading the content of page ovfl to do so. 
004619  **
004620  ** If an error occurs an SQLite error code is returned. Otherwise:
004621  **
004622  ** The page number of the next overflow page in the linked list is 
004623  ** written to *pPgnoNext. If page ovfl is the last page in its linked 
004624  ** list, *pPgnoNext is set to zero. 
004625  **
** If ppPage is not NULL, and a reference to the MemPage object corresponding
** to page number ovfl was obtained, then *ppPage is set to point to that
** reference. It is the responsibility of the caller to call releasePage()
** on *ppPage to free the reference. If no reference was obtained (because
** the pointer-map was used to obtain the value for *pPgnoNext), then
** *ppPage is set to zero.
004632  */
004633  static int getOverflowPage(
004634    BtShared *pBt,               /* The database file */
004635    Pgno ovfl,                   /* Current overflow page number */
004636    MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
004637    Pgno *pPgnoNext              /* OUT: Next overflow page number */
004638  ){
004639    Pgno next = 0;
004640    MemPage *pPage = 0;
004641    int rc = SQLITE_OK;
004642  
004643    assert( sqlite3_mutex_held(pBt->mutex) );
004644    assert(pPgnoNext);
004645  
004646  #ifndef SQLITE_OMIT_AUTOVACUUM
004647    /* Try to find the next page in the overflow list using the
004648    ** autovacuum pointer-map pages. Guess that the next page in 
004649    ** the overflow list is page number (ovfl+1). If that guess turns 
004650    ** out to be wrong, fall back to loading the data of page 
004651    ** number ovfl to determine the next page number.
004652    */
004653    if( pBt->autoVacuum ){
004654      Pgno pgno;
004655      Pgno iGuess = ovfl+1;
004656      u8 eType;
004657  
004658      while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
004659        iGuess++;
004660      }
004661  
004662      if( iGuess<=btreePagecount(pBt) ){
004663        rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
004664        if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
004665          next = iGuess;
004666          rc = SQLITE_DONE;
004667        }
004668      }
004669    }
004670  #endif
004671  
004672    assert( next==0 || rc==SQLITE_DONE );
004673    if( rc==SQLITE_OK ){
004674      rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
004675      assert( rc==SQLITE_OK || pPage==0 );
004676      if( rc==SQLITE_OK ){
004677        next = get4byte(pPage->aData);
004678      }
004679    }
004680  
004681    *pPgnoNext = next;
004682    if( ppPage ){
004683      *ppPage = pPage;
004684    }else{
004685      releasePage(pPage);
004686    }
004687    return (rc==SQLITE_DONE ? SQLITE_OK : rc);
004688  }
004689  
004690  /*
004691  ** Copy data from a buffer to a page, or from a page to a buffer.
004692  **
004693  ** pPayload is a pointer to data stored on database page pDbPage.
004694  ** If argument eOp is false, then nByte bytes of data are copied
004695  ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
004696  ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
004697  ** of data are copied from the buffer pBuf to pPayload.
004698  **
004699  ** SQLITE_OK is returned on success, otherwise an error code.
004700  */
004701  static int copyPayload(
004702    void *pPayload,           /* Pointer to page data */
004703    void *pBuf,               /* Pointer to buffer */
004704    int nByte,                /* Number of bytes to copy */
004705    int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
004706    DbPage *pDbPage           /* Page containing pPayload */
004707  ){
004708    if( eOp ){
004709      /* Copy data from buffer to page (a write operation) */
004710      int rc = sqlite3PagerWrite(pDbPage);
004711      if( rc!=SQLITE_OK ){
004712        return rc;
004713      }
004714      memcpy(pPayload, pBuf, nByte);
004715    }else{
004716      /* Copy data from page to buffer (a read operation) */
004717      memcpy(pBuf, pPayload, nByte);
004718    }
004719    return SQLITE_OK;
004720  }
004721  
004722  /*
004723  ** This function is used to read or overwrite payload information
004724  ** for the entry that the pCur cursor is pointing to. The eOp
004725  ** argument is interpreted as follows:
004726  **
004727  **   0: The operation is a read. Populate the overflow cache.
004728  **   1: The operation is a write. Populate the overflow cache.
004729  **
004730  ** A total of "amt" bytes are read or written beginning at "offset".
004731  ** Data is read to or from the buffer pBuf.
004732  **
004733  ** The content being read or written might appear on the main page
004734  ** or be scattered out on multiple overflow pages.
004735  **
004736  ** If the current cursor entry uses one or more overflow pages
004737  ** this function may allocate space for and lazily populate
004738  ** the overflow page-list cache array (BtCursor.aOverflow). 
004739  ** Subsequent calls use this cache to make seeking to the supplied offset 
004740  ** more efficient.
004741  **
004742  ** Once an overflow page-list cache has been allocated, it must be
004743  ** invalidated if some other cursor writes to the same table, or if
004744  ** the cursor is moved to a different row. Additionally, in auto-vacuum
004745  ** mode, the following events may invalidate an overflow page-list cache.
004746  **
004747  **   * An incremental vacuum,
004748  **   * A commit in auto_vacuum="full" mode,
004749  **   * Creating a table (may require moving an overflow page).
004750  */
static int accessPayload(
  BtCursor *pCur,      /* Cursor pointing to entry to read from or write to */
  u32 offset,          /* Begin this far into the payload */
  u32 amt,             /* Number of bytes to read or write */
  unsigned char *pBuf, /* Destination (read) or source (write) buffer */ 
  int eOp              /* zero to read. non-zero to write. */
){
  unsigned char *aPayload;
  int rc = SQLITE_OK;
  int iIdx = 0;                               /* Index into pCur->aOverflow[] */
  MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );
  assert( eOp==0 || eOp==1 );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->ix<pPage->nCell );
  assert( cursorHoldsMutex(pCur) );

  /* Make sure pCur->info describes the current cell. */
  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
    /* Trying to read or write past the end of the data is an error.  The
    ** conditional above is really:
    **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
    ** but is recast into its current form to avoid integer overflow problems
    */
    return SQLITE_CORRUPT_PAGE(pPage);
  }

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    /* Part (or all) of the requested range lies in the local portion of
    ** the payload.  Transfer at most the bytes that are stored locally. */
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    /* Whatever remains starts at the very beginning of the overflow data. */
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    /* The range starts beyond the local data.  Make offset relative to the
    ** start of the overflow content. */
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    /* The four bytes that follow the local payload are read as the page
    ** number of the first overflow page. */
    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      /* Number of overflow pages needed for the non-local payload,
      ** rounded up. */
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( pCur->aOverflow==0
       || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
      ){
        /* No array yet, or the existing one is too small.  The new
        ** allocation has room for twice the number of entries needed. */
        Pgno *aNew = (Pgno*)sqlite3Realloc(
            pCur->aOverflow, nOvfl*2*sizeof(Pgno)
        );
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->aOverflow = aNew;
        }
      }
      /* Mark the first nOvfl entries as "not yet known". */
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }else{
      /* If the overflow page-list cache has been allocated and the
      ** entry for the first required overflow page is valid, skip
      ** directly to it.
      */
      if( pCur->aOverflow[offset/ovflSize] ){
        iIdx = (offset/ovflSize);
        nextPage = pCur->aOverflow[iIdx];
        offset = (offset%ovflSize);
      }
    }

    assert( rc==SQLITE_OK && amt>0 );
    /* Walk the overflow chain.  On each iteration nextPage is the page
    ** number of the iIdx-th overflow page and offset is the number of
    ** overflow content bytes still to be skipped before the transfer
    ** starts. */
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */
      assert( pCur->aOverflow[iIdx]==0
              || pCur->aOverflow[iIdx]==nextPage
              || CORRUPT_DB );
      pCur->aOverflow[iIdx] = nextPage;

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        */
        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }
        /* One whole page of overflow content has been skipped. */
        offset -= ovflSize;
      }else{
        /* Need to read this page properly. It contains some of the
        ** range of data that is being read (eOp==0) or written (eOp!=0).
        */
        int a = amt;
        /* Clamp the transfer to what this overflow page holds after
        ** the first "offset" bytes. */
        if( a + offset > ovflSize ){
          a = ovflSize - offset;
        }

#ifdef SQLITE_DIRECT_OVERFLOW_READ
        /* If all the following are true:
        **
        **   1) this is a read operation, and 
        **   2) data is required from the start of this overflow page, and
        **   3) there are no dirty pages in the page-cache
        **   4) the database is file-backed, and
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer 
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( eOp==0                                             /* (1) */
         && offset==0                                          /* (2) */
         && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          /* The read below starts 4 bytes before pBuf so that the 4-byte
          ** next-page pointer at the start of the overflow page lands in
          ** aWrite[0..3] and the content lands at pBuf.  The 4 output
          ** bytes that get overwritten are saved first and restored
          ** once the pointer has been extracted. */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          /* A failed read of a page number beyond pBt->nPage is reported
          ** as corruption rather than with the original error code. */
          if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT;
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          /* Go through the pager: fetch the overflow page, pick up the
          ** next-page pointer from its first 4 bytes and transfer "a"
          ** bytes of the content that follows that pointer. */
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              (eOp==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        /* Everything requested has been transferred (or rc holds the
        ** error from the last step). */
        if( amt==0 ) return rc;
        pBuf += a;
      }
      if( rc ) break;
      iIdx++;
    }
  }

  if( rc==SQLITE_OK && amt>0 ){
    /* Overflow chain ends prematurely */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  return rc;
}
004934  
004935  /*
004936  ** Read part of the payload for the row at which that cursor pCur is currently
004937  ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
004938  ** begins at "offset".
004939  **
004940  ** pCur can be pointing to either a table or an index b-tree.
004941  ** If pointing to a table btree, then the content section is read.  If
004942  ** pCur is pointing to an index b-tree then the key section is read.
004943  **
004944  ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
004945  ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
004946  ** cursor might be invalid or might need to be restored before being read.
004947  **
004948  ** Return SQLITE_OK on success or an error code if anything goes
004949  ** wrong.  An error is returned if "offset+amt" is larger than
004950  ** the available payload.
004951  */
004952  int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004953    assert( cursorHoldsMutex(pCur) );
004954    assert( pCur->eState==CURSOR_VALID );
004955    assert( pCur->iPage>=0 && pCur->pPage );
004956    assert( pCur->ix<pCur->pPage->nCell );
004957    return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
004958  }
004959  
004960  /*
** This variant of sqlite3BtreePayload() works even if the cursor is not
** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
004963  ** interface.
004964  */
004965  #ifndef SQLITE_OMIT_INCRBLOB
004966  static SQLITE_NOINLINE int accessPayloadChecked(
004967    BtCursor *pCur,
004968    u32 offset,
004969    u32 amt,
004970    void *pBuf
004971  ){
004972    int rc;
004973    if ( pCur->eState==CURSOR_INVALID ){
004974      return SQLITE_ABORT;
004975    }
004976    assert( cursorOwnsBtShared(pCur) );
004977    rc = btreeRestoreCursorPosition(pCur);
004978    return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
004979  }
004980  int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004981    if( pCur->eState==CURSOR_VALID ){
004982      assert( cursorOwnsBtShared(pCur) );
004983      return accessPayload(pCur, offset, amt, pBuf, 0);
004984    }else{
004985      return accessPayloadChecked(pCur, offset, amt, pBuf);
004986    }
004987  }
004988  #endif /* SQLITE_OMIT_INCRBLOB */
004989  
004990  /*
004991  ** Return a pointer to payload information from the entry that the 
004992  ** pCur cursor is pointing to.  The pointer is to the beginning of
004993  ** the key if index btrees (pPage->intKey==0) and is the data for
004994  ** table btrees (pPage->intKey==1). The number of bytes of available
004995  ** key/data is written into *pAmt.  If *pAmt==0, then the value
004996  ** returned will not be a valid pointer.
004997  **
004998  ** This routine is an optimization.  It is common for the entire key
004999  ** and data to fit on the local page and for there to be no overflow
005000  ** pages.  When that is so, this routine can be used to access the
005001  ** key and data without making a copy.  If the key and/or data spills
005002  ** onto overflow pages, then accessPayload() must be used to reassemble
005003  ** the key/data and copy it into a preallocated buffer.
005004  **
005005  ** The pointer returned by this routine looks directly into the cached
005006  ** page of the database.  The data might change or move the next time
005007  ** any btree routine is called.
005008  */
005009  static const void *fetchPayload(
005010    BtCursor *pCur,      /* Cursor pointing to entry to read from */
005011    u32 *pAmt            /* Write the number of available bytes here */
005012  ){
005013    int amt;
005014    assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
005015    assert( pCur->eState==CURSOR_VALID );
005016    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005017    assert( cursorOwnsBtShared(pCur) );
005018    assert( pCur->ix<pCur->pPage->nCell );
005019    assert( pCur->info.nSize>0 );
005020    assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
005021    assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
005022    amt = pCur->info.nLocal;
005023    if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
005024      /* There is too little space on the page for the expected amount
005025      ** of local content. Database must be corrupt. */
005026      assert( CORRUPT_DB );
005027      amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
005028    }
005029    *pAmt = (u32)amt;
005030    return (void*)pCur->info.pPayload;
005031  }
005032  
005033  
005034  /*
** For the entry that cursor pCur is pointing to, return as
005036  ** many bytes of the key or data as are available on the local
005037  ** b-tree page.  Write the number of available bytes into *pAmt.
005038  **
005039  ** The pointer returned is ephemeral.  The key/data may move
005040  ** or be destroyed on the next call to any Btree routine,
005041  ** including calls from other threads against the same cache.
005042  ** Hence, a mutex on the BtShared should be held prior to calling
005043  ** this routine.
005044  **
** This routine is used to get quick access to key and data
005046  ** in the common case where no overflow pages are used.
005047  */
005048  const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
005049    return fetchPayload(pCur, pAmt);
005050  }
005051  
005052  
005053  /*
005054  ** Move the cursor down to a new child page.  The newPgno argument is the
005055  ** page number of the child page to move to.
005056  **
005057  ** This function returns SQLITE_CORRUPT if the page-header flags field of
005058  ** the new child page does not match the flags field of the parent (i.e.
005059  ** if an intkey page appears to be the parent of a non-intkey page, or
005060  ** vice-versa).
005061  */
005062  static int moveToChild(BtCursor *pCur, u32 newPgno){
005063    BtShared *pBt = pCur->pBt;
005064  
005065    assert( cursorOwnsBtShared(pCur) );
005066    assert( pCur->eState==CURSOR_VALID );
005067    assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
005068    assert( pCur->iPage>=0 );
005069    if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
005070      return SQLITE_CORRUPT_BKPT;
005071    }
005072    pCur->info.nSize = 0;
005073    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005074    pCur->aiIdx[pCur->iPage] = pCur->ix;
005075    pCur->apPage[pCur->iPage] = pCur->pPage;
005076    pCur->ix = 0;
005077    pCur->iPage++;
005078    return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
005079  }
005080  
#ifdef SQLITE_DEBUG
/*
** Page pParent is an internal (non-leaf) tree page. This function
** asserts that page number iChild is the left-child of the iIdx'th
** cell in page pParent. Or, if iIdx is equal to the total number of
** cells in pParent, that page number iChild is the right-child of
** the page (the 4-byte value at offset hdrOffset+8 of the page data).
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  /* The conditions tested below might not be true in a corrupt database */
  if( CORRUPT_DB ) return;

  assert( iIdx<=pParent->nCell );
  if( iIdx<pParent->nCell ){
    /* iIdx names a cell: its first four bytes hold the left-child pointer */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* iIdx is one past the last cell: check the right-child pointer */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z) 
#endif
005102  
005103  /*
005104  ** Move the cursor up to the parent page.
005105  **
** pCur->ix is set to the cell index that contains the pointer
** to the page we are coming from.  If we are coming from the
** right-most child page then pCur->ix is set to one more than
** the largest cell index.
005110  */
005111  static void moveToParent(BtCursor *pCur){
005112    MemPage *pLeaf;
005113    assert( cursorOwnsBtShared(pCur) );
005114    assert( pCur->eState==CURSOR_VALID );
005115    assert( pCur->iPage>0 );
005116    assert( pCur->pPage );
005117    assertParentIndex(
005118      pCur->apPage[pCur->iPage-1], 
005119      pCur->aiIdx[pCur->iPage-1], 
005120      pCur->pPage->pgno
005121    );
005122    testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
005123    pCur->info.nSize = 0;
005124    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005125    pCur->ix = pCur->aiIdx[pCur->iPage-1];
005126    pLeaf = pCur->pPage;
005127    pCur->pPage = pCur->apPage[--pCur->iPage];
005128    releasePageNotNull(pLeaf);
005129  }
005130  
005131  /*
005132  ** Move the cursor to point to the root page of its b-tree structure.
005133  **
005134  ** If the table has a virtual root page, then the cursor is moved to point
005135  ** to the virtual root page instead of the actual root page. A table has a
005136  ** virtual root page when the actual root page contains no cells and a 
005137  ** single child page. This can only happen with the table rooted at page 1.
005138  **
005139  ** If the b-tree structure is empty, the cursor state is set to 
005140  ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
005141  ** the cursor is set to point to the first cell located on the root
005142  ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
005143  **
005144  ** If this function returns successfully, it may be assumed that the
005145  ** page-header flags indicate that the [virtual] root-page is the expected 
005146  ** kind of b-tree page (i.e. if when opening the cursor the caller did not
005147  ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
005148  ** indicating a table b-tree, or if the caller did specify a KeyInfo 
005149  ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
005150  ** b-tree).
005151  */
static int moveToRoot(BtCursor *pCur){
  MemPage *pRoot;
  int rc = SQLITE_OK;

  assert( cursorOwnsBtShared(pCur) );
  /* The eState comparisons below rely on this ordering of the states. */
  assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
  assert( pCur->pgnoRoot>0 || pCur->iPage<0 );

  if( pCur->iPage>=0 ){
    /* The cursor already holds at least the page at depth 0. */
    if( pCur->iPage ){
      /* The cursor is below depth 0.  Release the current page and every
      ** stacked page above depth 0, then make apPage[0] current.  The
      ** root-page type check further down is bypassed on this path. */
      releasePageNotNull(pCur->pPage);
      while( --pCur->iPage ){
        releasePageNotNull(pCur->apPage[pCur->iPage]);
      }
      pCur->pPage = pCur->apPage[0];
      goto skip_init;
    }
    /* iPage==0: pCur->pPage is already the depth-0 page.  Fall through to
    ** the root-page check. */
  }else if( pCur->pgnoRoot==0 ){
    /* No page is loaded and there is no root page number. */
    pCur->eState = CURSOR_INVALID;
    return SQLITE_EMPTY;
  }else{
    /* No page is loaded yet: fetch the root page. */
    assert( pCur->iPage==(-1) );
    if( pCur->eState>=CURSOR_REQUIRESEEK ){
      if( pCur->eState==CURSOR_FAULT ){
        /* For a cursor in the fault state, skipNext is returned as the
        ** result code. */
        assert( pCur->skipNext!=SQLITE_OK );
        return pCur->skipNext;
      }
      sqlite3BtreeClearCursor(pCur);
    }
    rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
                        0, pCur->curPagerFlags);
    if( rc!=SQLITE_OK ){
      pCur->eState = CURSOR_INVALID;
      return rc;
    }
    pCur->iPage = 0;
    pCur->curIntKey = pCur->pPage->intKey;
  }
  pRoot = pCur->pPage;
  assert( pRoot->pgno==pCur->pgnoRoot );

  /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
  ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
  ** NULL, the caller expects a table b-tree. If this is not the case,
  ** return an SQLITE_CORRUPT error. 
  **
  ** Earlier versions of SQLite assumed that this test could not fail
  ** if the root page was already loaded when this function was called (i.e.
  ** if pCur->iPage>=0). But this is not so if the database is corrupted 
  ** in such a way that page pRoot is linked into a second b-tree table 
  ** (or the freelist).  */
  assert( pRoot->intKey==1 || pRoot->intKey==0 );
  if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
    return SQLITE_CORRUPT_PAGE(pCur->pPage);
  }

skip_init:  
  /* Point at cell 0 and discard the cached cell information, the overflow
  ** list and the at-last flag. */
  pCur->ix = 0;
  pCur->info.nSize = 0;
  pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);

  pRoot = pCur->pPage;
  if( pRoot->nCell>0 ){
    /* The root page has at least one cell. */
    pCur->eState = CURSOR_VALID;
  }else if( !pRoot->leaf ){
    /* A non-leaf root with no cells is accepted only for page 1.  Descend
    ** into the child whose page number is the 4-byte value at offset
    ** hdrOffset+8 of the page data.  eState is set to CURSOR_VALID before
    ** the call because moveToChild() asserts that state. */
    Pgno subpage;
    if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
    subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
    pCur->eState = CURSOR_VALID;
    rc = moveToChild(pCur, subpage);
  }else{
    /* A leaf root page with no cells: the b-tree is empty. */
    pCur->eState = CURSOR_INVALID;
    rc = SQLITE_EMPTY;
  }
  return rc;
}
005231  
005232  /*
005233  ** Move the cursor down to the left-most leaf entry beneath the
005234  ** entry to which it is currently pointing.
005235  **
005236  ** The left-most leaf is the one with the smallest key - the first
005237  ** in ascending order.
005238  */
005239  static int moveToLeftmost(BtCursor *pCur){
005240    Pgno pgno;
005241    int rc = SQLITE_OK;
005242    MemPage *pPage;
005243  
005244    assert( cursorOwnsBtShared(pCur) );
005245    assert( pCur->eState==CURSOR_VALID );
005246    while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
005247      assert( pCur->ix<pPage->nCell );
005248      pgno = get4byte(findCell(pPage, pCur->ix));
005249      rc = moveToChild(pCur, pgno);
005250    }
005251    return rc;
005252  }
005253  
005254  /*
005255  ** Move the cursor down to the right-most leaf entry beneath the
005256  ** page to which it is currently pointing.  Notice the difference
005257  ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
005258  ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
005259  ** finds the right-most entry beneath the *page*.
005260  **
005261  ** The right-most entry is the one with the largest key - the last
005262  ** key in ascending order.
005263  */
005264  static int moveToRightmost(BtCursor *pCur){
005265    Pgno pgno;
005266    int rc = SQLITE_OK;
005267    MemPage *pPage = 0;
005268  
005269    assert( cursorOwnsBtShared(pCur) );
005270    assert( pCur->eState==CURSOR_VALID );
005271    while( !(pPage = pCur->pPage)->leaf ){
005272      pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005273      pCur->ix = pPage->nCell;
005274      rc = moveToChild(pCur, pgno);
005275      if( rc ) return rc;
005276    }
005277    pCur->ix = pPage->nCell-1;
005278    assert( pCur->info.nSize==0 );
005279    assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
005280    return SQLITE_OK;
005281  }
005282  
005283  /* Move the cursor to the first entry in the table.  Return SQLITE_OK
005284  ** on success.  Set *pRes to 0 if the cursor actually points to something
005285  ** or set *pRes to 1 if the table is empty.
005286  */
005287  int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
005288    int rc;
005289  
005290    assert( cursorOwnsBtShared(pCur) );
005291    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005292    rc = moveToRoot(pCur);
005293    if( rc==SQLITE_OK ){
005294      assert( pCur->pPage->nCell>0 );
005295      *pRes = 0;
005296      rc = moveToLeftmost(pCur);
005297    }else if( rc==SQLITE_EMPTY ){
005298      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005299      *pRes = 1;
005300      rc = SQLITE_OK;
005301    }
005302    return rc;
005303  }
005304  
005305  /* Move the cursor to the last entry in the table.  Return SQLITE_OK
005306  ** on success.  Set *pRes to 0 if the cursor actually points to something
005307  ** or set *pRes to 1 if the table is empty.
005308  */
005309  int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
005310    int rc;
005311   
005312    assert( cursorOwnsBtShared(pCur) );
005313    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005314  
005315    /* If the cursor already points to the last entry, this is a no-op. */
005316    if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
005317  #ifdef SQLITE_DEBUG
005318      /* This block serves to assert() that the cursor really does point 
005319      ** to the last entry in the b-tree. */
005320      int ii;
005321      for(ii=0; ii<pCur->iPage; ii++){
005322        assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
005323      }
005324      assert( pCur->ix==pCur->pPage->nCell-1 );
005325      assert( pCur->pPage->leaf );
005326  #endif
005327      *pRes = 0;
005328      return SQLITE_OK;
005329    }
005330  
005331    rc = moveToRoot(pCur);
005332    if( rc==SQLITE_OK ){
005333      assert( pCur->eState==CURSOR_VALID );
005334      *pRes = 0;
005335      rc = moveToRightmost(pCur);
005336      if( rc==SQLITE_OK ){
005337        pCur->curFlags |= BTCF_AtLast;
005338      }else{
005339        pCur->curFlags &= ~BTCF_AtLast;
005340      }
005341    }else if( rc==SQLITE_EMPTY ){
005342      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005343      *pRes = 1;
005344      rc = SQLITE_OK;
005345    }
005346    return rc;
005347  }
005348  
005349  /* Move the cursor so that it points to an entry near the key 
005350  ** specified by pIdxKey or intKey.   Return a success code.
005351  **
005352  ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
005353  ** must be NULL.  For index tables, pIdxKey is used and intKey
005354  ** is ignored.
005355  **
005356  ** If an exact match is not found, then the cursor is always
005357  ** left pointing at a leaf page which would hold the entry if it
005358  ** were present.  The cursor might point to an entry that comes
005359  ** before or after the key.
005360  **
005361  ** An integer is written into *pRes which is the result of
005362  ** comparing the key with the entry to which the cursor is 
005363  ** pointing.  The meaning of the integer written into
005364  ** *pRes is as follows:
005365  **
005366  **     *pRes<0      The cursor is left pointing at an entry that
005367  **                  is smaller than intKey/pIdxKey or if the table is empty
005368  **                  and the cursor is therefore left point to nothing.
005369  **
005370  **     *pRes==0     The cursor is left pointing at an entry that
005371  **                  exactly matches intKey/pIdxKey.
005372  **
005373  **     *pRes>0      The cursor is left pointing at an entry that
005374  **                  is larger than intKey/pIdxKey.
005375  **
005376  ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
005377  ** exists an entry in the table that exactly matches pIdxKey.  
005378  */
005379  int sqlite3BtreeMovetoUnpacked(
005380    BtCursor *pCur,          /* The cursor to be moved */
005381    UnpackedRecord *pIdxKey, /* Unpacked index key */
005382    i64 intKey,              /* The table key */
005383    int biasRight,           /* If true, bias the search to the high end */
005384    int *pRes                /* Write search results here */
005385  ){
005386    int rc;
005387    RecordCompare xRecordCompare;
005388  
005389    assert( cursorOwnsBtShared(pCur) );
005390    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005391    assert( pRes );
005392    assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
005393    assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
005394  
005395    /* If the cursor is already positioned at the point we are trying
005396    ** to move to, then just return without doing any work */
005397    if( pIdxKey==0
005398     && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
005399    ){
005400      if( pCur->info.nKey==intKey ){
005401        *pRes = 0;
005402        return SQLITE_OK;
005403      }
005404      if( pCur->info.nKey<intKey ){
005405        if( (pCur->curFlags & BTCF_AtLast)!=0 ){
005406          *pRes = -1;
005407          return SQLITE_OK;
005408        }
005409        /* If the requested key is one more than the previous key, then
005410        ** try to get there using sqlite3BtreeNext() rather than a full
005411        ** binary search.  This is an optimization only.  The correct answer
005412        ** is still obtained without this case, only a little more slowely */
005413        if( pCur->info.nKey+1==intKey ){
005414          *pRes = 0;
005415          rc = sqlite3BtreeNext(pCur, 0);
005416          if( rc==SQLITE_OK ){
005417            getCellInfo(pCur);
005418            if( pCur->info.nKey==intKey ){
005419              return SQLITE_OK;
005420            }
005421          }else if( rc==SQLITE_DONE ){
005422            rc = SQLITE_OK;
005423          }else{
005424            return rc;
005425          }
005426        }
005427      }
005428    }
005429  
005430    if( pIdxKey ){
005431      xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
005432      pIdxKey->errCode = 0;
005433      assert( pIdxKey->default_rc==1 
005434           || pIdxKey->default_rc==0 
005435           || pIdxKey->default_rc==-1
005436      );
005437    }else{
005438      xRecordCompare = 0; /* All keys are integers */
005439    }
005440  
005441    rc = moveToRoot(pCur);
005442    if( rc ){
005443      if( rc==SQLITE_EMPTY ){
005444        assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005445        *pRes = -1;
005446        return SQLITE_OK;
005447      }
005448      return rc;
005449    }
005450    assert( pCur->pPage );
005451    assert( pCur->pPage->isInit );
005452    assert( pCur->eState==CURSOR_VALID );
005453    assert( pCur->pPage->nCell > 0 );
005454    assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
005455    assert( pCur->curIntKey || pIdxKey );
005456    for(;;){
005457      int lwr, upr, idx, c;
005458      Pgno chldPg;
005459      MemPage *pPage = pCur->pPage;
005460      u8 *pCell;                          /* Pointer to current cell in pPage */
005461  
005462      /* pPage->nCell must be greater than zero. If this is the root-page
005463      ** the cursor would have been INVALID above and this for(;;) loop
005464      ** not run. If this is not the root-page, then the moveToChild() routine
005465      ** would have already detected db corruption. Similarly, pPage must
005466      ** be the right kind (index or table) of b-tree page. Otherwise
005467      ** a moveToChild() or moveToRoot() call would have detected corruption.  */
005468      assert( pPage->nCell>0 );
005469      assert( pPage->intKey==(pIdxKey==0) );
005470      lwr = 0;
005471      upr = pPage->nCell-1;
005472      assert( biasRight==0 || biasRight==1 );
005473      idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
005474      pCur->ix = (u16)idx;
005475      if( xRecordCompare==0 ){
005476        for(;;){
005477          i64 nCellKey;
005478          pCell = findCellPastPtr(pPage, idx);
005479          if( pPage->intKeyLeaf ){
005480            while( 0x80 <= *(pCell++) ){
005481              if( pCell>=pPage->aDataEnd ){
005482                return SQLITE_CORRUPT_PAGE(pPage);
005483              }
005484            }
005485          }
005486          getVarint(pCell, (u64*)&nCellKey);
005487          if( nCellKey<intKey ){
005488            lwr = idx+1;
005489            if( lwr>upr ){ c = -1; break; }
005490          }else if( nCellKey>intKey ){
005491            upr = idx-1;
005492            if( lwr>upr ){ c = +1; break; }
005493          }else{
005494            assert( nCellKey==intKey );
005495            pCur->ix = (u16)idx;
005496            if( !pPage->leaf ){
005497              lwr = idx;
005498              goto moveto_next_layer;
005499            }else{
005500              pCur->curFlags |= BTCF_ValidNKey;
005501              pCur->info.nKey = nCellKey;
005502              pCur->info.nSize = 0;
005503              *pRes = 0;
005504              return SQLITE_OK;
005505            }
005506          }
005507          assert( lwr+upr>=0 );
005508          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
005509        }
005510      }else{
005511        for(;;){
005512          int nCell;  /* Size of the pCell cell in bytes */
005513          pCell = findCellPastPtr(pPage, idx);
005514  
005515          /* The maximum supported page-size is 65536 bytes. This means that
005516          ** the maximum number of record bytes stored on an index B-Tree
005517          ** page is less than 16384 bytes and may be stored as a 2-byte
005518          ** varint. This information is used to attempt to avoid parsing 
005519          ** the entire cell by checking for the cases where the record is 
005520          ** stored entirely within the b-tree page by inspecting the first 
005521          ** 2 bytes of the cell.
005522          */
005523          nCell = pCell[0];
005524          if( nCell<=pPage->max1bytePayload ){
005525            /* This branch runs if the record-size field of the cell is a
005526            ** single byte varint and the record fits entirely on the main
005527            ** b-tree page.  */
005528            testcase( pCell+nCell+1==pPage->aDataEnd );
005529            c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
005530          }else if( !(pCell[1] & 0x80) 
005531            && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
005532          ){
005533            /* The record-size field is a 2 byte varint and the record 
005534            ** fits entirely on the main b-tree page.  */
005535            testcase( pCell+nCell+2==pPage->aDataEnd );
005536            c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
005537          }else{
005538            /* The record flows over onto one or more overflow pages. In
005539            ** this case the whole cell needs to be parsed, a buffer allocated
005540            ** and accessPayload() used to retrieve the record into the
005541            ** buffer before VdbeRecordCompare() can be called. 
005542            **
005543            ** If the record is corrupt, the xRecordCompare routine may read
005544            ** up to two varints past the end of the buffer. An extra 18 
005545            ** bytes of padding is allocated at the end of the buffer in
005546            ** case this happens.  */
005547            void *pCellKey;
005548            u8 * const pCellBody = pCell - pPage->childPtrSize;
005549            const int nOverrun = 18;  /* Size of the overrun padding */
005550            pPage->xParseCell(pPage, pCellBody, &pCur->info);
005551            nCell = (int)pCur->info.nKey;
005552            testcase( nCell<0 );   /* True if key size is 2^32 or more */
005553            testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
005554            testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
005555            testcase( nCell==2 );  /* Minimum legal index key size */
005556            if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
005557              rc = SQLITE_CORRUPT_PAGE(pPage);
005558              goto moveto_finish;
005559            }
005560            pCellKey = sqlite3Malloc( nCell+nOverrun );
005561            if( pCellKey==0 ){
005562              rc = SQLITE_NOMEM_BKPT;
005563              goto moveto_finish;
005564            }
005565            pCur->ix = (u16)idx;
005566            rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
005567            memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
005568            pCur->curFlags &= ~BTCF_ValidOvfl;
005569            if( rc ){
005570              sqlite3_free(pCellKey);
005571              goto moveto_finish;
005572            }
005573            c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
005574            sqlite3_free(pCellKey);
005575          }
005576          assert( 
005577              (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
005578           && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
005579          );
005580          if( c<0 ){
005581            lwr = idx+1;
005582          }else if( c>0 ){
005583            upr = idx-1;
005584          }else{
005585            assert( c==0 );
005586            *pRes = 0;
005587            rc = SQLITE_OK;
005588            pCur->ix = (u16)idx;
005589            if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
005590            goto moveto_finish;
005591          }
005592          if( lwr>upr ) break;
005593          assert( lwr+upr>=0 );
005594          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
005595        }
005596      }
005597      assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
005598      assert( pPage->isInit );
005599      if( pPage->leaf ){
005600        assert( pCur->ix<pCur->pPage->nCell );
005601        pCur->ix = (u16)idx;
005602        *pRes = c;
005603        rc = SQLITE_OK;
005604        goto moveto_finish;
005605      }
005606  moveto_next_layer:
005607      if( lwr>=pPage->nCell ){
005608        chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005609      }else{
005610        chldPg = get4byte(findCell(pPage, lwr));
005611      }
005612      pCur->ix = (u16)lwr;
005613      rc = moveToChild(pCur, chldPg);
005614      if( rc ) break;
005615    }
005616  moveto_finish:
005617    pCur->info.nSize = 0;
005618    assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005619    return rc;
005620  }
005621  
005622  
005623  /*
005624  ** Return TRUE if the cursor is not pointing at an entry of the table.
005625  **
005626  ** TRUE will be returned after a call to sqlite3BtreeNext() moves
005627  ** past the last entry in the table or sqlite3BtreePrev() moves past
005628  ** the first entry.  TRUE is also returned if the table is empty.
005629  */
005630  int sqlite3BtreeEof(BtCursor *pCur){
005631    /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
005632    ** have been deleted? This API will need to change to return an error code
005633    ** as well as the boolean result value.
005634    */
005635    return (CURSOR_VALID!=pCur->eState);
005636  }
005637  
005638  /*
005639  ** Return an estimate for the number of rows in the table that pCur is
005640  ** pointing to.  Return a negative number if no estimate is currently 
005641  ** available.
005642  */
005643  i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
005644    i64 n;
005645    u8 i;
005646  
005647    assert( cursorOwnsBtShared(pCur) );
005648    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005649  
005650    /* Currently this interface is only called by the OP_IfSmaller
005651    ** opcode, and it that case the cursor will always be valid and
005652    ** will always point to a leaf node. */
005653    if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
005654    if( NEVER(pCur->pPage->leaf==0) ) return -1;
005655  
005656    n = pCur->pPage->nCell;
005657    for(i=0; i<pCur->iPage; i++){
005658      n *= pCur->apPage[i]->nCell;
005659    }
005660    return n;
005661  }
005662  
005663  /*
005664  ** Advance the cursor to the next entry in the database. 
005665  ** Return value:
005666  **
005667  **    SQLITE_OK        success
005668  **    SQLITE_DONE      cursor is already pointing at the last element
005669  **    otherwise        some kind of error occurred
005670  **
005671  ** The main entry point is sqlite3BtreeNext().  That routine is optimized
005672  ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
005673  ** to the next cell on the current page.  The (slower) btreeNext() helper
005674  ** routine is called when it is necessary to move to a different page or
005675  ** to restore the cursor.
005676  **
005677  ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
005678  ** cursor corresponds to an SQL index and this routine could have been
005679  ** skipped if the SQL index had been a unique index.  The F argument
005680  ** is a hint to the implement.  SQLite btree implementation does not use
005681  ** this hint, but COMDB2 does.
005682  */
005683  static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
005684    int rc;
005685    int idx;
005686    MemPage *pPage;
005687  
005688    assert( cursorOwnsBtShared(pCur) );
005689    if( pCur->eState!=CURSOR_VALID ){
005690      assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005691      rc = restoreCursorPosition(pCur);
005692      if( rc!=SQLITE_OK ){
005693        return rc;
005694      }
005695      if( CURSOR_INVALID==pCur->eState ){
005696        return SQLITE_DONE;
005697      }
005698      if( pCur->eState==CURSOR_SKIPNEXT ){
005699        pCur->eState = CURSOR_VALID;
005700        if( pCur->skipNext>0 ) return SQLITE_OK;
005701      }
005702    }
005703  
005704    pPage = pCur->pPage;
005705    idx = ++pCur->ix;
005706    if( !pPage->isInit ){
005707      /* The only known way for this to happen is for there to be a
005708      ** recursive SQL function that does a DELETE operation as part of a
005709      ** SELECT which deletes content out from under an active cursor
005710      ** in a corrupt database file where the table being DELETE-ed from
005711      ** has pages in common with the table being queried.  See TH3
005712      ** module cov1/btree78.test testcase 220 (2018-06-08) for an
005713      ** example. */
005714      return SQLITE_CORRUPT_BKPT;
005715    }
005716  
005717    /* If the database file is corrupt, it is possible for the value of idx 
005718    ** to be invalid here. This can only occur if a second cursor modifies
005719    ** the page while cursor pCur is holding a reference to it. Which can
005720    ** only happen if the database is corrupt in such a way as to link the
005721    ** page into more than one b-tree structure.
005722    **
005723    ** Update 2019-12-23: appears to long longer be possible after the
005724    ** addition of anotherValidCursor() condition on balance_deeper().  */
005725    harmless( idx>pPage->nCell );
005726  
005727    if( idx>=pPage->nCell ){
005728      if( !pPage->leaf ){
005729        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
005730        if( rc ) return rc;
005731        return moveToLeftmost(pCur);
005732      }
005733      do{
005734        if( pCur->iPage==0 ){
005735          pCur->eState = CURSOR_INVALID;
005736          return SQLITE_DONE;
005737        }
005738        moveToParent(pCur);
005739        pPage = pCur->pPage;
005740      }while( pCur->ix>=pPage->nCell );
005741      if( pPage->intKey ){
005742        return sqlite3BtreeNext(pCur, 0);
005743      }else{
005744        return SQLITE_OK;
005745      }
005746    }
005747    if( pPage->leaf ){
005748      return SQLITE_OK;
005749    }else{
005750      return moveToLeftmost(pCur);
005751    }
005752  }
005753  int sqlite3BtreeNext(BtCursor *pCur, int flags){
005754    MemPage *pPage;
005755    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
005756    assert( cursorOwnsBtShared(pCur) );
005757    assert( flags==0 || flags==1 );
005758    pCur->info.nSize = 0;
005759    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005760    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
005761    pPage = pCur->pPage;
005762    if( (++pCur->ix)>=pPage->nCell ){
005763      pCur->ix--;
005764      return btreeNext(pCur);
005765    }
005766    if( pPage->leaf ){
005767      return SQLITE_OK;
005768    }else{
005769      return moveToLeftmost(pCur);
005770    }
005771  }
005772  
005773  /*
005774  ** Step the cursor to the back to the previous entry in the database.
005775  ** Return values:
005776  **
005777  **     SQLITE_OK     success
005778  **     SQLITE_DONE   the cursor is already on the first element of the table
005779  **     otherwise     some kind of error occurred
005780  **
005781  ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
005782  ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
005783  ** to the previous cell on the current page.  The (slower) btreePrevious()
005784  ** helper routine is called when it is necessary to move to a different page
005785  ** or to restore the cursor.
005786  **
005787  ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
005788  ** the cursor corresponds to an SQL index and this routine could have been
005789  ** skipped if the SQL index had been a unique index.  The F argument is a
005790  ** hint to the implement.  The native SQLite btree implementation does not
005791  ** use this hint, but COMDB2 does.
005792  */
005793  static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
005794    int rc;
005795    MemPage *pPage;
005796  
005797    assert( cursorOwnsBtShared(pCur) );
005798    assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
005799    assert( pCur->info.nSize==0 );
005800    if( pCur->eState!=CURSOR_VALID ){
005801      rc = restoreCursorPosition(pCur);
005802      if( rc!=SQLITE_OK ){
005803        return rc;
005804      }
005805      if( CURSOR_INVALID==pCur->eState ){
005806        return SQLITE_DONE;
005807      }
005808      if( CURSOR_SKIPNEXT==pCur->eState ){
005809        pCur->eState = CURSOR_VALID;
005810        if( pCur->skipNext<0 ) return SQLITE_OK;
005811      }
005812    }
005813  
005814    pPage = pCur->pPage;
005815    assert( pPage->isInit );
005816    if( !pPage->leaf ){
005817      int idx = pCur->ix;
005818      rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
005819      if( rc ) return rc;
005820      rc = moveToRightmost(pCur);
005821    }else{
005822      while( pCur->ix==0 ){
005823        if( pCur->iPage==0 ){
005824          pCur->eState = CURSOR_INVALID;
005825          return SQLITE_DONE;
005826        }
005827        moveToParent(pCur);
005828      }
005829      assert( pCur->info.nSize==0 );
005830      assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
005831  
005832      pCur->ix--;
005833      pPage = pCur->pPage;
005834      if( pPage->intKey && !pPage->leaf ){
005835        rc = sqlite3BtreePrevious(pCur, 0);
005836      }else{
005837        rc = SQLITE_OK;
005838      }
005839    }
005840    return rc;
005841  }
005842  int sqlite3BtreePrevious(BtCursor *pCur, int flags){
005843    assert( cursorOwnsBtShared(pCur) );
005844    assert( flags==0 || flags==1 );
005845    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
005846    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
005847    pCur->info.nSize = 0;
005848    if( pCur->eState!=CURSOR_VALID
005849     || pCur->ix==0
005850     || pCur->pPage->leaf==0
005851    ){
005852      return btreePrevious(pCur);
005853    }
005854    pCur->ix--;
005855    return SQLITE_OK;
005856  }
005857  
005858  /*
005859  ** Allocate a new page from the database file.
005860  **
005861  ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
005862  ** has already been called on the new page.)  The new page has also
005863  ** been referenced and the calling routine is responsible for calling
005864  ** sqlite3PagerUnref() on the new page when it is done.
005865  **
005866  ** SQLITE_OK is returned on success.  Any other return value indicates
005867  ** an error.  *ppPage is set to NULL in the event of an error.
005868  **
005869  ** If the "nearby" parameter is not 0, then an effort is made to 
005870  ** locate a page close to the page number "nearby".  This can be used in an
005871  ** attempt to keep related pages close to each other in the database file,
005872  ** which in turn can make database access faster.
005873  **
005874  ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
005875  ** anywhere on the free-list, then it is guaranteed to be returned.  If
005876  ** eMode is BTALLOC_LE then the page returned will be less than or equal
005877  ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
005878  ** are no restrictions on which page is returned.
005879  */
005880  static int allocateBtreePage(
005881    BtShared *pBt,         /* The btree */
005882    MemPage **ppPage,      /* Store pointer to the allocated page here */
005883    Pgno *pPgno,           /* Store the page number here */
005884    Pgno nearby,           /* Search for a page near this one */
005885    u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
005886  ){
005887    MemPage *pPage1;
005888    int rc;
005889    u32 n;     /* Number of pages on the freelist */
005890    u32 k;     /* Number of leaves on the trunk of the freelist */
005891    MemPage *pTrunk = 0;
005892    MemPage *pPrevTrunk = 0;
005893    Pgno mxPage;     /* Total size of the database file */
005894  
005895    assert( sqlite3_mutex_held(pBt->mutex) );
005896    assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
005897    pPage1 = pBt->pPage1;
005898    mxPage = btreePagecount(pBt);
005899    /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
005900    ** stores stores the total number of pages on the freelist. */
005901    n = get4byte(&pPage1->aData[36]);
005902    testcase( n==mxPage-1 );
005903    if( n>=mxPage ){
005904      return SQLITE_CORRUPT_BKPT;
005905    }
005906    if( n>0 ){
005907      /* There are pages on the freelist.  Reuse one of those pages. */
005908      Pgno iTrunk;
005909      u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
005910      u32 nSearch = 0;   /* Count of the number of search attempts */
005911      
005912      /* If eMode==BTALLOC_EXACT and a query of the pointer-map
005913      ** shows that the page 'nearby' is somewhere on the free-list, then
005914      ** the entire-list will be searched for that page.
005915      */
005916  #ifndef SQLITE_OMIT_AUTOVACUUM
005917      if( eMode==BTALLOC_EXACT ){
005918        if( nearby<=mxPage ){
005919          u8 eType;
005920          assert( nearby>0 );
005921          assert( pBt->autoVacuum );
005922          rc = ptrmapGet(pBt, nearby, &eType, 0);
005923          if( rc ) return rc;
005924          if( eType==PTRMAP_FREEPAGE ){
005925            searchList = 1;
005926          }
005927        }
005928      }else if( eMode==BTALLOC_LE ){
005929        searchList = 1;
005930      }
005931  #endif
005932  
005933      /* Decrement the free-list count by 1. Set iTrunk to the index of the
005934      ** first free-list trunk page. iPrevTrunk is initially 1.
005935      */
005936      rc = sqlite3PagerWrite(pPage1->pDbPage);
005937      if( rc ) return rc;
005938      put4byte(&pPage1->aData[36], n-1);
005939  
005940      /* The code within this loop is run only once if the 'searchList' variable
005941      ** is not true. Otherwise, it runs once for each trunk-page on the
005942      ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
005943      ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
005944      */
005945      do {
005946        pPrevTrunk = pTrunk;
005947        if( pPrevTrunk ){
005948          /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
005949          ** is the page number of the next freelist trunk page in the list or
005950          ** zero if this is the last freelist trunk page. */
005951          iTrunk = get4byte(&pPrevTrunk->aData[0]);
005952        }else{
005953          /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
005954          ** stores the page number of the first page of the freelist, or zero if
005955          ** the freelist is empty. */
005956          iTrunk = get4byte(&pPage1->aData[32]);
005957        }
005958        testcase( iTrunk==mxPage );
005959        if( iTrunk>mxPage || nSearch++ > n ){
005960          rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
005961        }else{
005962          rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
005963        }
005964        if( rc ){
005965          pTrunk = 0;
005966          goto end_allocate_page;
005967        }
005968        assert( pTrunk!=0 );
005969        assert( pTrunk->aData!=0 );
005970        /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
005971        ** is the number of leaf page pointers to follow. */
005972        k = get4byte(&pTrunk->aData[4]);
005973        if( k==0 && !searchList ){
005974          /* The trunk has no leaves and the list is not being searched. 
005975          ** So extract the trunk page itself and use it as the newly 
005976          ** allocated page */
005977          assert( pPrevTrunk==0 );
005978          rc = sqlite3PagerWrite(pTrunk->pDbPage);
005979          if( rc ){
005980            goto end_allocate_page;
005981          }
005982          *pPgno = iTrunk;
005983          memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
005984          *ppPage = pTrunk;
005985          pTrunk = 0;
005986          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
005987        }else if( k>(u32)(pBt->usableSize/4 - 2) ){
005988          /* Value of k is out of range.  Database corruption */
005989          rc = SQLITE_CORRUPT_PGNO(iTrunk);
005990          goto end_allocate_page;
005991  #ifndef SQLITE_OMIT_AUTOVACUUM
005992        }else if( searchList 
005993              && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 
005994        ){
005995          /* The list is being searched and this trunk page is the page
005996          ** to allocate, regardless of whether it has leaves.
005997          */
005998          *pPgno = iTrunk;
005999          *ppPage = pTrunk;
006000          searchList = 0;
006001          rc = sqlite3PagerWrite(pTrunk->pDbPage);
006002          if( rc ){
006003            goto end_allocate_page;
006004          }
006005          if( k==0 ){
006006            if( !pPrevTrunk ){
006007              memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
006008            }else{
006009              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
006010              if( rc!=SQLITE_OK ){
006011                goto end_allocate_page;
006012              }
006013              memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
006014            }
006015          }else{
006016            /* The trunk page is required by the caller but it contains 
006017            ** pointers to free-list leaves. The first leaf becomes a trunk
006018            ** page in this case.
006019            */
006020            MemPage *pNewTrunk;
006021            Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
006022            if( iNewTrunk>mxPage ){ 
006023              rc = SQLITE_CORRUPT_PGNO(iTrunk);
006024              goto end_allocate_page;
006025            }
006026            testcase( iNewTrunk==mxPage );
006027            rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
006028            if( rc!=SQLITE_OK ){
006029              goto end_allocate_page;
006030            }
006031            rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
006032            if( rc!=SQLITE_OK ){
006033              releasePage(pNewTrunk);
006034              goto end_allocate_page;
006035            }
006036            memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
006037            put4byte(&pNewTrunk->aData[4], k-1);
006038            memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
006039            releasePage(pNewTrunk);
006040            if( !pPrevTrunk ){
006041              assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
006042              put4byte(&pPage1->aData[32], iNewTrunk);
006043            }else{
006044              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
006045              if( rc ){
006046                goto end_allocate_page;
006047              }
006048              put4byte(&pPrevTrunk->aData[0], iNewTrunk);
006049            }
006050          }
006051          pTrunk = 0;
006052          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
006053  #endif
006054        }else if( k>0 ){
006055          /* Extract a leaf from the trunk */
006056          u32 closest;
006057          Pgno iPage;
006058          unsigned char *aData = pTrunk->aData;
006059          if( nearby>0 ){
006060            u32 i;
006061            closest = 0;
006062            if( eMode==BTALLOC_LE ){
006063              for(i=0; i<k; i++){
006064                iPage = get4byte(&aData[8+i*4]);
006065                if( iPage<=nearby ){
006066                  closest = i;
006067                  break;
006068                }
006069              }
006070            }else{
006071              int dist;
006072              dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
006073              for(i=1; i<k; i++){
006074                int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
006075                if( d2<dist ){
006076                  closest = i;
006077                  dist = d2;
006078                }
006079              }
006080            }
006081          }else{
006082            closest = 0;
006083          }
006084  
006085          iPage = get4byte(&aData[8+closest*4]);
006086          testcase( iPage==mxPage );
006087          if( iPage>mxPage ){
006088            rc = SQLITE_CORRUPT_PGNO(iTrunk);
006089            goto end_allocate_page;
006090          }
006091          testcase( iPage==mxPage );
006092          if( !searchList 
006093           || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 
006094          ){
006095            int noContent;
006096            *pPgno = iPage;
006097            TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
006098                   ": %d more free pages\n",
006099                   *pPgno, closest+1, k, pTrunk->pgno, n-1));
006100            rc = sqlite3PagerWrite(pTrunk->pDbPage);
006101            if( rc ) goto end_allocate_page;
006102            if( closest<k-1 ){
006103              memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
006104            }
006105            put4byte(&aData[4], k-1);
006106            noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
006107            rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
006108            if( rc==SQLITE_OK ){
006109              rc = sqlite3PagerWrite((*ppPage)->pDbPage);
006110              if( rc!=SQLITE_OK ){
006111                releasePage(*ppPage);
006112                *ppPage = 0;
006113              }
006114            }
006115            searchList = 0;
006116          }
006117        }
006118        releasePage(pPrevTrunk);
006119        pPrevTrunk = 0;
006120      }while( searchList );
006121    }else{
006122      /* There are no pages on the freelist, so append a new page to the
006123      ** database image.
006124      **
006125      ** Normally, new pages allocated by this block can be requested from the
006126      ** pager layer with the 'no-content' flag set. This prevents the pager
006127      ** from trying to read the pages content from disk. However, if the
006128      ** current transaction has already run one or more incremental-vacuum
006129      ** steps, then the page we are about to allocate may contain content
006130      ** that is required in the event of a rollback. In this case, do
006131      ** not set the no-content flag. This causes the pager to load and journal
006132      ** the current page content before overwriting it.
006133      **
006134      ** Note that the pager will not actually attempt to load or journal 
006135      ** content for any page that really does lie past the end of the database
006136      ** file on disk. So the effects of disabling the no-content optimization
006137      ** here are confined to those pages that lie between the end of the
006138      ** database image and the end of the database file.
006139      */
006140      int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
006141  
006142      rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
006143      if( rc ) return rc;
006144      pBt->nPage++;
006145      if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
006146  
006147  #ifndef SQLITE_OMIT_AUTOVACUUM
006148      if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
006149        /* If *pPgno refers to a pointer-map page, allocate two new pages
006150        ** at the end of the file instead of one. The first allocated page
006151        ** becomes a new pointer-map page, the second is used by the caller.
006152        */
006153        MemPage *pPg = 0;
006154        TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
006155        assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
006156        rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
006157        if( rc==SQLITE_OK ){
006158          rc = sqlite3PagerWrite(pPg->pDbPage);
006159          releasePage(pPg);
006160        }
006161        if( rc ) return rc;
006162        pBt->nPage++;
006163        if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
006164      }
006165  #endif
006166      put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
006167      *pPgno = pBt->nPage;
006168  
006169      assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
006170      rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
006171      if( rc ) return rc;
006172      rc = sqlite3PagerWrite((*ppPage)->pDbPage);
006173      if( rc!=SQLITE_OK ){
006174        releasePage(*ppPage);
006175        *ppPage = 0;
006176      }
006177      TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
006178    }
006179  
006180    assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
006181  
006182  end_allocate_page:
006183    releasePage(pTrunk);
006184    releasePage(pPrevTrunk);
006185    assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
006186    assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
006187    return rc;
006188  }
006189  
006190  /*
006191  ** This function is used to add page iPage to the database file free-list. 
006192  ** It is assumed that the page is not already a part of the free-list.
006193  **
006194  ** The value passed as the second argument to this function is optional.
006195  ** If the caller happens to have a pointer to the MemPage object 
006196  ** corresponding to page iPage handy, it may pass it as the second value. 
006197  ** Otherwise, it may pass NULL.
006198  **
006199  ** If a pointer to a MemPage object is passed as the second argument,
006200  ** its reference count is not altered by this function.
006201  */
006202  static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
006203    MemPage *pTrunk = 0;                /* Free-list trunk page */
006204    Pgno iTrunk = 0;                    /* Page number of free-list trunk page */ 
006205    MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
006206    MemPage *pPage;                     /* Page being freed. May be NULL. */
006207    int rc;                             /* Return Code */
006208    u32 nFree;                          /* Initial number of pages on free-list */
006209  
006210    assert( sqlite3_mutex_held(pBt->mutex) );
006211    assert( CORRUPT_DB || iPage>1 );
006212    assert( !pMemPage || pMemPage->pgno==iPage );
006213  
006214    if( iPage<2 || iPage>pBt->nPage ){
006215      return SQLITE_CORRUPT_BKPT;
006216    }
006217    if( pMemPage ){
006218      pPage = pMemPage;
006219      sqlite3PagerRef(pPage->pDbPage);
006220    }else{
006221      pPage = btreePageLookup(pBt, iPage);
006222    }
006223  
006224    /* Increment the free page count on pPage1 */
006225    rc = sqlite3PagerWrite(pPage1->pDbPage);
006226    if( rc ) goto freepage_out;
006227    nFree = get4byte(&pPage1->aData[36]);
006228    put4byte(&pPage1->aData[36], nFree+1);
006229  
006230    if( pBt->btsFlags & BTS_SECURE_DELETE ){
006231      /* If the secure_delete option is enabled, then
006232      ** always fully overwrite deleted information with zeros.
006233      */
006234      if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
006235       ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
006236      ){
006237        goto freepage_out;
006238      }
006239      memset(pPage->aData, 0, pPage->pBt->pageSize);
006240    }
006241  
006242    /* If the database supports auto-vacuum, write an entry in the pointer-map
006243    ** to indicate that the page is free.
006244    */
006245    if( ISAUTOVACUUM ){
006246      ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
006247      if( rc ) goto freepage_out;
006248    }
006249  
006250    /* Now manipulate the actual database free-list structure. There are two
006251    ** possibilities. If the free-list is currently empty, or if the first
006252    ** trunk page in the free-list is full, then this page will become a
006253    ** new free-list trunk page. Otherwise, it will become a leaf of the
006254    ** first trunk page in the current free-list. This block tests if it
006255    ** is possible to add the page as a new free-list leaf.
006256    */
006257    if( nFree!=0 ){
006258      u32 nLeaf;                /* Initial number of leaf cells on trunk page */
006259  
006260      iTrunk = get4byte(&pPage1->aData[32]);
006261      rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
006262      if( rc!=SQLITE_OK ){
006263        goto freepage_out;
006264      }
006265  
006266      nLeaf = get4byte(&pTrunk->aData[4]);
006267      assert( pBt->usableSize>32 );
006268      if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
006269        rc = SQLITE_CORRUPT_BKPT;
006270        goto freepage_out;
006271      }
006272      if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
006273        /* In this case there is room on the trunk page to insert the page
006274        ** being freed as a new leaf.
006275        **
006276        ** Note that the trunk page is not really full until it contains
006277        ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
006278        ** coded.  But due to a coding error in versions of SQLite prior to
006279        ** 3.6.0, databases with freelist trunk pages holding more than
006280        ** usableSize/4 - 8 entries will be reported as corrupt.  In order
006281        ** to maintain backwards compatibility with older versions of SQLite,
006282        ** we will continue to restrict the number of entries to usableSize/4 - 8
006283        ** for now.  At some point in the future (once everyone has upgraded
006284        ** to 3.6.0 or later) we should consider fixing the conditional above
006285        ** to read "usableSize/4-2" instead of "usableSize/4-8".
006286        **
006287        ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
006288        ** avoid using the last six entries in the freelist trunk page array in
006289        ** order that database files created by newer versions of SQLite can be
006290        ** read by older versions of SQLite.
006291        */
006292        rc = sqlite3PagerWrite(pTrunk->pDbPage);
006293        if( rc==SQLITE_OK ){
006294          put4byte(&pTrunk->aData[4], nLeaf+1);
006295          put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
006296          if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
006297            sqlite3PagerDontWrite(pPage->pDbPage);
006298          }
006299          rc = btreeSetHasContent(pBt, iPage);
006300        }
006301        TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
006302        goto freepage_out;
006303      }
006304    }
006305  
006306    /* If control flows to this point, then it was not possible to add the
006307    ** the page being freed as a leaf page of the first trunk in the free-list.
006308    ** Possibly because the free-list is empty, or possibly because the 
006309    ** first trunk in the free-list is full. Either way, the page being freed
006310    ** will become the new first trunk page in the free-list.
006311    */
006312    if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
006313      goto freepage_out;
006314    }
006315    rc = sqlite3PagerWrite(pPage->pDbPage);
006316    if( rc!=SQLITE_OK ){
006317      goto freepage_out;
006318    }
006319    put4byte(pPage->aData, iTrunk);
006320    put4byte(&pPage->aData[4], 0);
006321    put4byte(&pPage1->aData[32], iPage);
006322    TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
006323  
006324  freepage_out:
006325    if( pPage ){
006326      pPage->isInit = 0;
006327    }
006328    releasePage(pPage);
006329    releasePage(pTrunk);
006330    return rc;
006331  }
006332  static void freePage(MemPage *pPage, int *pRC){
006333    if( (*pRC)==SQLITE_OK ){
006334      *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
006335    }
006336  }
006337  
006338  /*
006339  ** Free any overflow pages associated with the given Cell.  Store
006340  ** size information about the cell in pInfo.
006341  */
006342  static int clearCell(
006343    MemPage *pPage,          /* The page that contains the Cell */
006344    unsigned char *pCell,    /* First byte of the Cell */
006345    CellInfo *pInfo          /* Size information about the cell */
006346  ){
006347    BtShared *pBt;
006348    Pgno ovflPgno;
006349    int rc;
006350    int nOvfl;
006351    u32 ovflPageSize;
006352  
006353    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006354    pPage->xParseCell(pPage, pCell, pInfo);
006355    if( pInfo->nLocal==pInfo->nPayload ){
006356      return SQLITE_OK;  /* No overflow pages. Return without doing anything */
006357    }
006358    testcase( pCell + pInfo->nSize == pPage->aDataEnd );
006359    testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
006360    if( pCell + pInfo->nSize > pPage->aDataEnd ){
006361      /* Cell extends past end of page */
006362      return SQLITE_CORRUPT_PAGE(pPage);
006363    }
006364    ovflPgno = get4byte(pCell + pInfo->nSize - 4);
006365    pBt = pPage->pBt;
006366    assert( pBt->usableSize > 4 );
006367    ovflPageSize = pBt->usableSize - 4;
006368    nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
006369    assert( nOvfl>0 || 
006370      (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
006371    );
006372    while( nOvfl-- ){
006373      Pgno iNext = 0;
006374      MemPage *pOvfl = 0;
006375      if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
006376        /* 0 is not a legal page number and page 1 cannot be an 
006377        ** overflow page. Therefore if ovflPgno<2 or past the end of the 
006378        ** file the database must be corrupt. */
006379        return SQLITE_CORRUPT_BKPT;
006380      }
006381      if( nOvfl ){
006382        rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
006383        if( rc ) return rc;
006384      }
006385  
006386      if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
006387       && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
006388      ){
006389        /* There is no reason any cursor should have an outstanding reference 
006390        ** to an overflow page belonging to a cell that is being deleted/updated.
006391        ** So if there exists more than one reference to this page, then it 
006392        ** must not really be an overflow page and the database must be corrupt. 
006393        ** It is helpful to detect this before calling freePage2(), as 
006394        ** freePage2() may zero the page contents if secure-delete mode is
006395        ** enabled. If this 'overflow' page happens to be a page that the
006396        ** caller is iterating through or using in some other way, this
006397        ** can be problematic.
006398        */
006399        rc = SQLITE_CORRUPT_BKPT;
006400      }else{
006401        rc = freePage2(pBt, pOvfl, ovflPgno);
006402      }
006403  
006404      if( pOvfl ){
006405        sqlite3PagerUnref(pOvfl->pDbPage);
006406      }
006407      if( rc ) return rc;
006408      ovflPgno = iNext;
006409    }
006410    return SQLITE_OK;
006411  }
006412  
006413  /*
006414  ** Create the byte sequence used to represent a cell on page pPage
006415  ** and write that byte sequence into pCell[].  Overflow pages are
006416  ** allocated and filled in as necessary.  The calling procedure
006417  ** is responsible for making sure sufficient space has been allocated
006418  ** for pCell[].
006419  **
** Note that pCell does not necessarily need to point to the pPage->aData
006421  ** area.  pCell might point to some temporary storage.  The cell will
006422  ** be constructed in this temporary area then copied into pPage->aData
006423  ** later.
006424  */
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes still to be written */
  const u8 *pSrc;                /* Next byte of source content to copy */
  int nSrc, n, rc, mn;           /* Source bytes left, chunk size, result
                                 ** code, and pPage->minLocal respectively */
  int spaceLeft;                 /* Bytes still available at pPayload */
  MemPage *pToRelease;           /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where the next overflow pgno is written */
  unsigned char *pPayload;       /* Where the next payload byte is written */
  BtShared *pBt;                 /* Shared btree that owns pPage */
  Pgno pgnoOvfl;                 /* Most recently allocated overflow page */
  int nHeader;                   /* Bytes of cell header written so far */

  /* Returns SQLITE_OK on success.  If allocating an overflow page or
  ** writing its pointer-map entry fails, that error code is returned
  ** after releasing the overflow page currently held. */
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  The first childPtrSize bytes of the cell are
  ** skipped here; this routine does not write them. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    /* Integer-key page: payload is nData bytes of content followed by
    ** nZero bytes of zeros.  The header is the payload size and the key. */
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    /* Otherwise the key bytes themselves are the payload and the header
    ** holds only the payload size. */
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }
  
  /* Fill in the payload */
  pPayload = &pCell[nHeader];
  if( nPayload<=pPage->maxLocal ){
    /* This is the common case where everything fits on the btree page
    ** and no overflow pages are required. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;             /* Reported cell size is never less than 4 */
    *pnSize = n;
    assert( nSrc<=nPayload );
    testcase( nSrc<nPayload );
    memcpy(pPayload, pSrc, nSrc);
    memset(pPayload+nSrc, 0, nPayload-nSrc);  /* Zero-fill the remainder */
    return SQLITE_OK;
  }

  /* If we reach this point, it means that some of the content will need
  ** to spill onto overflow pages.
  **
  ** The local portion is minLocal bytes plus whatever is left over after
  ** the rest is divided into chunks of usableSize-4 bytes (the payload
  ** capacity given to each overflow page below).  If that would exceed
  ** maxLocal, only minLocal bytes are kept locally.
  */
  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  testcase( n==pPage->maxLocal );
  testcase( n==pPage->maxLocal+1 );
  if( n > pPage->maxLocal ) n = mn;
  spaceLeft = n;
  *pnSize = n + nHeader + 4;     /* +4 for the first overflow page number */
  pPrior = &pCell[nHeader+n];
  pToRelease = 0;
  pgnoOvfl = 0;
  pBt = pPage->pBt;

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to btreeParseCellPtr() to verify that the values above
  ** were computed correctly.
  */
#ifdef SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    /* n is the number of bytes to emit on this pass: no more than the
    ** remaining payload and no more than the space left at pPayload. */
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    if( nSrc>=n ){
      /* Enough source content remains to fill the whole chunk */
      memcpy(pPayload, pSrc, n);
    }else if( nSrc>0 ){
      /* Source content runs out part way through the chunk.  Copy just
      ** what is left; later passes take the memset() branch below. */
      n = nSrc;
      memcpy(pPayload, pSrc, n);
    }else{
      /* Source content is exhausted.  The rest of the payload is zeros. */
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    if( nPayload<=0 ) break;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( spaceLeft==0 ){
      /* The current buffer is full but payload remains, so allocate
      ** another overflow page and link it in at pPrior. */
      MemPage *pOvfl = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance pgnoOvfl to the next page number that is neither a
        ** pointer-map page nor the pending-byte page.  The result is
        ** passed to allocateBtreePage() below as its 4th argument. */
        do{
          pgnoOvfl++;
        } while( 
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now. 
      **
      ** If this is the first overflow page, then write a partial entry 
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        /* pgnoPtrmap is zero only for the first overflow page of the cell */
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then switch all write pointers
      ** over to it.  Its own next-page pointer starts out as zero and its
      ** payload area begins at byte 4. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
006606  
006607  /*
** Remove the i-th cell from pPage.  This routine affects pPage only.
006609  ** The cell content is not freed or deallocated.  It is assumed that
006610  ** the cell content has been copied someplace else.  This routine just
006611  ** removes the reference to the cell from pPage.
006612  **
006613  ** "sz" must be the number of bytes in the cell.
006614  */
006615  static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
006616    u32 pc;         /* Offset to cell content of cell being deleted */
006617    u8 *data;       /* pPage->aData */
006618    u8 *ptr;        /* Used to move bytes around within data[] */
006619    int rc;         /* The return code */
006620    int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
006621  
006622    if( *pRC ) return;
006623    assert( idx>=0 && idx<pPage->nCell );
006624    assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
006625    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006626    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006627    assert( pPage->nFree>=0 );
006628    data = pPage->aData;
006629    ptr = &pPage->aCellIdx[2*idx];
006630    pc = get2byte(ptr);
006631    hdr = pPage->hdrOffset;
006632    testcase( pc==get2byte(&data[hdr+5]) );
006633    testcase( pc+sz==pPage->pBt->usableSize );
006634    if( pc+sz > pPage->pBt->usableSize ){
006635      *pRC = SQLITE_CORRUPT_BKPT;
006636      return;
006637    }
006638    rc = freeSpace(pPage, pc, sz);
006639    if( rc ){
006640      *pRC = rc;
006641      return;
006642    }
006643    pPage->nCell--;
006644    if( pPage->nCell==0 ){
006645      memset(&data[hdr+1], 0, 4);
006646      data[hdr+7] = 0;
006647      put2byte(&data[hdr+5], pPage->pBt->usableSize);
006648      pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
006649                         - pPage->childPtrSize - 8;
006650    }else{
006651      memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
006652      put2byte(&data[hdr+3], pPage->nCell);
006653      pPage->nFree += 2;
006654    }
006655  }
006656  
006657  /*
006658  ** Insert a new cell on pPage at cell index "i".  pCell points to the
006659  ** content of the cell.
006660  **
006661  ** If the cell content will fit on the page, then put it there.  If it
006662  ** will not fit, then make a copy of the cell content into pTemp if
006663  ** pTemp is not null.  Regardless of pTemp, allocate a new entry
006664  ** in pPage->apOvfl[] and make it point to the cell content (either
006665  ** in pTemp or the original pCell) and also record its index. 
** Allocating a new entry in pPage->apOvfl[] implies that 
006667  ** pPage->nOverflow is incremented.
006668  **
006669  ** *pRC must be SQLITE_OK when this routine is called.
006670  */
006671  static void insertCell(
006672    MemPage *pPage,   /* Page into which we are copying */
006673    int i,            /* New cell becomes the i-th cell of the page */
006674    u8 *pCell,        /* Content of the new cell */
006675    int sz,           /* Bytes of content in pCell */
006676    u8 *pTemp,        /* Temp storage space for pCell, if needed */
006677    Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
006678    int *pRC          /* Read and write return code from here */
006679  ){
006680    int idx = 0;      /* Where to write new cell content in data[] */
006681    int j;            /* Loop counter */
006682    u8 *data;         /* The content of the whole page */
006683    u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
006684  
006685    assert( *pRC==SQLITE_OK );
006686    assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
006687    assert( MX_CELL(pPage->pBt)<=10921 );
006688    assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
006689    assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
006690    assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
006691    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006692    assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
006693    assert( pPage->nFree>=0 );
006694    if( pPage->nOverflow || sz+2>pPage->nFree ){
006695      if( pTemp ){
006696        memcpy(pTemp, pCell, sz);
006697        pCell = pTemp;
006698      }
006699      if( iChild ){
006700        put4byte(pCell, iChild);
006701      }
006702      j = pPage->nOverflow++;
006703      /* Comparison against ArraySize-1 since we hold back one extra slot
006704      ** as a contingency.  In other words, never need more than 3 overflow
006705      ** slots but 4 are allocated, just to be safe. */
006706      assert( j < ArraySize(pPage->apOvfl)-1 );
006707      pPage->apOvfl[j] = pCell;
006708      pPage->aiOvfl[j] = (u16)i;
006709  
006710      /* When multiple overflows occur, they are always sequential and in
006711      ** sorted order.  This invariants arise because multiple overflows can
006712      ** only occur when inserting divider cells into the parent page during
006713      ** balancing, and the dividers are adjacent and sorted.
006714      */
006715      assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
006716      assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
006717    }else{
006718      int rc = sqlite3PagerWrite(pPage->pDbPage);
006719      if( rc!=SQLITE_OK ){
006720        *pRC = rc;
006721        return;
006722      }
006723      assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006724      data = pPage->aData;
006725      assert( &data[pPage->cellOffset]==pPage->aCellIdx );
006726      rc = allocateSpace(pPage, sz, &idx);
006727      if( rc ){ *pRC = rc; return; }
006728      /* The allocateSpace() routine guarantees the following properties
006729      ** if it returns successfully */
006730      assert( idx >= 0 );
006731      assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
006732      assert( idx+sz <= (int)pPage->pBt->usableSize );
006733      pPage->nFree -= (u16)(2 + sz);
006734      if( iChild ){
006735        /* In a corrupt database where an entry in the cell index section of
006736        ** a btree page has a value of 3 or less, the pCell value might point
006737        ** as many as 4 bytes in front of the start of the aData buffer for
006738        ** the source page.  Make sure this does not cause problems by not
006739        ** reading the first 4 bytes */
006740        memcpy(&data[idx+4], pCell+4, sz-4);
006741        put4byte(&data[idx], iChild);
006742      }else{
006743        memcpy(&data[idx], pCell, sz);
006744      }
006745      pIns = pPage->aCellIdx + i*2;
006746      memmove(pIns+2, pIns, 2*(pPage->nCell - i));
006747      put2byte(pIns, idx);
006748      pPage->nCell++;
006749      /* increment the cell count */
006750      if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
006751      assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
006752  #ifndef SQLITE_OMIT_AUTOVACUUM
006753      if( pPage->pBt->autoVacuum ){
006754        /* The cell may contain a pointer to an overflow page. If so, write
006755        ** the entry for the overflow page into the pointer map.
006756        */
006757        ptrmapPutOvflPtr(pPage, pPage, pCell, pRC);
006758      }
006759  #endif
006760    }
006761  }
006762  
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */
006782  
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to the content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointers to the end of page
** for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx values are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.  The end result is that each ixNx[] entry should
** be larger than the previous one.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[] */
  u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
  int ixNx[NB*2];         /* Index at which we move to the next apEnd[] */
};
006855  
006856  /*
006857  ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
006858  ** computed.
006859  */
006860  static void populateCellCache(CellArray *p, int idx, int N){
006861    assert( idx>=0 && idx+N<=p->nCell );
006862    while( N>0 ){
006863      assert( p->apCell[idx]!=0 );
006864      if( p->szCell[idx]==0 ){
006865        p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
006866      }else{
006867        assert( CORRUPT_DB ||
006868                p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
006869      }
006870      idx++;
006871      N--;
006872    }
006873  }
006874  
006875  /*
006876  ** Return the size of the Nth element of the cell array
006877  */
006878  static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
006879    assert( N>=0 && N<p->nCell );
006880    assert( p->szCell[N]==0 );
006881    p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
006882    return p->szCell[N];
006883  }
006884  static u16 cachedCellSize(CellArray *p, int N){
006885    assert( N>=0 && N<p->nCell );
006886    if( p->szCell[N] ) return p->szCell[N];
006887    return computeCellSize(p, N);
006888  }
006889  
006890  /*
006891  ** Array apCell[] contains pointers to nCell b-tree page cells. The 
006892  ** szCell[] array contains the size in bytes of each cell. This function
006893  ** replaces the current contents of page pPg with the contents of the cell
006894  ** array.
006895  **
006896  ** Some of the cells in apCell[] may currently be stored in pPg. This
006897  ** function works around problems caused by this by making a copy of any 
006898  ** such cells before overwriting the page data.
006899  **
006900  ** The MemPage.nFree field is invalidated by this function. It is the 
006901  ** responsibility of the caller to set it correctly.
006902  */
006903  static int rebuildPage(
006904    CellArray *pCArray,             /* Content to be added to page pPg */
006905    int iFirst,                     /* First cell in pCArray to use */
006906    int nCell,                      /* Final number of cells on page */
006907    MemPage *pPg                    /* The page to be reconstructed */
006908  ){
006909    const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
006910    u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
006911    const int usableSize = pPg->pBt->usableSize;
006912    u8 * const pEnd = &aData[usableSize];
006913    int i = iFirst;                 /* Which cell to copy from pCArray*/
006914    u32 j;                          /* Start of cell content area */
006915    int iEnd = i+nCell;             /* Loop terminator */
006916    u8 *pCellptr = pPg->aCellIdx;
006917    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
006918    u8 *pData;
006919    int k;                          /* Current slot in pCArray->apEnd[] */
006920    u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */
006921  
006922    assert( i<iEnd );
006923    j = get2byte(&aData[hdr+5]);
006924    if( j>(u32)usableSize ){ j = 0; }
006925    memcpy(&pTmp[j], &aData[j], usableSize - j);
006926  
006927    for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
006928    pSrcEnd = pCArray->apEnd[k];
006929  
006930    pData = pEnd;
006931    while( 1/*exit by break*/ ){
006932      u8 *pCell = pCArray->apCell[i];
006933      u16 sz = pCArray->szCell[i];
006934      assert( sz>0 );
006935      if( SQLITE_WITHIN(pCell,aData,pEnd) ){
006936        if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
006937        pCell = &pTmp[pCell - aData];
006938      }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
006939             && (uptr)(pCell)<(uptr)pSrcEnd
006940      ){
006941        return SQLITE_CORRUPT_BKPT;
006942      }
006943  
006944      pData -= sz;
006945      put2byte(pCellptr, (pData - aData));
006946      pCellptr += 2;
006947      if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
006948      memcpy(pData, pCell, sz);
006949      assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
006950      testcase( sz!=pPg->xCellSize(pPg,pCell) );
006951      i++;
006952      if( i>=iEnd ) break;
006953      if( pCArray->ixNx[k]<=i ){
006954        k++;
006955        pSrcEnd = pCArray->apEnd[k];
006956      }
006957    }
006958  
006959    /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
006960    pPg->nCell = nCell;
006961    pPg->nOverflow = 0;
006962  
006963    put2byte(&aData[hdr+1], 0);
006964    put2byte(&aData[hdr+3], pPg->nCell);
006965    put2byte(&aData[hdr+5], pData - aData);
006966    aData[hdr+7] = 0x00;
006967    return SQLITE_OK;
006968  }
006969  
006970  /*
006971  ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
006972  ** This function attempts to add the cells stored in the array to page pPg.
006973  ** If it cannot (because the page needs to be defragmented before the cells
006974  ** will fit), non-zero is returned. Otherwise, if the cells are added
006975  ** successfully, zero is returned.
006976  **
006977  ** Argument pCellptr points to the first entry in the cell-pointer array
006978  ** (part of page pPg) to populate. After cell apCell[0] is written to the
006979  ** page body, a 16-bit offset is written to pCellptr. And so on, for each
006980  ** cell in the array. It is the responsibility of the caller to ensure
006981  ** that it is safe to overwrite this part of the cell-pointer array.
006982  **
006983  ** When this function is called, *ppData points to the start of the 
006984  ** content area on page pPg. If the size of the content area is extended,
006985  ** *ppData is updated to point to the new start of the content area
006986  ** before returning.
006987  **
006988  ** Finally, argument pBegin points to the byte immediately following the
006989  ** end of the space required by this page for the cell-pointer area (for
006990  ** all cells - not just those inserted by the current call). If the content
006991  ** area must be extended to before this point in order to accomodate all
006992  ** cells in apCell[], then the cells do not fit and non-zero is returned.
006993  */
006994  static int pageInsertArray(
006995    MemPage *pPg,                   /* Page to add cells to */
006996    u8 *pBegin,                     /* End of cell-pointer array */
006997    u8 **ppData,                    /* IN/OUT: Page content-area pointer */
006998    u8 *pCellptr,                   /* Pointer to cell-pointer area */
006999    int iFirst,                     /* Index of first cell to add */
007000    int nCell,                      /* Number of cells to add to pPg */
007001    CellArray *pCArray              /* Array of cells */
007002  ){
007003    int i = iFirst;                 /* Loop counter - cell index to insert */
007004    u8 *aData = pPg->aData;         /* Complete page */
007005    u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
007006    int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
007007    int k;                          /* Current slot in pCArray->apEnd[] */
007008    u8 *pEnd;                       /* Maximum extent of cell data */
007009    assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
007010    if( iEnd<=iFirst ) return 0;
007011    for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
007012    pEnd = pCArray->apEnd[k];
007013    while( 1 /*Exit by break*/ ){
007014      int sz, rc;
007015      u8 *pSlot;
007016      assert( pCArray->szCell[i]!=0 );
007017      sz = pCArray->szCell[i];
007018      if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
007019        if( (pData - pBegin)<sz ) return 1;
007020        pData -= sz;
007021        pSlot = pData;
007022      }
007023      /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
007024      ** database.  But they might for a corrupt database.  Hence use memmove()
007025      ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
007026      assert( (pSlot+sz)<=pCArray->apCell[i]
007027           || pSlot>=(pCArray->apCell[i]+sz)
007028           || CORRUPT_DB );
007029      if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
007030       && (uptr)(pCArray->apCell[i])<(uptr)pEnd
007031      ){
007032        assert( CORRUPT_DB );
007033        (void)SQLITE_CORRUPT_BKPT;
007034        return 1;
007035      }
007036      memmove(pSlot, pCArray->apCell[i], sz);
007037      put2byte(pCellptr, (pSlot - aData));
007038      pCellptr += 2;
007039      i++;
007040      if( i>=iEnd ) break;
007041      if( pCArray->ixNx[k]<=i ){
007042        k++;
007043        pEnd = pCArray->apEnd[k];
007044      }
007045    }
007046    *ppData = pData;
007047    return 0;
007048  }
007049  
007050  /*
007051  ** The pCArray object contains pointers to b-tree cells and their sizes.
007052  **
007053  ** This function adds the space associated with each cell in the array
007054  ** that is currently stored within the body of pPg to the pPg free-list.
007055  ** The cell-pointers and other fields of the page are not updated.
007056  **
007057  ** This function returns the total number of cells added to the free-list.
007058  */
007059  static int pageFreeArray(
007060    MemPage *pPg,                   /* Page to edit */
007061    int iFirst,                     /* First cell to delete */
007062    int nCell,                      /* Cells to delete */
007063    CellArray *pCArray              /* Array of cells */
007064  ){
007065    u8 * const aData = pPg->aData;
007066    u8 * const pEnd = &aData[pPg->pBt->usableSize];
007067    u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
007068    int nRet = 0;
007069    int i;
007070    int iEnd = iFirst + nCell;
007071    u8 *pFree = 0;
007072    int szFree = 0;
007073  
007074    for(i=iFirst; i<iEnd; i++){
007075      u8 *pCell = pCArray->apCell[i];
007076      if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
007077        int sz;
007078        /* No need to use cachedCellSize() here.  The sizes of all cells that
007079        ** are to be freed have already been computing while deciding which
007080        ** cells need freeing */
007081        sz = pCArray->szCell[i];  assert( sz>0 );
007082        if( pFree!=(pCell + sz) ){
007083          if( pFree ){
007084            assert( pFree>aData && (pFree - aData)<65536 );
007085            freeSpace(pPg, (u16)(pFree - aData), szFree);
007086          }
007087          pFree = pCell;
007088          szFree = sz;
007089          if( pFree+sz>pEnd ) return 0;
007090        }else{
007091          pFree = pCell;
007092          szFree += sz;
007093        }
007094        nRet++;
007095      }
007096    }
007097    if( pFree ){
007098      assert( pFree>aData && (pFree - aData)<65536 );
007099      freeSpace(pPg, (u16)(pFree - aData), szFree);
007100    }
007101    return nRet;
007102  }
007103  
/*
** pCArray contains pointers to and sizes of all cells in the page being
** balanced.  The current page, pPg, has pPg->nCell cells starting with
** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
** starting at apCell[iNew].
**
** This routine makes the necessary adjustments to pPg so that it contains
** the correct cells after being balanced.  It works in place: cells that
** are no longer wanted at the start and end of the page are released with
** pageFreeArray(), and missing cells are added with pageInsertArray().  If
** any step cannot be completed in place, the page is rebuilt from scratch
** using rebuildPage().
**
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
**
** Return SQLITE_OK on success, SQLITE_CORRUPT_BKPT if more cells were
** freed from the start of the page than the page holds, or whatever
** rebuildPage() returns if the fallback path is taken.
*/
static int editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  CellArray *pCArray              /* Array of cells and sizes */
){
  u8 * const aData = pPg->aData;
  const int hdr = pPg->hdrOffset;
  u8 *pBegin = &pPg->aCellIdx[nNew * 2];  /* End of final cell-pointer array */
  int nCell = pPg->nCell;       /* Cells stored on pPg */
  u8 *pData;                    /* Start of the cell content area */
  u8 *pCellptr;                 /* Where the next cell pointers are written */
  int i;
  int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;  /* Old end, incl. ovfl */
  int iNewEnd = iNew + nNew;    /* One past the last cell wanted on the page */

#ifdef SQLITE_DEBUG
  /* Keep a copy of the original page image for the consistency check
  ** at the end of this routine. */
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  memcpy(pTmp, aData, pPg->pBt->usableSize);
#endif

  /* Remove cells from the start and end of the page */
  assert( nCell>=0 );
  if( iOld<iNew ){
    int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
    if( nShift>nCell ) return SQLITE_CORRUPT_BKPT;
    /* Slide the remaining cell pointers down to the start of the array */
    memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
    nCell -= nShift;
  }
  if( iNewEnd < iOldEnd ){
    int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
    assert( nCell>=nTail );
    nCell -= nTail;
  }

  /* If the content area already begins before the end of the final
  ** cell-pointer array, the page cannot be edited in place. */
  pData = &aData[get2byteNotZero(&aData[hdr+5])];
  if( pData<pBegin ) goto editpage_fail;

  /* Add cells to the start of the page */
  if( iNew<iOld ){
    int nAdd = MIN(nNew,iOld-iNew);
    assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
    assert( nAdd>=0 );
    pCellptr = pPg->aCellIdx;
    /* Open a gap of nAdd entries at the front of the cell-pointer array */
    memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
    if( pageInsertArray(
          pPg, pBegin, &pData, pCellptr,
          iNew, nAdd, pCArray
    ) ) goto editpage_fail;
    nCell += nAdd;
  }

  /* Add any overflow cells */
  for(i=0; i<pPg->nOverflow; i++){
    /* Position of this overflow cell relative to the new first cell */
    int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
    if( iCell>=0 && iCell<nNew ){
      pCellptr = &pPg->aCellIdx[iCell * 2];
      if( nCell>iCell ){
        /* Make room for one cell pointer at position iCell */
        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
      }
      nCell++;
      /* pageInsertArray() requires the cell size to be cached already */
      cachedCellSize(pCArray, iCell+iNew);
      if( pageInsertArray(
            pPg, pBegin, &pData, pCellptr,
            iCell+iNew, 1, pCArray
      ) ) goto editpage_fail;
    }
  }

  /* Append cells to the end of the page */
  assert( nCell>=0 );
  pCellptr = &pPg->aCellIdx[nCell*2];
  if( pageInsertArray(
        pPg, pBegin, &pData, pCellptr,
        iNew+nCell, nNew-nCell, pCArray
  ) ) goto editpage_fail;

  pPg->nCell = nNew;
  pPg->nOverflow = 0;

  /* Record the new cell count and content-area offset in the page header */
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);

#ifdef SQLITE_DEBUG
  /* Verify that each cell now on the page matches the corresponding cell
  ** of pCArray.  Cells that used to live on this page are compared against
  ** the copy of the original page image saved in pTmp. */
  for(i=0; i<nNew && !CORRUPT_DB; i++){
    u8 *pCell = pCArray->apCell[i+iNew];
    int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
    if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff],
            pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
  }
#endif

  return SQLITE_OK;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  populateCellCache(pCArray, iNew, nNew);
  return rebuildPage(pCArray, iNew, nNew, pPg);
}
007218  
007219  
007220  #ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
**
** Instead of trying to balance the 3 right-most leaf pages, just add
** a new page to the right-hand side and put the one new entry in
** that page.  This leaves the right side of the tree somewhat
** unbalanced.  But odds are that we will be inserting new entries
** at the end soon afterwards so the nearly empty page will quickly
** fill up.  On average.
**
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent.  pPage must have a single overflow entry
** which is also the right-most entry on the page.
**
** The pSpace buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the pSpace buffer must be at
** least 13 bytes in size.
**
** Return SQLITE_OK on success.  Return SQLITE_CORRUPT_BKPT if pPage
** holds no cells.  Otherwise return the error code produced by page
** allocation, rebuildPage(), the pointer-map updates, or insertCell().
*/
static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
  BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
  MemPage *pNew;                       /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  assert( pPage->nOverflow==1 );

  if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
  assert( pPage->nFree>=0 );
  assert( pParent->nFree>=0 );

  /* Allocate a new page. This page will become the right-sibling of
  ** pPage. Make the parent page writable, so that the new divider cell
  ** may be inserted. If both these operations are successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

  if( rc==SQLITE_OK ){

    u8 *pOut = &pSpace[4];          /* Write cursor for the divider key */
    u8 *pCell = pPage->apOvfl[0];   /* The single overflow cell of pPage */
    u16 szCell = pPage->xCellSize(pPage, pCell);
    u8 *pStop;                      /* Upper bound for the varint scans */
    CellArray b;                    /* One-cell array fed to rebuildPage() */

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    /* Describe the overflow cell as a CellArray of length one and use
    ** rebuildPage() to write it onto the new page. */
    b.nCell = 1;
    b.pRef = pPage;
    b.apCell = &pCell;
    b.szCell = &szCell;
    b.apEnd[0] = pPage->aDataEnd;
    b.ixNx[0] = 2;
    rc = rebuildPage(&b, 0, 1, pNew);
    if( NEVER(rc) ){
      releasePage(pNew);
      return rc;
    }
    /* rebuildPage() leaves nFree invalid.  Set it here: everything after
    ** the header, less one 2-byte cell pointer and the cell itself. */
    pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by the code below.
    ** That is Ok, at this point the parent page is guaranteed to
    ** be marked as dirty. Returning an error code will cause a
    ** rollback, undoing any changes made to the parent page.
    */
    if( ISAUTOVACUUM ){
      ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
      if( szCell>pNew->minLocal ){
        ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
      }
    }

    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
    ** largest key on pPage).
    **
    ** To find the largest key value on pPage, first find the right-most
    ** cell on pPage. The first two fields of this cell are the
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the pSpace buffer.
    */
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent. */
    if( rc==SQLITE_OK ){
      insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
                   0, pPage->pgno, &rc);
    }

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

    /* Release the reference to the new page. */
    releasePage(pNew);
  }

  return rc;
}
007338  #endif /* SQLITE_OMIT_QUICKBALANCE */
007339  
007340  #if 0
/*
** This function does not contribute anything to the operation of SQLite.
** It is sometimes activated temporarily while debugging code responsible
** for setting pointer-map entries.
**
** For each of the nPage pages in apPage[], look up the pointer-map entry
** of every overflow page referenced by a cell and of every child page,
** and assert() that the entry names the referencing page as its parent
** with the expected type.  Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int i, j;
  for(i=0; i<nPage; i++){
    Pgno n;                       /* Parent page number read from the ptrmap */
    u8 e;                         /* Entry type read from the ptrmap */
    MemPage *pPage = apPage[i];
    BtShared *pBt = pPage->pBt;
    assert( pPage->isInit );

    for(j=0; j<pPage->nCell; j++){
      CellInfo info;
      u8 *z;

      z = findCell(pPage, j);
      pPage->xParseCell(pPage, z, &info);
      if( info.nLocal<info.nPayload ){
        /* Payload spills off the page.  The last 4 bytes of the cell hold
        ** the number of the first overflow page. */
        Pgno ovfl = get4byte(&z[info.nSize-4]);
        ptrmapGet(pBt, ovfl, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* The first 4 bytes of an interior cell are the child page number */
        Pgno child = get4byte(z);
        ptrmapGet(pBt, child, &e, &n);
        assert( n==pPage->pgno && e==PTRMAP_BTREE );
      }
    }
    if( !pPage->leaf ){
      /* Also check the right-most child pointer stored in the page header */
      Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
      ptrmapGet(pBt, child, &e, &n);
      assert( n==pPage->pgno && e==PTRMAP_BTREE );
    }
  }
  return 1;
}
007380  #endif
007381  
007382  /*
007383  ** This function is used to copy the contents of the b-tree node stored 
007384  ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
007385  ** the pointer-map entries for each child page are updated so that the
007386  ** parent page stored in the pointer map is page pTo. If pFrom contained
007387  ** any cells with overflow page pointers, then the corresponding pointer
007388  ** map entries are also updated so that the parent page is page pTo.
007389  **
007390  ** If pFrom is currently carrying any overflow cells (entries in the
007391  ** MemPage.apOvfl[] array), they are not copied to pTo. 
007392  **
007393  ** Before returning, page pTo is reinitialized using btreeInitPage().
007394  **
007395  ** The performance of this function is not critical. It is only used by 
007396  ** the balance_shallower() and balance_deeper() procedures, neither of
007397  ** which are called often under normal circumstances.
007398  */
007399  static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
007400    if( (*pRC)==SQLITE_OK ){
007401      BtShared * const pBt = pFrom->pBt;
007402      u8 * const aFrom = pFrom->aData;
007403      u8 * const aTo = pTo->aData;
007404      int const iFromHdr = pFrom->hdrOffset;
007405      int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
007406      int rc;
007407      int iData;
007408    
007409    
007410      assert( pFrom->isInit );
007411      assert( pFrom->nFree>=iToHdr );
007412      assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
007413    
007414      /* Copy the b-tree node content from page pFrom to page pTo. */
007415      iData = get2byte(&aFrom[iFromHdr+5]);
007416      memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
007417      memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
007418    
007419      /* Reinitialize page pTo so that the contents of the MemPage structure
007420      ** match the new data. The initialization of pTo can actually fail under
007421      ** fairly obscure circumstances, even though it is a copy of initialized 
007422      ** page pFrom.
007423      */
007424      pTo->isInit = 0;
007425      rc = btreeInitPage(pTo);
007426      if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
007427      if( rc!=SQLITE_OK ){
007428        *pRC = rc;
007429        return;
007430      }
007431    
007432      /* If this is an auto-vacuum database, update the pointer-map entries
007433      ** for any b-tree or overflow pages that pTo now contains the pointers to.
007434      */
007435      if( ISAUTOVACUUM ){
007436        *pRC = setChildPtrmaps(pTo);
007437      }
007438    }
007439  }
007440  
007441  /*
007442  ** This routine redistributes cells on the iParentIdx'th child of pParent
007443  ** (hereafter "the page") and up to 2 siblings so that all pages have about the
007444  ** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
007446  ** side if the page is the first or last child of its parent. If the page 
007447  ** has fewer than 2 siblings (something which can only happen if the page
007448  ** is a root page or a child of a root page) then all available siblings
007449  ** participate in the balancing.
007450  **
007451  ** The number of siblings of the page might be increased or decreased by 
007452  ** one or two in an effort to keep pages nearly full but not over full. 
007453  **
007454  ** Note that when this routine is called, some of the cells on the page
007455  ** might not actually be stored in MemPage.aData[]. This can happen
007456  ** if the page is overfull. This routine ensures that all cells allocated
007457  ** to the page and its siblings fit into MemPage.aData[] before returning.
007458  **
007459  ** In the course of balancing the page and its siblings, cells may be
007460  ** inserted into or removed from the parent page (pParent). Doing so
007461  ** may cause the parent page to become overfull or underfull. If this
007462  ** happens, it is the responsibility of the caller to invoke the correct
007463  ** balancing routine to fix this problem (see the balance() routine). 
007464  **
007465  ** If this routine fails for any reason, it might leave the database
007466  ** in a corrupted state. So if this routine fails, the database should
007467  ** be rolled back.
007468  **
007469  ** The third argument to this function, aOvflSpace, is a pointer to a
007470  ** buffer big enough to hold one page. If while inserting cells into the parent
007471  ** page (pParent) the parent page becomes overfull, this buffer is
007472  ** used to store the parent's overflow cells. Because this function inserts
007473  ** a maximum of four divider cells into the parent page, and the maximum
007474  ** size of a cell stored within an internal node is always less than 1/4
007475  ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
007476  ** enough for all overflow cells.
007477  **
007478  ** If aOvflSpace is set to a null pointer, this function returns 
007479  ** SQLITE_NOMEM.
007480  */
007481  static int balance_nonroot(
007482    MemPage *pParent,               /* Parent page of siblings being balanced */
007483    int iParentIdx,                 /* Index of "the page" in pParent */
007484    u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
007485    int isRoot,                     /* True if pParent is a root-page */
007486    int bBulk                       /* True if this call is part of a bulk load */
007487  ){
007488    BtShared *pBt;               /* The whole database */
007489    int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
007490    int nNew = 0;                /* Number of pages in apNew[] */
007491    int nOld;                    /* Number of pages in apOld[] */
007492    int i, j, k;                 /* Loop counters */
007493    int nxDiv;                   /* Next divider slot in pParent->aCell[] */
007494    int rc = SQLITE_OK;          /* The return code */
007495    u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
007496    int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
007497    int usableSpace;             /* Bytes in pPage beyond the header */
007498    int pageFlags;               /* Value of pPage->aData[0] */
007499    int iSpace1 = 0;             /* First unused byte of aSpace1[] */
007500    int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
007501    int szScratch;               /* Size of scratch memory requested */
007502    MemPage *apOld[NB];          /* pPage and up to two siblings */
007503    MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
007504    u8 *pRight;                  /* Location in parent of right-sibling pointer */
007505    u8 *apDiv[NB-1];             /* Divider cells in pParent */
007506    int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
007507    int cntOld[NB+2];            /* Old index in b.apCell[] */
007508    int szNew[NB+2];             /* Combined size of cells placed on i-th page */
007509    u8 *aSpace1;                 /* Space for copies of dividers cells */
007510    Pgno pgno;                   /* Temp var to store a page number in */
007511    u8 abDone[NB+2];             /* True after i'th new page is populated */
007512    Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
007513    Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
007514    u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
007515    CellArray b;                  /* Parsed information on cells being balanced */
007516  
007517    memset(abDone, 0, sizeof(abDone));
007518    b.nCell = 0;
007519    b.apCell = 0;
007520    pBt = pParent->pBt;
007521    assert( sqlite3_mutex_held(pBt->mutex) );
007522    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007523  
007524    /* At this point pParent may have at most one overflow cell. And if
007525    ** this overflow cell is present, it must be the cell with 
007526    ** index iParentIdx. This scenario comes about when this function
007527    ** is called (indirectly) from sqlite3BtreeDelete().
007528    */
007529    assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
007530    assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
007531  
007532    if( !aOvflSpace ){
007533      return SQLITE_NOMEM_BKPT;
007534    }
007535    assert( pParent->nFree>=0 );
007536  
007537    /* Find the sibling pages to balance. Also locate the cells in pParent 
007538    ** that divide the siblings. An attempt is made to find NN siblings on 
007539    ** either side of pPage. More siblings are taken from one side, however, 
007540    ** if there are fewer than NN siblings on the other side. If pParent
007541    ** has NB or fewer children then all children of pParent are taken.  
007542    **
007543    ** This loop also drops the divider cells from the parent page. This
007544    ** way, the remainder of the function does not have to deal with any
007545    ** overflow cells in the parent page, since if any existed they will
007546    ** have already been removed.
007547    */
007548    i = pParent->nOverflow + pParent->nCell;
007549    if( i<2 ){
007550      nxDiv = 0;
007551    }else{
007552      assert( bBulk==0 || bBulk==1 );
007553      if( iParentIdx==0 ){                 
007554        nxDiv = 0;
007555      }else if( iParentIdx==i ){
007556        nxDiv = i-2+bBulk;
007557      }else{
007558        nxDiv = iParentIdx-1;
007559      }
007560      i = 2-bBulk;
007561    }
007562    nOld = i+1;
007563    if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
007564      pRight = &pParent->aData[pParent->hdrOffset+8];
007565    }else{
007566      pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
007567    }
007568    pgno = get4byte(pRight);
007569    while( 1 ){
007570      rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
007571      if( rc ){
007572        memset(apOld, 0, (i+1)*sizeof(MemPage*));
007573        goto balance_cleanup;
007574      }
007575      if( apOld[i]->nFree<0 ){
007576        rc = btreeComputeFreeSpace(apOld[i]);
007577        if( rc ){
007578          memset(apOld, 0, (i)*sizeof(MemPage*));
007579          goto balance_cleanup;
007580        }
007581      }
007582      if( (i--)==0 ) break;
007583  
007584      if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
007585        apDiv[i] = pParent->apOvfl[0];
007586        pgno = get4byte(apDiv[i]);
007587        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007588        pParent->nOverflow = 0;
007589      }else{
007590        apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
007591        pgno = get4byte(apDiv[i]);
007592        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007593  
007594        /* Drop the cell from the parent page. apDiv[i] still points to
007595        ** the cell within the parent, even though it has been dropped.
007596        ** This is safe because dropping a cell only overwrites the first
007597        ** four bytes of it, and this function does not need the first
007598        ** four bytes of the divider cell. So the pointer is safe to use
007599        ** later on.  
007600        **
007601        ** But not if we are in secure-delete mode. In secure-delete mode,
007602        ** the dropCell() routine will overwrite the entire cell with zeroes.
007603        ** In this case, temporarily copy the cell into the aOvflSpace[]
007604        ** buffer. It will be copied out again as soon as the aSpace[] buffer
007605        ** is allocated.  */
007606        if( pBt->btsFlags & BTS_FAST_SECURE ){
007607          int iOff;
007608  
007609          iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
007610          if( (iOff+szNew[i])>(int)pBt->usableSize ){
007611            rc = SQLITE_CORRUPT_BKPT;
007612            memset(apOld, 0, (i+1)*sizeof(MemPage*));
007613            goto balance_cleanup;
007614          }else{
007615            memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
007616            apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
007617          }
007618        }
007619        dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
007620      }
007621    }
007622  
007623    /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
007624    ** alignment */
007625    nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl));
007626    nMaxCells = (nMaxCells + 3)&~3;
007627  
007628    /*
007629    ** Allocate space for memory structures
007630    */
007631    szScratch =
007632         nMaxCells*sizeof(u8*)                       /* b.apCell */
007633       + nMaxCells*sizeof(u16)                       /* b.szCell */
007634       + pBt->pageSize;                              /* aSpace1 */
007635  
007636    assert( szScratch<=7*(int)pBt->pageSize );
007637    b.apCell = sqlite3StackAllocRaw(0, szScratch );
007638    if( b.apCell==0 ){
007639      rc = SQLITE_NOMEM_BKPT;
007640      goto balance_cleanup;
007641    }
007642    b.szCell = (u16*)&b.apCell[nMaxCells];
007643    aSpace1 = (u8*)&b.szCell[nMaxCells];
007644    assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
007645  
007646    /*
007647    ** Load pointers to all cells on sibling pages and the divider cells
007648    ** into the local b.apCell[] array.  Make copies of the divider cells
007649    ** into space obtained from aSpace1[]. The divider cells have already
007650    ** been removed from pParent.
007651    **
007652    ** If the siblings are on leaf pages, then the child pointers of the
007653    ** divider cells are stripped from the cells before they are copied
007654    ** into aSpace1[].  In this way, all cells in b.apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cells in
007656    ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
007657    ** are alike.
007658    **
007659    ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
007660    **       leafData:  1 if pPage holds key+data and pParent holds only keys.
007661    */
007662    b.pRef = apOld[0];
007663    leafCorrection = b.pRef->leaf*4;
007664    leafData = b.pRef->intKeyLeaf;
007665    for(i=0; i<nOld; i++){
007666      MemPage *pOld = apOld[i];
007667      int limit = pOld->nCell;
007668      u8 *aData = pOld->aData;
007669      u16 maskPage = pOld->maskPage;
007670      u8 *piCell = aData + pOld->cellOffset;
007671      u8 *piEnd;
007672      VVA_ONLY( int nCellAtStart = b.nCell; )
007673  
007674      /* Verify that all sibling pages are of the same "type" (table-leaf,
007675      ** table-interior, index-leaf, or index-interior).
007676      */
007677      if( pOld->aData[0]!=apOld[0]->aData[0] ){
007678        rc = SQLITE_CORRUPT_BKPT;
007679        goto balance_cleanup;
007680      }
007681  
007682      /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
007683      ** contains overflow cells, include them in the b.apCell[] array
007684      ** in the correct spot.
007685      **
007686      ** Note that when there are multiple overflow cells, it is always the
007687      ** case that they are sequential and adjacent.  This invariant arises
    ** because multiple overflows can only occur when inserting divider
007689      ** cells into a parent on a prior balance, and divider cells are always
007690      ** adjacent and are inserted in order.  There is an assert() tagged
007691      ** with "NOTE 1" in the overflow cell insertion loop to prove this
007692      ** invariant.
007693      **
007694      ** This must be done in advance.  Once the balance starts, the cell
007695      ** offset section of the btree page will be overwritten and we will no
    ** longer be able to find the cells if a pointer to each cell is not saved
007697      ** first.
007698      */
007699      memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
007700      if( pOld->nOverflow>0 ){
007701        if( NEVER(limit<pOld->aiOvfl[0]) ){
007702          rc = SQLITE_CORRUPT_BKPT;
007703          goto balance_cleanup;
007704        }
007705        limit = pOld->aiOvfl[0];
007706        for(j=0; j<limit; j++){
007707          b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007708          piCell += 2;
007709          b.nCell++;
007710        }
007711        for(k=0; k<pOld->nOverflow; k++){
007712          assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
007713          b.apCell[b.nCell] = pOld->apOvfl[k];
007714          b.nCell++;
007715        }
007716      }
007717      piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
007718      while( piCell<piEnd ){
007719        assert( b.nCell<nMaxCells );
007720        b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007721        piCell += 2;
007722        b.nCell++;
007723      }
007724      assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
007725  
007726      cntOld[i] = b.nCell;
007727      if( i<nOld-1 && !leafData){
007728        u16 sz = (u16)szNew[i];
007729        u8 *pTemp;
007730        assert( b.nCell<nMaxCells );
007731        b.szCell[b.nCell] = sz;
007732        pTemp = &aSpace1[iSpace1];
007733        iSpace1 += sz;
007734        assert( sz<=pBt->maxLocal+23 );
007735        assert( iSpace1 <= (int)pBt->pageSize );
007736        memcpy(pTemp, apDiv[i], sz);
007737        b.apCell[b.nCell] = pTemp+leafCorrection;
007738        assert( leafCorrection==0 || leafCorrection==4 );
007739        b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
007740        if( !pOld->leaf ){
007741          assert( leafCorrection==0 );
007742          assert( pOld->hdrOffset==0 );
007743          /* The right pointer of the child page pOld becomes the left
007744          ** pointer of the divider cell */
007745          memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
007746        }else{
007747          assert( leafCorrection==4 );
007748          while( b.szCell[b.nCell]<4 ){
007749            /* Do not allow any cells smaller than 4 bytes. If a smaller cell
007750            ** does exist, pad it with 0x00 bytes. */
007751            assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
007752            assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
007753            aSpace1[iSpace1++] = 0x00;
007754            b.szCell[b.nCell]++;
007755          }
007756        }
007757        b.nCell++;
007758      }
007759    }
007760  
007761    /*
007762    ** Figure out the number of pages needed to hold all b.nCell cells.
007763    ** Store this number in "k".  Also compute szNew[] which is the total
007764    ** size of all cells on the i-th page and cntNew[] which is the index
007765    ** in b.apCell[] of the cell that divides page i from page i+1.  
007766    ** cntNew[k] should equal b.nCell.
007767    **
007768    ** Values computed by this block:
007769    **
007770    **           k: The total number of sibling pages
  **    szNew[i]: Space used on the i-th sibling page.
007772    **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
007773    **              the right of the i-th sibling page.
007774    ** usableSpace: Number of bytes of space available on each sibling.
007775    ** 
007776    */
007777    usableSpace = pBt->usableSize - 12 + leafCorrection;
007778    for(i=k=0; i<nOld; i++, k++){
007779      MemPage *p = apOld[i];
007780      b.apEnd[k] = p->aDataEnd;
007781      b.ixNx[k] = cntOld[i];
007782      if( k && b.ixNx[k]==b.ixNx[k-1] ){
007783        k--;  /* Omit b.ixNx[] entry for child pages with no cells */
007784      }
007785      if( !leafData ){
007786        k++;
007787        b.apEnd[k] = pParent->aDataEnd;
007788        b.ixNx[k] = cntOld[i]+1;
007789      }
007790      assert( p->nFree>=0 );
007791      szNew[i] = usableSpace - p->nFree;
007792      for(j=0; j<p->nOverflow; j++){
007793        szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
007794      }
007795      cntNew[i] = cntOld[i];
007796    }
007797    k = nOld;
007798    for(i=0; i<k; i++){
007799      int sz;
007800      while( szNew[i]>usableSpace ){
007801        if( i+1>=k ){
007802          k = i+2;
007803          if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
007804          szNew[k-1] = 0;
007805          cntNew[k-1] = b.nCell;
007806        }
007807        sz = 2 + cachedCellSize(&b, cntNew[i]-1);
007808        szNew[i] -= sz;
007809        if( !leafData ){
007810          if( cntNew[i]<b.nCell ){
007811            sz = 2 + cachedCellSize(&b, cntNew[i]);
007812          }else{
007813            sz = 0;
007814          }
007815        }
007816        szNew[i+1] += sz;
007817        cntNew[i]--;
007818      }
007819      while( cntNew[i]<b.nCell ){
007820        sz = 2 + cachedCellSize(&b, cntNew[i]);
007821        if( szNew[i]+sz>usableSpace ) break;
007822        szNew[i] += sz;
007823        cntNew[i]++;
007824        if( !leafData ){
007825          if( cntNew[i]<b.nCell ){
007826            sz = 2 + cachedCellSize(&b, cntNew[i]);
007827          }else{
007828            sz = 0;
007829          }
007830        }
007831        szNew[i+1] -= sz;
007832      }
007833      if( cntNew[i]>=b.nCell ){
007834        k = i+1;
007835      }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
007836        rc = SQLITE_CORRUPT_BKPT;
007837        goto balance_cleanup;
007838      }
007839    }
007840  
007841    /*
007842    ** The packing computed by the previous block is biased toward the siblings
007843    ** on the left side (siblings with smaller keys). The left siblings are
007844    ** always nearly full, while the right-most sibling might be nearly empty.
007845    ** The next block of code attempts to adjust the packing of siblings to
007846    ** get a better balance.
007847    **
007848    ** This adjustment is more than an optimization.  The packing above might
007849    ** be so out of balance as to be illegal.  For example, the right-most
007850    ** sibling might be completely empty.  This adjustment is not optional.
007851    */
007852    for(i=k-1; i>0; i--){
007853      int szRight = szNew[i];  /* Size of sibling on the right */
007854      int szLeft = szNew[i-1]; /* Size of sibling on the left */
007855      int r;              /* Index of right-most cell in left sibling */
007856      int d;              /* Index of first cell to the left of right sibling */
007857  
007858      r = cntNew[i-1] - 1;
007859      d = r + 1 - leafData;
007860      (void)cachedCellSize(&b, d);
007861      do{
007862        assert( d<nMaxCells );
007863        assert( r<nMaxCells );
007864        (void)cachedCellSize(&b, r);
007865        if( szRight!=0
007866         && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
007867          break;
007868        }
007869        szRight += b.szCell[d] + 2;
007870        szLeft -= b.szCell[r] + 2;
007871        cntNew[i-1] = r;
007872        r--;
007873        d--;
007874      }while( r>=0 );
007875      szNew[i] = szRight;
007876      szNew[i-1] = szLeft;
007877      if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
007878        rc = SQLITE_CORRUPT_BKPT;
007879        goto balance_cleanup;
007880      }
007881    }
007882  
  /* Sanity check:  For a non-corrupt database file one of the following
007884    ** must be true:
007885    **    (1) We found one or more cells (cntNew[0])>0), or
007886    **    (2) pPage is a virtual root page.  A virtual root page is when
007887    **        the real root page is page 1 and we are the only child of
007888    **        that page.
007889    */
007890    assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
007891    TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
007892      apOld[0]->pgno, apOld[0]->nCell,
007893      nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
007894      nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
007895    ));
007896  
007897    /*
007898    ** Allocate k new pages.  Reuse old pages where possible.
007899    */
007900    pageFlags = apOld[0]->aData[0];
007901    for(i=0; i<k; i++){
007902      MemPage *pNew;
007903      if( i<nOld ){
007904        pNew = apNew[i] = apOld[i];
007905        apOld[i] = 0;
007906        rc = sqlite3PagerWrite(pNew->pDbPage);
007907        nNew++;
007908        if( rc ) goto balance_cleanup;
007909      }else{
007910        assert( i>0 );
007911        rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
007912        if( rc ) goto balance_cleanup;
007913        zeroPage(pNew, pageFlags);
007914        apNew[i] = pNew;
007915        nNew++;
007916        cntOld[i] = b.nCell;
007917  
007918        /* Set the pointer-map entry for the new sibling page. */
007919        if( ISAUTOVACUUM ){
007920          ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
007921          if( rc!=SQLITE_OK ){
007922            goto balance_cleanup;
007923          }
007924        }
007925      }
007926    }
007927  
007928    /*
007929    ** Reassign page numbers so that the new pages are in ascending order. 
007930    ** This helps to keep entries in the disk file in order so that a scan
007931    ** of the table is closer to a linear scan through the file. That in turn 
007932    ** helps the operating system to deliver pages from the disk more rapidly.
007933    **
007934    ** An O(n^2) insertion sort algorithm is used, but since n is never more 
007935    ** than (NB+2) (a small constant), that should not be a problem.
007936    **
007937    ** When NB==3, this one optimization makes the database about 25% faster 
007938    ** for large insertions and deletions.
007939    */
007940    for(i=0; i<nNew; i++){
007941      aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
007942      aPgFlags[i] = apNew[i]->pDbPage->flags;
007943      for(j=0; j<i; j++){
007944        if( aPgno[j]==aPgno[i] ){
007945          /* This branch is taken if the set of sibling pages somehow contains
007946          ** duplicate entries. This can happen if the database is corrupt. 
007947          ** It would be simpler to detect this as part of the loop below, but
007948          ** we do the detection here in order to avoid populating the pager
007949          ** cache with two separate objects associated with the same
007950          ** page number.  */
007951          assert( CORRUPT_DB );
007952          rc = SQLITE_CORRUPT_BKPT;
007953          goto balance_cleanup;
007954        }
007955      }
007956    }
007957    for(i=0; i<nNew; i++){
007958      int iBest = 0;                /* aPgno[] index of page number to use */
007959      for(j=1; j<nNew; j++){
007960        if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
007961      }
007962      pgno = aPgOrder[iBest];
007963      aPgOrder[iBest] = 0xffffffff;
007964      if( iBest!=i ){
007965        if( iBest>i ){
007966          sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
007967        }
007968        sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
007969        apNew[i]->pgno = pgno;
007970      }
007971    }
007972  
007973    TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
007974           "%d(%d nc=%d) %d(%d nc=%d)\n",
007975      apNew[0]->pgno, szNew[0], cntNew[0],
007976      nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
007977      nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
007978      nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
007979      nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
007980      nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
007981      nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
007982      nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
007983      nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
007984    ));
007985  
007986    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007987    assert( nNew>=1 && nNew<=ArraySize(apNew) );
007988    assert( apNew[nNew-1]!=0 );
007989    put4byte(pRight, apNew[nNew-1]->pgno);
007990  
007991    /* If the sibling pages are not leaves, ensure that the right-child pointer
007992    ** of the right-most new sibling page is set to the value that was 
007993    ** originally in the same field of the right-most old sibling page. */
007994    if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
007995      MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
007996      memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
007997    }
007998  
007999    /* Make any required updates to pointer map entries associated with 
008000    ** cells stored on sibling pages following the balance operation. Pointer
008001    ** map entries associated with divider cells are set by the insertCell()
008002    ** routine. The associated pointer map entries are:
008003    **
008004    **   a) if the cell contains a reference to an overflow chain, the
008005    **      entry associated with the first page in the overflow chain, and
008006    **
008007    **   b) if the sibling pages are not leaves, the child page associated
008008    **      with the cell.
008009    **
008010    ** If the sibling pages are not leaves, then the pointer map entry 
008011    ** associated with the right-child of each sibling may also need to be 
008012    ** updated. This happens below, after the sibling pages have been 
008013    ** populated, not here.
008014    */
008015    if( ISAUTOVACUUM ){
008016      MemPage *pOld;
008017      MemPage *pNew = pOld = apNew[0];
008018      int cntOldNext = pNew->nCell + pNew->nOverflow;
008019      int iNew = 0;
008020      int iOld = 0;
008021  
008022      for(i=0; i<b.nCell; i++){
008023        u8 *pCell = b.apCell[i];
008024        while( i==cntOldNext ){
008025          iOld++;
008026          assert( iOld<nNew || iOld<nOld );
008027          assert( iOld>=0 && iOld<NB );
008028          pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
008029          cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
008030        }
008031        if( i==cntNew[iNew] ){
008032          pNew = apNew[++iNew];
008033          if( !leafData ) continue;
008034        }
008035  
008036        /* Cell pCell is destined for new sibling page pNew. Originally, it
008037        ** was either part of sibling page iOld (possibly an overflow cell), 
008038        ** or else the divider cell to the left of sibling page iOld. So,
008039        ** if sibling page iOld had the same page number as pNew, and if
008040        ** pCell really was a part of sibling page iOld (not a divider or
008041        ** overflow cell), we can skip updating the pointer map entries.  */
008042        if( iOld>=nNew
008043         || pNew->pgno!=aPgno[iOld]
008044         || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
008045        ){
008046          if( !leafCorrection ){
008047            ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
008048          }
008049          if( cachedCellSize(&b,i)>pNew->minLocal ){
008050            ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
008051          }
008052          if( rc ) goto balance_cleanup;
008053        }
008054      }
008055    }
008056  
008057    /* Insert new divider cells into pParent. */
008058    for(i=0; i<nNew-1; i++){
008059      u8 *pCell;
008060      u8 *pTemp;
008061      int sz;
008062      MemPage *pNew = apNew[i];
008063      j = cntNew[i];
008064  
008065      assert( j<nMaxCells );
008066      assert( b.apCell[j]!=0 );
008067      pCell = b.apCell[j];
008068      sz = b.szCell[j] + leafCorrection;
008069      pTemp = &aOvflSpace[iOvflSpace];
008070      if( !pNew->leaf ){
008071        memcpy(&pNew->aData[8], pCell, 4);
008072      }else if( leafData ){
008073        /* If the tree is a leaf-data tree, and the siblings are leaves, 
008074        ** then there is no divider cell in b.apCell[]. Instead, the divider 
008075        ** cell consists of the integer key for the right-most cell of 
008076        ** the sibling-page assembled above only.
008077        */
008078        CellInfo info;
008079        j--;
008080        pNew->xParseCell(pNew, b.apCell[j], &info);
008081        pCell = pTemp;
008082        sz = 4 + putVarint(&pCell[4], info.nKey);
008083        pTemp = 0;
008084      }else{
008085        pCell -= 4;
008086        /* Obscure case for non-leaf-data trees: If the cell at pCell was
008087        ** previously stored on a leaf node, and its reported size was 4
008088        ** bytes, then it may actually be smaller than this 
008089        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
008090        ** any cell). But it is important to pass the correct size to 
008091        ** insertCell(), so reparse the cell now.
008092        **
008093        ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
008094        ** and WITHOUT ROWID tables with exactly one column which is the
008095        ** primary key.
008096        */
008097        if( b.szCell[j]==4 ){
008098          assert(leafCorrection==4);
008099          sz = pParent->xCellSize(pParent, pCell);
008100        }
008101      }
008102      iOvflSpace += sz;
008103      assert( sz<=pBt->maxLocal+23 );
008104      assert( iOvflSpace <= (int)pBt->pageSize );
008105      insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
008106      if( rc!=SQLITE_OK ) goto balance_cleanup;
008107      assert( sqlite3PagerIswriteable(pParent->pDbPage) );
008108    }
008109  
008110    /* Now update the actual sibling pages. The order in which they are updated
008111    ** is important, as this code needs to avoid disrupting any page from which
  ** cells may still need to be read. In practice, this means:
008113    **
008114    **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
008115    **      then it is not safe to update page apNew[iPg] until after
008116    **      the left-hand sibling apNew[iPg-1] has been updated.
008117    **
008118    **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
008119    **      then it is not safe to update page apNew[iPg] until after
008120    **      the right-hand sibling apNew[iPg+1] has been updated.
008121    **
008122    ** If neither of the above apply, the page is safe to update.
008123    **
008124    ** The iPg value in the following loop starts at nNew-1 goes down
008125    ** to 0, then back up to nNew-1 again, thus making two passes over
008126    ** the pages.  On the initial downward pass, only condition (1) above
008127    ** needs to be tested because (2) will always be true from the previous
008128    ** step.  On the upward pass, both conditions are always true, so the
008129    ** upwards pass simply processes pages that were missed on the downward
008130    ** pass.
008131    */
008132    for(i=1-nNew; i<nNew; i++){
008133      int iPg = i<0 ? -i : i;
008134      assert( iPg>=0 && iPg<nNew );
008135      if( abDone[iPg] ) continue;         /* Skip pages already processed */
008136      if( i>=0                            /* On the upwards pass, or... */
008137       || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
008138      ){
008139        int iNew;
008140        int iOld;
008141        int nNewCell;
008142  
008143        /* Verify condition (1):  If cells are moving left, update iPg
008144        ** only after iPg-1 has already been updated. */
008145        assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
008146  
008147        /* Verify condition (2):  If cells are moving right, update iPg
008148        ** only after iPg+1 has already been updated. */
008149        assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
008150  
008151        if( iPg==0 ){
008152          iNew = iOld = 0;
008153          nNewCell = cntNew[0];
008154        }else{
008155          iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
008156          iNew = cntNew[iPg-1] + !leafData;
008157          nNewCell = cntNew[iPg] - iNew;
008158        }
008159  
008160        rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
008161        if( rc ) goto balance_cleanup;
008162        abDone[iPg]++;
008163        apNew[iPg]->nFree = usableSpace-szNew[iPg];
008164        assert( apNew[iPg]->nOverflow==0 );
008165        assert( apNew[iPg]->nCell==nNewCell );
008166      }
008167    }
008168  
008169    /* All pages have been processed exactly once */
008170    assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
008171  
008172    assert( nOld>0 );
008173    assert( nNew>0 );
008174  
008175    if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
008176      /* The root page of the b-tree now contains no cells. The only sibling
008177      ** page is the right-child of the parent. Copy the contents of the
008178      ** child page into the parent, decreasing the overall height of the
008179      ** b-tree structure by one. This is described as the "balance-shallower"
008180      ** sub-algorithm in some documentation.
008181      **
008182      ** If this is an auto-vacuum database, the call to copyNodeContent() 
008183      ** sets all pointer-map entries corresponding to database image pages 
008184      ** for which the pointer is stored within the content being copied.
008185      **
008186      ** It is critical that the child page be defragmented before being
008187      ** copied into the parent, because if the parent is page 1 then it will
    ** be smaller than the child due to the database header, and so all the
008189      ** free space needs to be up front.
008190      */
008191      assert( nNew==1 || CORRUPT_DB );
008192      rc = defragmentPage(apNew[0], -1);
008193      testcase( rc!=SQLITE_OK );
008194      assert( apNew[0]->nFree == 
008195          (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
008196            - apNew[0]->nCell*2)
008197        || rc!=SQLITE_OK
008198      );
008199      copyNodeContent(apNew[0], pParent, &rc);
008200      freePage(apNew[0], &rc);
008201    }else if( ISAUTOVACUUM && !leafCorrection ){
008202      /* Fix the pointer map entries associated with the right-child of each
008203      ** sibling page. All other pointer map entries have already been taken
008204      ** care of.  */
008205      for(i=0; i<nNew; i++){
008206        u32 key = get4byte(&apNew[i]->aData[8]);
008207        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
008208      }
008209    }
008210  
008211    assert( pParent->isInit );
008212    TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
008213            nOld, nNew, b.nCell));
008214  
008215    /* Free any old pages that were not reused as new pages.
008216    */
008217    for(i=nNew; i<nOld; i++){
008218      freePage(apOld[i], &rc);
008219    }
008220  
008221  #if 0
008222    if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
008223      /* The ptrmapCheckPages() contains assert() statements that verify that
008224      ** all pointer map pages are set correctly. This is helpful while 
008225      ** debugging. This is usually disabled because a corrupt database may
008226      ** cause an assert() statement to fail.  */
008227      ptrmapCheckPages(apNew, nNew);
008228      ptrmapCheckPages(&pParent, 1);
008229    }
008230  #endif
008231  
008232    /*
008233    ** Cleanup before returning.
008234    */
008235  balance_cleanup:
008236    sqlite3StackFree(0, b.apCell);
008237    for(i=0; i<nOld; i++){
008238      releasePage(apOld[i]);
008239    }
008240    for(i=0; i<nNew; i++){
008241      releasePage(apNew[i]);
008242    }
008243  
008244    return rc;
008245  }
008246  
008247  
008248  /*
008249  ** This function is called when the root page of a b-tree structure is
008250  ** overfull (has one or more overflow pages).
008251  **
008252  ** A new child page is allocated and the contents of the current root
008253  ** page, including overflow cells, are copied into the child. The root
008254  ** page is then overwritten to make it an empty page with the right-child 
008255  ** pointer pointing to the new page.
008256  **
008257  ** Before returning, all pointer-map entries corresponding to pages 
008258  ** that the new child-page now contains pointers to are updated. The
008259  ** entry corresponding to the new right-child pointer of the root
008260  ** page is also updated.
008261  **
008262  ** If successful, *ppChild is set to contain a reference to the child 
008263  ** page and SQLITE_OK is returned. In this case the caller is required
008264  ** to call releasePage() on *ppChild exactly once. If an error occurs,
008265  ** an error code is returned and *ppChild is set to 0.
008266  */
008267  static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
008268    int rc;                        /* Return value from subprocedures */
008269    MemPage *pChild = 0;           /* Pointer to a new child page */
008270    Pgno pgnoChild = 0;            /* Page number of the new child page */
008271    BtShared *pBt = pRoot->pBt;    /* The BTree */
008272  
008273    assert( pRoot->nOverflow>0 );
008274    assert( sqlite3_mutex_held(pBt->mutex) );
008275  
008276    /* Make pRoot, the root page of the b-tree, writable. Allocate a new 
008277    ** page that will become the new right-child of pPage. Copy the contents
008278    ** of the node stored on pRoot into the new child page.
008279    */
008280    rc = sqlite3PagerWrite(pRoot->pDbPage);
008281    if( rc==SQLITE_OK ){
008282      rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
008283      copyNodeContent(pRoot, pChild, &rc);
008284      if( ISAUTOVACUUM ){
008285        ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
008286      }
008287    }
008288    if( rc ){
008289      *ppChild = 0;
008290      releasePage(pChild);
008291      return rc;
008292    }
008293    assert( sqlite3PagerIswriteable(pChild->pDbPage) );
008294    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
008295    assert( pChild->nCell==pRoot->nCell || CORRUPT_DB );
008296  
008297    TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
008298  
008299    /* Copy the overflow cells from pRoot to pChild */
008300    memcpy(pChild->aiOvfl, pRoot->aiOvfl,
008301           pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
008302    memcpy(pChild->apOvfl, pRoot->apOvfl,
008303           pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
008304    pChild->nOverflow = pRoot->nOverflow;
008305  
008306    /* Zero the contents of pRoot. Then install pChild as the right-child. */
008307    zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
008308    put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
008309  
008310    *ppChild = pChild;
008311    return SQLITE_OK;
008312  }
008313  
008314  /*
008315  ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
008316  ** on the same B-tree as pCur.
008317  **
008318  ** This can if a database is corrupt with two or more SQL tables
008319  ** pointing to the same b-tree.  If an insert occurs on one SQL table
008320  ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
008321  ** table linked to the same b-tree.  If the secondary insert causes a
008322  ** rebalance, that can change content out from under the cursor on the
008323  ** first SQL table, violating invariants on the first insert.
008324  */
008325  static int anotherValidCursor(BtCursor *pCur){
008326    BtCursor *pOther;
008327    for(pOther=pCur->pBt->pCursor; pOther; pOther=pOther->pNext){
008328      if( pOther!=pCur
008329       && pOther->eState==CURSOR_VALID
008330       && pOther->pPage==pCur->pPage
008331      ){
008332        return SQLITE_CORRUPT_BKPT;
008333      }
008334    }
008335    return SQLITE_OK;
008336  }
008337  
/*
** The page that pCur currently points to has just been modified in
** some way. This function figures out if this modification means the
** tree needs to be balanced, and if so calls the appropriate balancing 
** routine. Balancing routines are:
**
**   balance_quick()
**   balance_deeper()
**   balance_nonroot()
**
** The routine works upward from the cursor's current page.  Each pass of
** the do-loop examines pCur->pPage:
**
**   *  If the page has no overflow cells and its free space does not
**      exceed two thirds of the usable page size, nothing needs doing
**      and the loop ends.
**
**   *  If the page is the root (pCur->iPage==0) and has overflow cells,
**      balance_deeper() pushes its content into a new child and the
**      cursor is moved down onto that child for the next pass.  A root
**      page without overflow cells is left alone.
**
**   *  Otherwise the parent is made writable and either balance_quick()
**      or balance_nonroot() is run; the cursor then moves up to the
**      parent, which is checked on the next pass.
**
** Returns SQLITE_OK, or the first error code reported by any step.  The
** cursor is left on whichever page the loop stopped at.
*/
static int balance(BtCursor *pCur){
  int rc = SQLITE_OK;
  /* Free-space threshold: a page with more than this many free bytes
  ** is a candidate for rebalancing even when it has no overflow cells */
  const int nMin = pCur->pBt->usableSize * 2 / 3;
  u8 aBalanceQuickSpace[13];     /* Scratch space passed to balance_quick() */
  u8 *pFree = 0;                 /* pSpace buffer of an earlier pass, to free */

  /* Counters compiled in by VVA_ONLY(); used only by the assert()s below */
  VVA_ONLY( int balance_quick_called = 0 );
  VVA_ONLY( int balance_deeper_called = 0 );

  do {
    int iPage;
    MemPage *pPage = pCur->pPage;

    /* nFree<0 means the free space has not been computed yet.  That is
    ** not expected here (hence NEVER), but if it occurs and the
    ** computation fails, give up. */
    if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
    if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
      /* Page is neither overfull nor too empty.  Balancing is complete. */
      break;
    }else if( (iPage = pCur->iPage)==0 ){
      if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
        /* The root page of the b-tree is overfull. In this case call the
        ** balance_deeper() function to create a new child for the root-page
        ** and copy the current contents of the root-page to it. The
        ** next iteration of the do-loop will balance the child page.
        */ 
        assert( balance_deeper_called==0 );
        VVA_ONLY( balance_deeper_called++ );
        rc = balance_deeper(pPage, &pCur->apPage[1]);
        if( rc==SQLITE_OK ){
          /* Descend the cursor onto the new child: the root becomes
          ** level 0 of the page stack and the child becomes the
          ** current page. */
          pCur->iPage = 1;
          pCur->ix = 0;
          pCur->aiIdx[0] = 0;
          pCur->apPage[0] = pPage;
          pCur->pPage = pCur->apPage[1];
          assert( pCur->pPage->nOverflow );
        }
      }else{
        /* Either the root has no overflow cells, so there is nothing to
        ** do, or anotherValidCursor() reported an error which is now
        ** held in rc and returned to the caller. */
        break;
      }
    }else{
      MemPage * const pParent = pCur->apPage[iPage-1];
      int const iIdx = pCur->aiIdx[iPage-1];

      /* The parent is modified by both balance_quick() and
      ** balance_nonroot(), so it must be writable and have a valid
      ** free-space count before either is called. */
      rc = sqlite3PagerWrite(pParent->pDbPage);
      if( rc==SQLITE_OK && pParent->nFree<0 ){
        rc = btreeComputeFreeSpace(pParent);
      }
      if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
        if( pPage->intKeyLeaf
         && pPage->nOverflow==1
         && pPage->aiOvfl[0]==pPage->nCell
         && pParent->pgno!=1
         && pParent->nCell==iIdx
        ){
          /* Call balance_quick() to create a new sibling of pPage on which
          ** to store the overflow cell. balance_quick() inserts a new cell
          ** into pParent, which may cause pParent overflow. If this
          ** happens, the next iteration of the do-loop will balance pParent 
          ** using either balance_nonroot() or balance_deeper(). Until this
          ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
          ** buffer. 
          **
          ** The purpose of the following assert() is to check that only a
          ** single call to balance_quick() is made for each call to this
          ** function. If this were not verified, a subtle bug involving reuse
          ** of the aBalanceQuickSpace[] might sneak in.
          */
          assert( balance_quick_called==0 ); 
          VVA_ONLY( balance_quick_called++ );
          rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
        }else
#endif
        {
          /* In this case, call balance_nonroot() to redistribute cells
          ** between pPage and up to 2 of its sibling pages. This involves
          ** modifying the contents of pParent, which may cause pParent to
          ** become overfull or underfull. The next iteration of the do-loop
          ** will balance the parent page to correct this.
          ** 
          ** If the parent page becomes overfull, the overflow cell or cells
          ** are stored in the pSpace buffer allocated immediately below. 
          ** A subsequent iteration of the do-loop will deal with this by
          ** calling balance_nonroot() (balance_deeper() may be called first,
          ** but it doesn't deal with overflow cells - just moves them to a
          ** different page). Once this subsequent call to balance_nonroot() 
          ** has completed, it is safe to release the pSpace buffer used by
          ** the previous call, as the overflow cell data will have been 
          ** copied either into the body of a database page or into the new
          ** pSpace buffer passed to the latter call to balance_nonroot().
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
                               pCur->hints&BTREE_BULKLOAD);
          if( pFree ){
            /* If pFree is not NULL, it points to the pSpace buffer used 
            ** by a previous call to balance_nonroot(). Its contents are
            ** now stored either on real database pages or within the 
            ** new pSpace buffer, so it may be safely freed here. */
            sqlite3PageFree(pFree);
          }

          /* The pSpace buffer will be freed after the next call to
          ** balance_nonroot(), or just before this function returns, whichever
          ** comes first. */
          pFree = pSpace;
        }
      }

      /* Clear the overflow count on the child whether or not the step
      ** above succeeded. */
      pPage->nOverflow = 0;

      /* The next iteration of the do-loop balances the parent page. */
      releasePage(pPage);
      pCur->iPage--;
      assert( pCur->iPage>=0 );
      pCur->pPage = pCur->apPage[pCur->iPage];
    }
  }while( rc==SQLITE_OK );

  /* Release the buffer that was handed to the final balance_nonroot()
  ** call, if there was one. */
  if( pFree ){
    sqlite3PageFree(pFree);
  }
  return rc;
}
008470  
008471  /* Overwrite content from pX into pDest.  Only do the write if the
008472  ** content is different from what is already there.
008473  */
008474  static int btreeOverwriteContent(
008475    MemPage *pPage,           /* MemPage on which writing will occur */
008476    u8 *pDest,                /* Pointer to the place to start writing */
008477    const BtreePayload *pX,   /* Source of data to write */
008478    int iOffset,              /* Offset of first byte to write */
008479    int iAmt                  /* Number of bytes to be written */
008480  ){
008481    int nData = pX->nData - iOffset;
008482    if( nData<=0 ){
008483      /* Overwritting with zeros */
008484      int i;
008485      for(i=0; i<iAmt && pDest[i]==0; i++){}
008486      if( i<iAmt ){
008487        int rc = sqlite3PagerWrite(pPage->pDbPage);
008488        if( rc ) return rc;
008489        memset(pDest + i, 0, iAmt - i);
008490      }
008491    }else{
008492      if( nData<iAmt ){
008493        /* Mixed read data and zeros at the end.  Make a recursive call
008494        ** to write the zeros then fall through to write the real data */
008495        int rc = btreeOverwriteContent(pPage, pDest+nData, pX, iOffset+nData,
008496                                   iAmt-nData);
008497        if( rc ) return rc;
008498        iAmt = nData;
008499      }
008500      if( memcmp(pDest, ((u8*)pX->pData) + iOffset, iAmt)!=0 ){
008501        int rc = sqlite3PagerWrite(pPage->pDbPage);
008502        if( rc ) return rc;
008503        /* In a corrupt database, it is possible for the source and destination
008504        ** buffers to overlap.  This is harmless since the database is already
008505        ** corrupt but it does cause valgrind and ASAN warnings.  So use
008506        ** memmove(). */
008507        memmove(pDest, ((u8*)pX->pData) + iOffset, iAmt);
008508      }
008509    }
008510    return SQLITE_OK;
008511  }
008512  
008513  /*
008514  ** Overwrite the cell that cursor pCur is pointing to with fresh content
008515  ** contained in pX.
008516  */
008517  static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
008518    int iOffset;                        /* Next byte of pX->pData to write */
008519    int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
008520    int rc;                             /* Return code */
008521    MemPage *pPage = pCur->pPage;       /* Page being written */
008522    BtShared *pBt;                      /* Btree */
008523    Pgno ovflPgno;                      /* Next overflow page to write */
008524    u32 ovflPageSize;                   /* Size to write on overflow page */
008525  
008526    if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
008527     || pCur->info.pPayload < pPage->aData + pPage->cellOffset
008528    ){
008529      return SQLITE_CORRUPT_BKPT;
008530    }
008531    /* Overwrite the local portion first */
008532    rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
008533                               0, pCur->info.nLocal);
008534    if( rc ) return rc;
008535    if( pCur->info.nLocal==nTotal ) return SQLITE_OK;
008536  
008537    /* Now overwrite the overflow pages */
008538    iOffset = pCur->info.nLocal;
008539    assert( nTotal>=0 );
008540    assert( iOffset>=0 );
008541    ovflPgno = get4byte(pCur->info.pPayload + iOffset);
008542    pBt = pPage->pBt;
008543    ovflPageSize = pBt->usableSize - 4;
008544    do{
008545      rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
008546      if( rc ) return rc;
008547      if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){
008548        rc = SQLITE_CORRUPT_BKPT;
008549      }else{
008550        if( iOffset+ovflPageSize<(u32)nTotal ){
008551          ovflPgno = get4byte(pPage->aData);
008552        }else{
008553          ovflPageSize = nTotal - iOffset;
008554        }
008555        rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
008556                                   iOffset, ovflPageSize);
008557      }
008558      sqlite3PagerUnref(pPage->pDbPage);
008559      if( rc ) return rc;
008560      iOffset += ovflPageSize;
008561    }while( iOffset<nTotal );
008562    return SQLITE_OK;    
008563  }
008564  
008565  
/*
** Insert a new record into the BTree.  The content of the new record
** is described by the pX object.  The pCur cursor is used only to
** define what table the record should be inserted into.  Unless
** BTREE_SAVEPOSITION is set in flags, the caller should not rely on
** where the cursor points afterwards.
**
** For a table btree (used for rowid tables), only the pX.nKey value of
** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
** hold the content of the row.
**
** For an index btree (used for indexes and WITHOUT ROWID tables), the
** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The 
** pX.pData,nData,nZero fields must be zero.
**
** The flags parameter is a bitmask that may contain only
** BTREE_SAVEPOSITION and BTREE_APPEND.  If BTREE_SAVEPOSITION is set
** then the cursor must already be pointing at an entry with the same key
** as the new entry, and seekResult must be zero.  Any non-zero flags
** value is also forwarded (as flags!=0) to the seek routines that
** position the cursor.
**
** If the seekResult parameter is non-zero, then a successful call to
** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
** been performed.  In other words, if seekResult!=0 then the cursor
** is currently pointing to a cell that will be adjacent to the cell
** to be inserted.  If seekResult<0 then pCur points to a cell that is
** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
** that is larger than (pKey,nKey).
**
** If seekResult==0, that means pCur is pointing at some unknown location.
** In that case, this routine must seek the cursor to the correct insertion
** point for (pKey,nKey) before doing the insertion.  For index btrees,
** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
** key values and pX->aMem can be used instead of pX->pKey to avoid having
** to decode the key.
**
** Returns SQLITE_OK on success or an error code on failure.  If the
** cursor is in the CURSOR_FAULT state on entry, the error code saved in
** pCur->skipNext is returned and nothing is changed.
*/
int sqlite3BtreeInsert(
  BtCursor *pCur,                /* Insert data into the table of this cursor */
  const BtreePayload *pX,        /* Content of the row to be inserted */
  int flags,                     /* Mask of BTREE_SAVEPOSITION, BTREE_APPEND */
  int seekResult                 /* Result of prior MovetoUnpacked() call */
){
  int rc;
  int loc = seekResult;          /* -1: before desired location  +1: after */
  int szNew = 0;                 /* Size in bytes of the new cell */
  int idx;                       /* Index on pPage at which to insert */
  MemPage *pPage;                /* Page that receives the new cell */
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  unsigned char *oldCell;        /* Existing cell that is being replaced */
  unsigned char *newCell = 0;    /* New cell, built in pBt->pTmpSpace */

  assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );

  /* A cursor in the fault state carries its error code in skipNext */
  if( pCur->eState==CURSOR_FAULT ){
    assert( pCur->skipNext!=SQLITE_OK );
    return pCur->skipNext;
  }

  assert( cursorOwnsBtShared(pCur) );
  assert( (pCur->curFlags & BTCF_WriteFlag)!=0
              && pBt->inTransaction==TRANS_WRITE
              && (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

  /* Assert that the caller has been consistent. If this cursor was opened
  ** expecting an index b-tree, then the caller should be inserting blob
  ** keys with no associated data. If the cursor was opened expecting an
  ** intkey table, the caller should be inserting integer keys with a
  ** blob of associated data.  */
  assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );

  /* Save the positions of any other cursors open on this table.
  **
  ** In some cases, the call to btreeMoveto() below is a no-op. For
  ** example, when inserting data into a table with auto-generated integer
  ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 
  ** integer key to use. It then calls this function to actually insert the 
  ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
  ** that the cursor is already where it needs to be and returns without
  ** doing any work. To avoid thwarting these optimizations, it is important
  ** not to clear the cursor here.
  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  if( pCur->pKeyInfo==0 ){
    assert( pX->pKey==0 );
    /* If this is an insert into a table b-tree, invalidate any incrblob 
    ** cursors open on the row being replaced */
    invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);

    /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 
    ** to a row with the same key as the new entry being inserted.
    */
#ifdef SQLITE_DEBUG
    if( flags & BTREE_SAVEPOSITION ){
      assert( pCur->curFlags & BTCF_ValidNKey );
      assert( pX->nKey==pCur->info.nKey );
      assert( loc==0 );
    }
#endif

    /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
    ** that the cursor is not pointing to a row to be overwritten.
    ** So do a complete check.
    */
    if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
      /* The cursor is pointing to the entry that is to be
      ** overwritten */
      assert( pX->nData>=0 && pX->nZero>=0 );
      if( pCur->info.nSize!=0
       && pCur->info.nPayload==(u32)pX->nData+pX->nZero
      ){
        /* New entry is the same size as the old.  Do an overwrite */
        return btreeOverwriteCell(pCur, pX);
      }
      assert( loc==0 );
    }else if( loc==0 ){
      /* The cursor is *not* pointing to the cell to be overwritten, nor
      ** to an adjacent cell.  Move the cursor so that it is pointing either
      ** to the cell to be overwritten or an adjacent cell.
      */
      rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
      if( rc ) return rc;
    }
  }else{
    /* This is an index or a WITHOUT ROWID table */

    /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 
    ** to a row with the same key as the new entry being inserted.
    */
    assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );

    /* If the cursor is not already pointing either to the cell to be
    ** overwritten, or if a new cell is being inserted, if the cursor is
    ** not pointing to an immediately adjacent cell, then move the cursor
    ** so that it does.
    */
    if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
      if( pX->nMem ){
        /* The caller supplied the key as unpacked values.  Build an
        ** UnpackedRecord around them so that the key does not have to
        ** be decoded again. */
        UnpackedRecord r;
        r.pKeyInfo = pCur->pKeyInfo;
        r.aMem = pX->aMem;
        r.nField = pX->nMem;
        r.default_rc = 0;
        r.errCode = 0;
        r.r1 = 0;
        r.r2 = 0;
        r.eqSeen = 0;
        rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
      }else{
        rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
      }
      if( rc ) return rc;
    }

    /* If the cursor is currently pointing to an entry to be overwritten
    ** and the new key is the same size as the old, then use the
    ** overwrite optimization.
    */
    if( loc==0 ){
      getCellInfo(pCur);
      if( pCur->info.nKey==pX->nKey ){
        /* For an index the key bytes are the payload, so present them
        ** to btreeOverwriteCell() as the data of a temporary payload. */
        BtreePayload x2;
        x2.pData = pX->pKey;
        x2.nData = pX->nKey;
        x2.nZero = 0;
        return btreeOverwriteCell(pCur, &x2);
      }
    }

  }
  assert( pCur->eState==CURSOR_VALID 
       || (pCur->eState==CURSOR_INVALID && loc)
       || CORRUPT_DB );

  pPage = pCur->pPage;
  assert( pPage->intKey || pX->nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );
  /* nFree<0 means that the free space on the page has not yet been
  ** computed.  Compute it now, since it is needed for the insert. */
  if( pPage->nFree<0 ){
    rc = btreeComputeFreeSpace(pPage);
    if( rc ) return rc;
  }

  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  /* Assemble the new cell in the temporary space owned by the BtShared */
  newCell = pBt->pTmpSpace;
  assert( newCell!=0 );
  rc = fillInCell(pPage, newCell, pX, &szNew);
  if( rc ) goto end_insert;
  assert( szNew==pPage->xCellSize(pPage, newCell) );
  assert( szNew <= MX_CELL_SIZE(pBt) );
  idx = pCur->ix;
  if( loc==0 ){
    /* The cursor points at an existing entry with the same key.  That
    ** entry is replaced by the new cell. */
    CellInfo info;
    assert( idx<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ){
      goto end_insert;
    }
    oldCell = findCell(pPage, idx);
    if( !pPage->leaf ){
      /* On a non-leaf page, carry the first 4 bytes of the old cell over
      ** to the new one.  NOTE(review): presumably these bytes are the
      ** left-child page number - confirm against the cell format. */
      memcpy(newCell, oldCell, 4);
    }
    rc = clearCell(pPage, oldCell, &info);
    testcase( pCur->curFlags & BTCF_ValidOvfl );
    invalidateOverflowCache(pCur);
    if( info.nSize==szNew && info.nLocal==info.nPayload 
     && (!ISAUTOVACUUM || szNew<pPage->minLocal)
    ){
      /* Overwrite the old cell with the new if they are the same size.
      ** We could also try to do this if the old cell is smaller, then add
      ** the leftover space to the free list.  But experiments show that
      ** doing that is no faster then skipping this optimization and just
      ** calling dropCell() and insertCell(). 
      **
      ** This optimization cannot be used on an autovacuum database if the
      ** new entry uses overflow pages, as the insertCell() call below is
      ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
      assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
      /* Refuse to write if the old cell does not lie wholly between the
      ** end of the 10 bytes that follow hdrOffset and the end of the
      ** page.  That can only mean the database is corrupt. */
      if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
        return SQLITE_CORRUPT_BKPT;
      }
      if( oldCell+szNew > pPage->aDataEnd ){
        return SQLITE_CORRUPT_BKPT;
      }
      memcpy(oldCell, newCell, szNew);
      return SQLITE_OK;
    }
    dropCell(pPage, idx, info.nSize, &rc);
    if( rc ) goto end_insert;
  }else if( loc<0 && pPage->nCell>0 ){
    /* The cursor points at an entry smaller than the new one, so the
    ** new cell goes into the slot just after it. */
    assert( pPage->leaf );
    idx = ++pCur->ix;
    pCur->curFlags &= ~BTCF_ValidNKey;
  }else{
    /* Either the cursor points at a larger entry or the page is empty.
    ** Insert at the current index. */
    assert( pPage->leaf );
  }
  insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
  assert( pPage->nOverflow==0 || rc==SQLITE_OK );
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occurred and pPage has an overflow cell, call balance() 
  ** to redistribute the cells within the tree. Since balance() may move
  ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
  ** variables.
  **
  ** Previous versions of SQLite called moveToRoot() to move the cursor
  ** back to the root page as balance() used to invalidate the contents
  ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
  ** set the cursor state to "invalid". This makes common insert operations
  ** slightly faster.
  **
  ** There is a subtle but important optimization here too. When inserting
  ** multiple records into an intkey b-tree using a single cursor (as can
  ** happen while processing an "INSERT INTO ... SELECT" statement), it
  ** is advantageous to leave the cursor pointing to the last entry in
  ** the b-tree if possible. If the cursor is left pointing to the last
  ** entry in the table, and the next row inserted has an integer key
  ** larger than the largest existing key, it is possible to insert the
  ** row without seeking the cursor. This can be a big performance boost.
  */
  pCur->info.nSize = 0;
  if( pPage->nOverflow ){
    assert( rc==SQLITE_OK );
    pCur->curFlags &= ~(BTCF_ValidNKey);
    rc = balance(pCur);

    /* Must make sure nOverflow is reset to zero even if the balance()
    ** fails. Internal data structure corruption will result otherwise. 
    ** Also, set the cursor state to invalid. This stops saveCursorPosition()
    ** from trying to save the current position of the cursor.  */
    pCur->pPage->nOverflow = 0;
    pCur->eState = CURSOR_INVALID;
    if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
      /* The caller wants the cursor position preserved.  Drop the page
      ** references held by the cursor and record the key of the new entry
      ** so that the cursor is in the CURSOR_REQUIRESEEK state. */
      btreeReleaseAllCursorPages(pCur);
      if( pCur->pKeyInfo ){
        /* Index b-tree: the cursor needs its own copy of the key bytes */
        assert( pCur->pKey==0 );
        pCur->pKey = sqlite3Malloc( pX->nKey );
        if( pCur->pKey==0 ){
          rc = SQLITE_NOMEM;
        }else{
          memcpy(pCur->pKey, pX->pKey, pX->nKey);
        }
      }
      pCur->eState = CURSOR_REQUIRESEEK;
      pCur->nKey = pX->nKey;
    }
  }
  assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );

end_insert:
  return rc;
}
008859  
008860  /*
008861  ** Delete the entry that the cursor is pointing to. 
008862  **
008863  ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
008864  ** the cursor is left pointing at an arbitrary location after the delete.
008865  ** But if that bit is set, then the cursor is left in a state such that
008866  ** the next call to BtreeNext() or BtreePrev() moves it to the same row
008867  ** as it would have been on if the call to BtreeDelete() had been omitted.
008868  **
008869  ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
008870  ** associated with a single table entry and its indexes.  Only one of those
008871  ** deletes is considered the "primary" delete.  The primary delete occurs
008872  ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
008873  ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
008874  ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
008875  ** but which might be used by alternative storage engines.
008876  */
008877  int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
008878    Btree *p = pCur->pBtree;
008879    BtShared *pBt = p->pBt;              
008880    int rc;                              /* Return code */
008881    MemPage *pPage;                      /* Page to delete cell from */
008882    unsigned char *pCell;                /* Pointer to cell to delete */
008883    int iCellIdx;                        /* Index of cell to delete */
008884    int iCellDepth;                      /* Depth of node containing pCell */ 
008885    CellInfo info;                       /* Size of the cell being deleted */
008886    int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
008887    u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
008888  
008889    assert( cursorOwnsBtShared(pCur) );
008890    assert( pBt->inTransaction==TRANS_WRITE );
008891    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
008892    assert( pCur->curFlags & BTCF_WriteFlag );
008893    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
008894    assert( !hasReadConflicts(p, pCur->pgnoRoot) );
008895    assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
008896    if( pCur->eState==CURSOR_REQUIRESEEK ){
008897      rc = btreeRestoreCursorPosition(pCur);
008898      if( rc ) return rc;
008899    }
008900    assert( pCur->eState==CURSOR_VALID );
008901  
008902    iCellDepth = pCur->iPage;
008903    iCellIdx = pCur->ix;
008904    pPage = pCur->pPage;
008905    pCell = findCell(pPage, iCellIdx);
008906    if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;
008907  
008908    /* If the bPreserve flag is set to true, then the cursor position must
008909    ** be preserved following this delete operation. If the current delete
008910    ** will cause a b-tree rebalance, then this is done by saving the cursor
008911    ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
008912    ** returning. 
008913    **
008914    ** Or, if the current delete will not cause a rebalance, then the cursor
008915    ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
008916    ** before or after the deleted entry. In this case set bSkipnext to true.  */
008917    if( bPreserve ){
008918      if( !pPage->leaf 
008919       || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
008920       || pPage->nCell==1  /* See dbfuzz001.test for a test case */
008921      ){
008922        /* A b-tree rebalance will be required after deleting this entry.
008923        ** Save the cursor key.  */
008924        rc = saveCursorKey(pCur);
008925        if( rc ) return rc;
008926      }else{
008927        bSkipnext = 1;
008928      }
008929    }
008930  
008931    /* If the page containing the entry to delete is not a leaf page, move
008932    ** the cursor to the largest entry in the tree that is smaller than
008933    ** the entry being deleted. This cell will replace the cell being deleted
008934    ** from the internal node. The 'previous' entry is used for this instead
008935    ** of the 'next' entry, as the previous entry is always a part of the
008936    ** sub-tree headed by the child page of the cell being deleted. This makes
008937    ** balancing the tree following the delete operation easier.  */
008938    if( !pPage->leaf ){
008939      rc = sqlite3BtreePrevious(pCur, 0);
008940      assert( rc!=SQLITE_DONE );
008941      if( rc ) return rc;
008942    }
008943  
008944    /* Save the positions of any other cursors open on this table before
008945    ** making any modifications.  */
008946    if( pCur->curFlags & BTCF_Multiple ){
008947      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
008948      if( rc ) return rc;
008949    }
008950  
008951    /* If this is a delete operation to remove a row from a table b-tree,
008952    ** invalidate any incrblob cursors open on the row being deleted.  */
008953    if( pCur->pKeyInfo==0 ){
008954      invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
008955    }
008956  
008957    /* Make the page containing the entry to be deleted writable. Then free any
008958    ** overflow pages associated with the entry and finally remove the cell
008959    ** itself from within the page.  */
008960    rc = sqlite3PagerWrite(pPage->pDbPage);
008961    if( rc ) return rc;
008962    rc = clearCell(pPage, pCell, &info);
008963    dropCell(pPage, iCellIdx, info.nSize, &rc);
008964    if( rc ) return rc;
008965  
008966    /* If the cell deleted was not located on a leaf page, then the cursor
008967    ** is currently pointing to the largest entry in the sub-tree headed
008968    ** by the child-page of the cell that was just deleted from an internal
008969    ** node. The cell from the leaf node needs to be moved to the internal
008970    ** node to replace the deleted cell.  */
008971    if( !pPage->leaf ){
008972      MemPage *pLeaf = pCur->pPage;
008973      int nCell;
008974      Pgno n;
008975      unsigned char *pTmp;
008976  
008977      if( pLeaf->nFree<0 ){
008978        rc = btreeComputeFreeSpace(pLeaf);
008979        if( rc ) return rc;
008980      }
008981      if( iCellDepth<pCur->iPage-1 ){
008982        n = pCur->apPage[iCellDepth+1]->pgno;
008983      }else{
008984        n = pCur->pPage->pgno;
008985      }
008986      pCell = findCell(pLeaf, pLeaf->nCell-1);
008987      if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
008988      nCell = pLeaf->xCellSize(pLeaf, pCell);
008989      assert( MX_CELL_SIZE(pBt) >= nCell );
008990      pTmp = pBt->pTmpSpace;
008991      assert( pTmp!=0 );
008992      rc = sqlite3PagerWrite(pLeaf->pDbPage);
008993      if( rc==SQLITE_OK ){
008994        insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
008995      }
008996      dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
008997      if( rc ) return rc;
008998    }
008999  
009000    /* Balance the tree. If the entry deleted was located on a leaf page,
009001    ** then the cursor still points to that page. In this case the first
009002    ** call to balance() repairs the tree, and the if(...) condition is
009003    ** never true.
009004    **
009005    ** Otherwise, if the entry deleted was on an internal node page, then
009006    ** pCur is pointing to the leaf page from which a cell was removed to
009007    ** replace the cell deleted from the internal node. This is slightly
009008    ** tricky as the leaf node may be underfull, and the internal node may
009009    ** be either under or overfull. In this case run the balancing algorithm
009010    ** on the leaf node first. If the balance proceeds far enough up the
009011    ** tree that we can be sure that any problem in the internal node has
009012    ** been corrected, so be it. Otherwise, after balancing the leaf node,
009013    ** walk the cursor up the tree to the internal node and balance it as 
009014    ** well.  */
009015    rc = balance(pCur);
009016    if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
009017      releasePageNotNull(pCur->pPage);
009018      pCur->iPage--;
009019      while( pCur->iPage>iCellDepth ){
009020        releasePage(pCur->apPage[pCur->iPage--]);
009021      }
009022      pCur->pPage = pCur->apPage[pCur->iPage];
009023      rc = balance(pCur);
009024    }
009025  
009026    if( rc==SQLITE_OK ){
009027      if( bSkipnext ){
009028        assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
009029        assert( pPage==pCur->pPage || CORRUPT_DB );
009030        assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
009031        pCur->eState = CURSOR_SKIPNEXT;
009032        if( iCellIdx>=pPage->nCell ){
009033          pCur->skipNext = -1;
009034          pCur->ix = pPage->nCell-1;
009035        }else{
009036          pCur->skipNext = 1;
009037        }
009038      }else{
009039        rc = moveToRoot(pCur);
009040        if( bPreserve ){
009041          btreeReleaseAllCursorPages(pCur);
009042          pCur->eState = CURSOR_REQUIRESEEK;
009043        }
009044        if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
009045      }
009046    }
009047    return rc;
009048  }
009049  
009050  /*
009051  ** Create a new BTree table.  Write into *piTable the page
009052  ** number for the root page of the new table.
009053  **
009054  ** The type of type is determined by the flags parameter.  Only the
009055  ** following values of flags are currently in use.  Other values for
009056  ** flags might not work:
009057  **
009058  **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
009059  **     BTREE_ZERODATA                  Used for SQL indices
009060  */
009061  static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
009062    BtShared *pBt = p->pBt;
009063    MemPage *pRoot;
009064    Pgno pgnoRoot;
009065    int rc;
009066    int ptfFlags;          /* Page-type flage for the root page of new table */
009067  
009068    assert( sqlite3BtreeHoldsMutex(p) );
009069    assert( pBt->inTransaction==TRANS_WRITE );
009070    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
009071  
009072  #ifdef SQLITE_OMIT_AUTOVACUUM
009073    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
009074    if( rc ){
009075      return rc;
009076    }
009077  #else
009078    if( pBt->autoVacuum ){
009079      Pgno pgnoMove;      /* Move a page here to make room for the root-page */
009080      MemPage *pPageMove; /* The page to move to. */
009081  
009082      /* Creating a new table may probably require moving an existing database
009083      ** to make room for the new tables root page. In case this page turns
009084      ** out to be an overflow page, delete all overflow page-map caches
009085      ** held by open cursors.
009086      */
009087      invalidateAllOverflowCache(pBt);
009088  
009089      /* Read the value of meta[3] from the database to determine where the
009090      ** root page of the new table should go. meta[3] is the largest root-page
009091      ** created so far, so the new root-page is (meta[3]+1).
009092      */
009093      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
009094      pgnoRoot++;
009095  
009096      /* The new root-page may not be allocated on a pointer-map page, or the
009097      ** PENDING_BYTE page.
009098      */
009099      while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
009100          pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
009101        pgnoRoot++;
009102      }
009103      assert( pgnoRoot>=3 || CORRUPT_DB );
009104      testcase( pgnoRoot<3 );
009105  
009106      /* Allocate a page. The page that currently resides at pgnoRoot will
009107      ** be moved to the allocated page (unless the allocated page happens
009108      ** to reside at pgnoRoot).
009109      */
009110      rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
009111      if( rc!=SQLITE_OK ){
009112        return rc;
009113      }
009114  
009115      if( pgnoMove!=pgnoRoot ){
009116        /* pgnoRoot is the page that will be used for the root-page of
009117        ** the new table (assuming an error did not occur). But we were
009118        ** allocated pgnoMove. If required (i.e. if it was not allocated
009119        ** by extending the file), the current page at position pgnoMove
009120        ** is already journaled.
009121        */
009122        u8 eType = 0;
009123        Pgno iPtrPage = 0;
009124  
009125        /* Save the positions of any open cursors. This is required in
009126        ** case they are holding a reference to an xFetch reference
009127        ** corresponding to page pgnoRoot.  */
009128        rc = saveAllCursors(pBt, 0, 0);
009129        releasePage(pPageMove);
009130        if( rc!=SQLITE_OK ){
009131          return rc;
009132        }
009133  
009134        /* Move the page currently at pgnoRoot to pgnoMove. */
009135        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
009136        if( rc!=SQLITE_OK ){
009137          return rc;
009138        }
009139        rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
009140        if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
009141          rc = SQLITE_CORRUPT_BKPT;
009142        }
009143        if( rc!=SQLITE_OK ){
009144          releasePage(pRoot);
009145          return rc;
009146        }
009147        assert( eType!=PTRMAP_ROOTPAGE );
009148        assert( eType!=PTRMAP_FREEPAGE );
009149        rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
009150        releasePage(pRoot);
009151  
009152        /* Obtain the page at pgnoRoot */
009153        if( rc!=SQLITE_OK ){
009154          return rc;
009155        }
009156        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
009157        if( rc!=SQLITE_OK ){
009158          return rc;
009159        }
009160        rc = sqlite3PagerWrite(pRoot->pDbPage);
009161        if( rc!=SQLITE_OK ){
009162          releasePage(pRoot);
009163          return rc;
009164        }
009165      }else{
009166        pRoot = pPageMove;
009167      } 
009168  
009169      /* Update the pointer-map and meta-data with the new root-page number. */
009170      ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
009171      if( rc ){
009172        releasePage(pRoot);
009173        return rc;
009174      }
009175  
009176      /* When the new root page was allocated, page 1 was made writable in
009177      ** order either to increase the database filesize, or to decrement the
009178      ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
009179      */
009180      assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
009181      rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
009182      if( NEVER(rc) ){
009183        releasePage(pRoot);
009184        return rc;
009185      }
009186  
009187    }else{
009188      rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
009189      if( rc ) return rc;
009190    }
009191  #endif
009192    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
009193    if( createTabFlags & BTREE_INTKEY ){
009194      ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
009195    }else{
009196      ptfFlags = PTF_ZERODATA | PTF_LEAF;
009197    }
009198    zeroPage(pRoot, ptfFlags);
009199    sqlite3PagerUnref(pRoot->pDbPage);
009200    assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
009201    *piTable = (int)pgnoRoot;
009202    return SQLITE_OK;
009203  }
009204  int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
009205    int rc;
009206    sqlite3BtreeEnter(p);
009207    rc = btreeCreateTable(p, piTable, flags);
009208    sqlite3BtreeLeave(p);
009209    return rc;
009210  }
009211  
009212  /*
009213  ** Erase the given database page and all its children.  Return
009214  ** the page to the freelist.
009215  */
009216  static int clearDatabasePage(
009217    BtShared *pBt,           /* The BTree that contains the table */
009218    Pgno pgno,               /* Page number to clear */
009219    int freePageFlag,        /* Deallocate page if true */
009220    int *pnChange            /* Add number of Cells freed to this counter */
009221  ){
009222    MemPage *pPage;
009223    int rc;
009224    unsigned char *pCell;
009225    int i;
009226    int hdr;
009227    CellInfo info;
009228  
009229    assert( sqlite3_mutex_held(pBt->mutex) );
009230    if( pgno>btreePagecount(pBt) ){
009231      return SQLITE_CORRUPT_BKPT;
009232    }
009233    rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
009234    if( rc ) return rc;
009235    if( pPage->bBusy ){
009236      rc = SQLITE_CORRUPT_BKPT;
009237      goto cleardatabasepage_out;
009238    }
009239    pPage->bBusy = 1;
009240    hdr = pPage->hdrOffset;
009241    for(i=0; i<pPage->nCell; i++){
009242      pCell = findCell(pPage, i);
009243      if( !pPage->leaf ){
009244        rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
009245        if( rc ) goto cleardatabasepage_out;
009246      }
009247      rc = clearCell(pPage, pCell, &info);
009248      if( rc ) goto cleardatabasepage_out;
009249    }
009250    if( !pPage->leaf ){
009251      rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
009252      if( rc ) goto cleardatabasepage_out;
009253    }else if( pnChange ){
009254      assert( pPage->intKey || CORRUPT_DB );
009255      testcase( !pPage->intKey );
009256      *pnChange += pPage->nCell;
009257    }
009258    if( freePageFlag ){
009259      freePage(pPage, &rc);
009260    }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
009261      zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
009262    }
009263  
009264  cleardatabasepage_out:
009265    pPage->bBusy = 0;
009266    releasePage(pPage);
009267    return rc;
009268  }
009269  
009270  /*
009271  ** Delete all information from a single table in the database.  iTable is
009272  ** the page number of the root of the table.  After this routine returns,
009273  ** the root page is empty, but still exists.
009274  **
009275  ** This routine will fail with SQLITE_LOCKED if there are any open
009276  ** read cursors on the table.  Open write cursors are moved to the
009277  ** root of the table.
009278  **
009279  ** If pnChange is not NULL, then table iTable must be an intkey table. The
009280  ** integer value pointed to by pnChange is incremented by the number of
009281  ** entries in the table.
009282  */
009283  int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
009284    int rc;
009285    BtShared *pBt = p->pBt;
009286    sqlite3BtreeEnter(p);
009287    assert( p->inTrans==TRANS_WRITE );
009288  
009289    rc = saveAllCursors(pBt, (Pgno)iTable, 0);
009290  
009291    if( SQLITE_OK==rc ){
009292      /* Invalidate all incrblob cursors open on table iTable (assuming iTable
009293      ** is the root of a table b-tree - if it is not, the following call is
009294      ** a no-op).  */
009295      invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
009296      rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
009297    }
009298    sqlite3BtreeLeave(p);
009299    return rc;
009300  }
009301  
009302  /*
009303  ** Delete all information from the single table that pCur is open on.
009304  **
009305  ** This routine only work for pCur on an ephemeral table.
009306  */
009307  int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
009308    return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
009309  }
009310  
009311  /*
009312  ** Erase all information in a table and add the root of the table to
009313  ** the freelist.  Except, the root of the principle table (the one on
009314  ** page 1) is never added to the freelist.
009315  **
009316  ** This routine will fail with SQLITE_LOCKED if there are any open
009317  ** cursors on the table.
009318  **
009319  ** If AUTOVACUUM is enabled and the page at iTable is not the last
009320  ** root page in the database file, then the last root page 
009321  ** in the database file is moved into the slot formerly occupied by
009322  ** iTable and that last slot formerly occupied by the last root page
009323  ** is added to the freelist instead of iTable.  In this say, all
009324  ** root pages are kept at the beginning of the database file, which
009325  ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
009326  ** page number that used to be the last root page in the file before
009327  ** the move.  If no page gets moved, *piMoved is set to 0.
009328  ** The last root page is recorded in meta[3] and the value of
009329  ** meta[3] is updated by this procedure.
009330  */
009331  static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
009332    int rc;
009333    MemPage *pPage = 0;
009334    BtShared *pBt = p->pBt;
009335  
009336    assert( sqlite3BtreeHoldsMutex(p) );
009337    assert( p->inTrans==TRANS_WRITE );
009338    assert( iTable>=2 );
009339    if( iTable>btreePagecount(pBt) ){
009340      return SQLITE_CORRUPT_BKPT;
009341    }
009342  
009343    rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
009344    if( rc ) return rc;
009345    rc = sqlite3BtreeClearTable(p, iTable, 0);
009346    if( rc ){
009347      releasePage(pPage);
009348      return rc;
009349    }
009350  
009351    *piMoved = 0;
009352  
009353  #ifdef SQLITE_OMIT_AUTOVACUUM
009354    freePage(pPage, &rc);
009355    releasePage(pPage);
009356  #else
009357    if( pBt->autoVacuum ){
009358      Pgno maxRootPgno;
009359      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
009360  
009361      if( iTable==maxRootPgno ){
009362        /* If the table being dropped is the table with the largest root-page
009363        ** number in the database, put the root page on the free list. 
009364        */
009365        freePage(pPage, &rc);
009366        releasePage(pPage);
009367        if( rc!=SQLITE_OK ){
009368          return rc;
009369        }
009370      }else{
009371        /* The table being dropped does not have the largest root-page
009372        ** number in the database. So move the page that does into the 
009373        ** gap left by the deleted root-page.
009374        */
009375        MemPage *pMove;
009376        releasePage(pPage);
009377        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
009378        if( rc!=SQLITE_OK ){
009379          return rc;
009380        }
009381        rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
009382        releasePage(pMove);
009383        if( rc!=SQLITE_OK ){
009384          return rc;
009385        }
009386        pMove = 0;
009387        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
009388        freePage(pMove, &rc);
009389        releasePage(pMove);
009390        if( rc!=SQLITE_OK ){
009391          return rc;
009392        }
009393        *piMoved = maxRootPgno;
009394      }
009395  
009396      /* Set the new 'max-root-page' value in the database header. This
009397      ** is the old value less one, less one more if that happens to
009398      ** be a root-page number, less one again if that is the
009399      ** PENDING_BYTE_PAGE.
009400      */
009401      maxRootPgno--;
009402      while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
009403             || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
009404        maxRootPgno--;
009405      }
009406      assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
009407  
009408      rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
009409    }else{
009410      freePage(pPage, &rc);
009411      releasePage(pPage);
009412    }
009413  #endif
009414    return rc;  
009415  }
009416  int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
009417    int rc;
009418    sqlite3BtreeEnter(p);
009419    rc = btreeDropTable(p, iTable, piMoved);
009420    sqlite3BtreeLeave(p);
009421    return rc;
009422  }
009423  
009424  
009425  /*
009426  ** This function may only be called if the b-tree connection already
009427  ** has a read or write transaction open on the database.
009428  **
009429  ** Read the meta-information out of a database file.  Meta[0]
009430  ** is the number of free pages currently in the database.  Meta[1]
009431  ** through meta[15] are available for use by higher layers.  Meta[0]
009432  ** is read-only, the others are read/write.
009433  ** 
009434  ** The schema layer numbers meta values differently.  At the schema
009435  ** layer (and the SetCookie and ReadCookie opcodes) the number of
009436  ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
009437  **
009438  ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
009439  ** of reading the value out of the header, it instead loads the "DataVersion"
009440  ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
009441  ** database file.  It is a number computed by the pager.  But its access
009442  ** pattern is the same as header meta values, and so it is convenient to
009443  ** read it from this routine.
009444  */
009445  void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
009446    BtShared *pBt = p->pBt;
009447  
009448    sqlite3BtreeEnter(p);
009449    assert( p->inTrans>TRANS_NONE );
009450    assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
009451    assert( pBt->pPage1 );
009452    assert( idx>=0 && idx<=15 );
009453  
009454    if( idx==BTREE_DATA_VERSION ){
009455      *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
009456    }else{
009457      *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
009458    }
009459  
009460    /* If auto-vacuum is disabled in this build and this is an auto-vacuum
009461    ** database, mark the database as read-only.  */
009462  #ifdef SQLITE_OMIT_AUTOVACUUM
009463    if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
009464      pBt->btsFlags |= BTS_READ_ONLY;
009465    }
009466  #endif
009467  
009468    sqlite3BtreeLeave(p);
009469  }
009470  
009471  /*
009472  ** Write meta-information back into the database.  Meta[0] is
009473  ** read-only and may not be written.
009474  */
009475  int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
009476    BtShared *pBt = p->pBt;
009477    unsigned char *pP1;
009478    int rc;
009479    assert( idx>=1 && idx<=15 );
009480    sqlite3BtreeEnter(p);
009481    assert( p->inTrans==TRANS_WRITE );
009482    assert( pBt->pPage1!=0 );
009483    pP1 = pBt->pPage1->aData;
009484    rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
009485    if( rc==SQLITE_OK ){
009486      put4byte(&pP1[36 + idx*4], iMeta);
009487  #ifndef SQLITE_OMIT_AUTOVACUUM
009488      if( idx==BTREE_INCR_VACUUM ){
009489        assert( pBt->autoVacuum || iMeta==0 );
009490        assert( iMeta==0 || iMeta==1 );
009491        pBt->incrVacuum = (u8)iMeta;
009492      }
009493  #endif
009494    }
009495    sqlite3BtreeLeave(p);
009496    return rc;
009497  }
009498  
009499  #ifndef SQLITE_OMIT_BTREECOUNT
009500  /*
009501  ** The first argument, pCur, is a cursor opened on some b-tree. Count the
009502  ** number of entries in the b-tree and write the result to *pnEntry.
009503  **
009504  ** SQLITE_OK is returned if the operation is successfully executed. 
009505  ** Otherwise, if an error is encountered (i.e. an IO error or database
009506  ** corruption) an SQLite error code is returned.
009507  */
009508  int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
009509    i64 nEntry = 0;                      /* Value to return in *pnEntry */
009510    int rc;                              /* Return code */
009511  
009512    rc = moveToRoot(pCur);
009513    if( rc==SQLITE_EMPTY ){
009514      *pnEntry = 0;
009515      return SQLITE_OK;
009516    }
009517  
009518    /* Unless an error occurs, the following loop runs one iteration for each
009519    ** page in the B-Tree structure (not including overflow pages). 
009520    */
009521    while( rc==SQLITE_OK && !db->u1.isInterrupted ){
009522      int iIdx;                          /* Index of child node in parent */
009523      MemPage *pPage;                    /* Current page of the b-tree */
009524  
009525      /* If this is a leaf page or the tree is not an int-key tree, then 
009526      ** this page contains countable entries. Increment the entry counter
009527      ** accordingly.
009528      */
009529      pPage = pCur->pPage;
009530      if( pPage->leaf || !pPage->intKey ){
009531        nEntry += pPage->nCell;
009532      }
009533  
009534      /* pPage is a leaf node. This loop navigates the cursor so that it 
009535      ** points to the first interior cell that it points to the parent of
009536      ** the next page in the tree that has not yet been visited. The
009537      ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
009538      ** of the page, or to the number of cells in the page if the next page
009539      ** to visit is the right-child of its parent.
009540      **
009541      ** If all pages in the tree have been visited, return SQLITE_OK to the
009542      ** caller.
009543      */
009544      if( pPage->leaf ){
009545        do {
009546          if( pCur->iPage==0 ){
009547            /* All pages of the b-tree have been visited. Return successfully. */
009548            *pnEntry = nEntry;
009549            return moveToRoot(pCur);
009550          }
009551          moveToParent(pCur);
009552        }while ( pCur->ix>=pCur->pPage->nCell );
009553  
009554        pCur->ix++;
009555        pPage = pCur->pPage;
009556      }
009557  
009558      /* Descend to the child node of the cell that the cursor currently 
009559      ** points at. This is the right-child if (iIdx==pPage->nCell).
009560      */
009561      iIdx = pCur->ix;
009562      if( iIdx==pPage->nCell ){
009563        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
009564      }else{
009565        rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
009566      }
009567    }
009568  
009569    /* An error has occurred. Return an error code. */
009570    return rc;
009571  }
009572  #endif
009573  
009574  /*
009575  ** Return the pager associated with a BTree.  This routine is used for
009576  ** testing and debugging only.
009577  */
009578  Pager *sqlite3BtreePager(Btree *p){
009579    return p->pBt->pPager;
009580  }
009581  
009582  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009583  /*
009584  ** Append a message to the error message string.
009585  */
009586  static void checkAppendMsg(
009587    IntegrityCk *pCheck,
009588    const char *zFormat,
009589    ...
009590  ){
009591    va_list ap;
009592    if( !pCheck->mxErr ) return;
009593    pCheck->mxErr--;
009594    pCheck->nErr++;
009595    va_start(ap, zFormat);
009596    if( pCheck->errMsg.nChar ){
009597      sqlite3_str_append(&pCheck->errMsg, "\n", 1);
009598    }
009599    if( pCheck->zPfx ){
009600      sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
009601    }
009602    sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
009603    va_end(ap);
009604    if( pCheck->errMsg.accError==SQLITE_NOMEM ){
009605      pCheck->mallocFailed = 1;
009606    }
009607  }
009608  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009609  
009610  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009611  
009612  /*
009613  ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
009614  ** corresponds to page iPg is already set.
009615  */
009616  static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
009617    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
009618    return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
009619  }
009620  
009621  /*
009622  ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
009623  */
009624  static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
009625    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
009626    pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
009627  }
009628  
009629  
009630  /*
009631  ** Add 1 to the reference count for page iPage.  If this is the second
009632  ** reference to the page, add an error message to pCheck->zErrMsg.
009633  ** Return 1 if there are 2 or more references to the page and 0 if
009634  ** if this is the first reference to the page.
009635  **
009636  ** Also check that the page number is in bounds.
009637  */
009638  static int checkRef(IntegrityCk *pCheck, Pgno iPage){
009639    if( iPage>pCheck->nPage || iPage==0 ){
009640      checkAppendMsg(pCheck, "invalid page number %d", iPage);
009641      return 1;
009642    }
009643    if( getPageReferenced(pCheck, iPage) ){
009644      checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
009645      return 1;
009646    }
009647    if( pCheck->db->u1.isInterrupted ) return 1;
009648    setPageReferenced(pCheck, iPage);
009649    return 0;
009650  }
009651  
009652  #ifndef SQLITE_OMIT_AUTOVACUUM
009653  /*
009654  ** Check that the entry in the pointer-map for page iChild maps to 
009655  ** page iParent, pointer type ptrType. If not, append an error message
009656  ** to pCheck.
009657  */
009658  static void checkPtrmap(
009659    IntegrityCk *pCheck,   /* Integrity check context */
009660    Pgno iChild,           /* Child page number */
009661    u8 eType,              /* Expected pointer map type */
009662    Pgno iParent           /* Expected pointer map parent page number */
009663  ){
009664    int rc;
009665    u8 ePtrmapType;
009666    Pgno iPtrmapParent;
009667  
009668    rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
009669    if( rc!=SQLITE_OK ){
009670      if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
009671      checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
009672      return;
009673    }
009674  
009675    if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
009676      checkAppendMsg(pCheck,
009677        "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
009678        iChild, eType, iParent, ePtrmapType, iPtrmapParent);
009679    }
009680  }
009681  #endif
009682  
009683  /*
009684  ** Check the integrity of the freelist or of an overflow page list.
009685  ** Verify that the number of pages on the list is N.
009686  */
009687  static void checkList(
009688    IntegrityCk *pCheck,  /* Integrity checking context */
009689    int isFreeList,       /* True for a freelist.  False for overflow page list */
009690    int iPage,            /* Page number for first page in the list */
009691    u32 N                 /* Expected number of pages in the list */
009692  ){
009693    int i;
009694    u32 expected = N;
009695    int nErrAtStart = pCheck->nErr;
009696    while( iPage!=0 && pCheck->mxErr ){
009697      DbPage *pOvflPage;
009698      unsigned char *pOvflData;
009699      if( checkRef(pCheck, iPage) ) break;
009700      N--;
009701      if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
009702        checkAppendMsg(pCheck, "failed to get page %d", iPage);
009703        break;
009704      }
009705      pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
009706      if( isFreeList ){
009707        u32 n = (u32)get4byte(&pOvflData[4]);
009708  #ifndef SQLITE_OMIT_AUTOVACUUM
009709        if( pCheck->pBt->autoVacuum ){
009710          checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
009711        }
009712  #endif
009713        if( n>pCheck->pBt->usableSize/4-2 ){
009714          checkAppendMsg(pCheck,
009715             "freelist leaf count too big on page %d", iPage);
009716          N--;
009717        }else{
009718          for(i=0; i<(int)n; i++){
009719            Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
009720  #ifndef SQLITE_OMIT_AUTOVACUUM
009721            if( pCheck->pBt->autoVacuum ){
009722              checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
009723            }
009724  #endif
009725            checkRef(pCheck, iFreePage);
009726          }
009727          N -= n;
009728        }
009729      }
009730  #ifndef SQLITE_OMIT_AUTOVACUUM
009731      else{
009732        /* If this database supports auto-vacuum and iPage is not the last
009733        ** page in this overflow list, check that the pointer-map entry for
009734        ** the following page matches iPage.
009735        */
009736        if( pCheck->pBt->autoVacuum && N>0 ){
009737          i = get4byte(pOvflData);
009738          checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
009739        }
009740      }
009741  #endif
009742      iPage = get4byte(pOvflData);
009743      sqlite3PagerUnref(pOvflPage);
009744    }
009745    if( N && nErrAtStart==pCheck->nErr ){
009746      checkAppendMsg(pCheck,
009747        "%s is %d but should be %d",
009748        isFreeList ? "size" : "overflow list length",
009749        expected-N, expected);
009750    }
009751  }
009752  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009753  
009754  /*
009755  ** An implementation of a min-heap.
009756  **
009757  ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
009758  ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
009759  ** and aHeap[N*2+1].
009760  **
009761  ** The heap property is this:  Every node is less than or equal to both
009762  ** of its daughter nodes.  A consequence of the heap property is that the
009763  ** root node aHeap[1] is always the minimum value currently in the heap.
009764  **
009765  ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
009766  ** the heap, preserving the heap property.  The btreeHeapPull() routine
009767  ** removes the root element from the heap (the minimum value in the heap)
009768  ** and then moves other nodes around as necessary to preserve the heap
009769  ** property.
009770  **
009771  ** This heap is used for cell overlap and coverage testing.  Each u32
009772  ** entry represents the span of a cell or freeblock on a btree page.  
009773  ** The upper 16 bits are the index of the first byte of a range and the
009774  ** lower 16 bits are the index of the last byte of that range.
009775  */
/*
** Add x to the min-heap aHeap[].  The element count in aHeap[0] is
** incremented and x is sifted up from the new last slot until its
** parent is no larger than x.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iSlot = aHeap[0] + 1;   /* Slot that x is currently destined for */
  u32 iParent;                /* Parent of iSlot */
  aHeap[0] = iSlot;
  for(iParent=iSlot/2; iParent>0 && aHeap[iParent]>x; iParent=iSlot/2){
    /* Parent is larger than x:  move it down and continue one level up */
    aHeap[iSlot] = aHeap[iParent];
    iSlot = iParent;
  }
  aHeap[iSlot] = x;
}
/*
** Remove the smallest element from the min-heap aHeap[] and store it
** in *pOut.  Return 1 on success, or 0 (leaving *pOut untouched) if the
** heap is empty.
**
** The vacated last slot is overwritten with 0xffffffff so that the
** sift-down loop may compare against aHeap[iKid+1] even when iKid is
** the final live element.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nOld = aHeap[0];     /* Number of elements before the pull */
  u32 iNode;               /* Node being sifted down */
  u32 iKid;                /* Smaller child of iNode */

  if( nOld==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nOld];
  aHeap[nOld] = 0xffffffff;
  aHeap[0]--;
  for(iNode=1; (iKid = iNode*2)<=aHeap[0]; iNode=iKid){
    u32 tmp;
    if( aHeap[iKid]>aHeap[iKid+1] ) iKid++;
    if( aHeap[iNode]<aHeap[iKid] ) break;
    tmp = aHeap[iKid];
    aHeap[iKid] = aHeap[iNode];
    aHeap[iNode] = tmp;
  }
  return 1;
}
009804  
009805  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** Do various sanity checks on a single page of a tree.  Return
** the height of the subtree rooted at that page:  leaf pages return 0,
** parents of leaf pages return 1, and so forth.  Zero is also returned
** if iPage is 0, if checkRef() rejects the page, or if the page cannot
** be fetched or initialized.
** 
** These checks are done:
**
**      1.  Make sure that cells and freeblocks do not overlap
**          but combine to completely cover the page.
**      2.  Make sure integer cell keys are in order.
**      3.  Check the integrity of overflow pages.
**      4.  Recursively call checkTreePage on all children.
**      5.  Verify that the depth of all children is the same.
**
** Cells are visited from the last cell to the first, and on interior
** pages the right-child is visited before any cell.  maxKey is tightened
** as integer keys are seen, and its final value is written to *piMinKey
** when the cell loop completes.  *piMinKey is not written on the early
** return or error paths.
**
** The pCheck->zPfx, v1 and v2 message-prefix fields are saved on entry
** and restored before returning.
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *piMinKey,        /* Write minimum integer primary key here */
  i64 maxKey            /* Error if integer primary key greater than this */
){
  MemPage *pPage = 0;      /* The page being analyzed */
  int i;                   /* Loop counter */
  int rc;                  /* Result code from subroutine call */
  int depth = -1, d2;      /* Depth of a subtree */
  int pgno;                /* Page number */
  int nFrag;               /* Number of fragmented bytes on the page */
  int hdr;                 /* Offset to the page header */
  int cellStart;           /* Offset to the start of the cell pointer array */
  int nCell;               /* Number of cells */
  int doCoverageCheck = 1; /* True if cell coverage checking should be done */
  int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
                           ** False if IPK must be strictly less than maxKey */
  u8 *data;                /* Page content */
  u8 *pCell;               /* Cell content */
  u8 *pCellIdx;            /* Next element of the cell pointer array */
  BtShared *pBt;           /* The BtShared object that owns pPage */
  u32 pc;                  /* Address of a cell */
  u32 usableSize;          /* Usable size of the page */
  u32 contentOffset;       /* Offset to the start of the cell content area */
  u32 *heap = 0;           /* Min-heap used for checking cell coverage */
  u32 x, prev = 0;         /* Next and previous entry on the min-heap */
  const char *saved_zPfx = pCheck->zPfx;  /* Prefix state to restore on exit */
  int saved_v1 = pCheck->v1;
  int saved_v2 = pCheck->v2;
  u8 savedIsInit = 0;      /* Value of pPage->isInit before it was cleared */

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  savedIsInit = pPage->isInit;
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    goto end_of_check;
  }
  if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );
    /* Note: the format string has no conversion, so the trailing rc
    ** argument is not used. */
    checkAppendMsg(pCheck, "free space corruption", rc);
    goto end_of_check;
  }
  data = pPage->aData;
  hdr = pPage->hdrOffset;

  /* Set up for cell analysis */
  pCheck->zPfx = "On tree page %d cell %d: ";
  contentOffset = get2byteNotZero(&data[hdr+5]);
  assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */

  /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
  ** number of cells on the page. */
  nCell = get2byte(&data[hdr+3]);
  assert( pPage->nCell==nCell );

  /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
  ** immediately follows the b-tree page header. */
  cellStart = hdr + 12 - 4*pPage->leaf;
  assert( pPage->aCellIdx==&data[cellStart] );
  /* Start at the last entry of the cell pointer array; the loop below
  ** walks it backwards. */
  pCellIdx = &data[cellStart + 2*(nCell-1)];

  if( !pPage->leaf ){
    /* Analyze the right-child page of internal pages */
    pgno = get4byte(&data[hdr+8]);
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      pCheck->zPfx = "On page %d at right child: ";
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    /* The right-child's height becomes the reference depth against
    ** which every left-child is compared below. */
    depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
    keyCanBeEqual = 0;
  }else{
    /* For leaf pages, the coverage check will occur in the same loop
    ** as the other cell checks, so initialize the heap.  */
    heap = pCheck->heap;
    heap[0] = 0;
  }

  /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
  ** integer offsets to the cell contents. */
  for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
    CellInfo info;

    /* Check cell size */
    pCheck->v2 = i;
    assert( pCellIdx==&data[cellStart + i*2] );
    pc = get2byteAligned(pCellIdx);
    pCellIdx -= 2;
    if( pc<contentOffset || pc>usableSize-4 ){
      checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
                             pc, contentOffset, usableSize-4);
      doCoverageCheck = 0;
      continue;
    }
    pCell = &data[pc];
    pPage->xParseCell(pPage, pCell, &info);
    if( pc+info.nSize>usableSize ){
      checkAppendMsg(pCheck, "Extends off end of page");
      doCoverageCheck = 0;
      continue;
    }

    /* Check for integer primary key out of range */
    if( pPage->intKey ){
      if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
        checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
      }
      maxKey = info.nKey;
      keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
    }

    /* Check the content overflow list */
    if( info.nPayload>info.nLocal ){
      u32 nPage;       /* Number of pages on the overflow chain */
      Pgno pgnoOvfl;   /* First page of the overflow chain */
      assert( pc + info.nSize - 4 <= usableSize );
      /* Each overflow page holds usableSize-4 bytes of payload; round up */
      nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
      /* The last 4 bytes of the cell hold the first overflow page number */
      pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    if( !pPage->leaf ){
      /* Check sanity of left child page for internal pages */
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
      keyCanBeEqual = 0;
      if( d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
        /* Adopt the new depth so that the same mismatch is not reported
        ** again for every remaining sibling. */
        depth = d2;
      }
    }else{
      /* Populate the coverage-checking heap for leaf pages */
      btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
    }
  }
  *piMinKey = maxKey;

  /* Check for complete coverage of the page
  */
  pCheck->zPfx = 0;
  if( doCoverageCheck && pCheck->mxErr>0 ){
    /* For leaf pages, the min-heap has already been initialized and the
    ** cells have already been inserted.  But for internal pages, that has
    ** not yet been done, so do it now */
    if( !pPage->leaf ){
      heap = pCheck->heap;
      heap[0] = 0;
      for(i=nCell-1; i>=0; i--){
        u32 size;
        pc = get2byteAligned(&data[cellStart+i*2]);
        size = pPage->xCellSize(pPage, &data[pc]);
        btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
      }
    }
    /* Add the freeblocks to the min-heap
    **
    ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
    ** is the offset of the first freeblock, or zero if there are no
    ** freeblocks on the page. 
    */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
      size = get2byte(&data[i+2]);
      assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
      btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
      /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
      ** big-endian integer which is the offset in the b-tree page of the next
      ** freeblock in the chain, or zero if the freeblock is the last on the
      ** chain. */
      j = get2byte(&data[i]);
      /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
      ** increasing offset. */
      assert( j==0 || j>i+size );     /* Enforced by btreeComputeFreeSpace() */
      assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
      i = j;
    }
    /* Analyze the min-heap looking for overlap between cells and/or 
    ** freeblocks, and counting the number of untracked bytes in nFrag.
    ** 
    ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
    ** There is an implied first entry that covers the page header, the cell
    ** pointer index, and the gap between the cell pointer index and the start
    ** of cell content.  
    **
    ** The loop below pulls entries from the min-heap in order and compares
    ** the start_address against the previous end_address.  If there is an
    ** overlap, that means bytes are used multiple times.  If there is a gap,
    ** that gap is added to the fragmentation count.
    */
    nFrag = 0;
    prev = contentOffset - 1;   /* Implied first min-heap entry */
    while( btreeHeapPull(heap,&x) ){
      if( (prev&0xffff)>=(x>>16) ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %u of page %d", x>>16, iPage);
        break;
      }else{
        nFrag += (x>>16) - (prev&0xffff) - 1;
        prev = x;
      }
    }
    /* Account for any gap between the last range and the end of the page */
    nFrag += usableSize - (prev&0xffff) - 1;
    /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
    ** is stored in the fifth field of the b-tree page header.
    ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
    ** number of fragmented free bytes within the cell content area.
    **
    ** The comparison is skipped if the loop above stopped early on an
    ** overlap (heap[0]!=0), since nFrag is incomplete in that case.
    */
    if( heap[0]==0 && nFrag!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          nFrag, data[hdr+7], iPage);
    }
  }

end_of_check:
  /* doCoverageCheck is cleared only inside the cell loop, so it is still
  ** 1 on every "goto end_of_check" path above and pPage is not
  ** dereferenced here when btreeGetPage() failed.  The saved isInit flag
  ** is put back only when a bad cell offset or size was seen. */
  if( !doCoverageCheck ) pPage->isInit = savedIsInit;
  releasePage(pPage);
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
010074  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
010075  
010076  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.  Entries of aRoot[]
** that are zero are skipped.
**
** A read-only or read-write transaction must be opened before calling
** this function.
**
** Write the number of errors seen in *pnErr.  Except for some memory
** allocation errors,  an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.  If a memory allocation error occurs, NULL is returned.
**
** The check proceeds in three stages:  the freelist is walked, then each
** tree listed in aRoot[] is checked recursively, and finally every page
** of the file is verified to have been referenced by one of the first
** two stages.  Error reporting stops once mxErr messages have been
** generated.
*/
char *sqlite3BtreeIntegrityCheck(
  sqlite3 *db,  /* Database connection that is running the check */
  Btree *p,     /* The btree to be checked */
  int *aRoot,   /* An array of root pages numbers for individual trees */
  int nRoot,    /* Number of entries in aRoot[] */
  int mxErr,    /* Stop reporting errors after this many */
  int *pnErr    /* Write number of errors seen to this variable */
){
  Pgno i;
  IntegrityCk sCheck;
  BtShared *pBt = p->pBt;
  u64 savedDbFlags = pBt->db->flags;  /* Restored after the tree checks */
  char zErr[100];                     /* Initial space for the error text */
  VVA_ONLY( int nRef );

  sqlite3BtreeEnter(p);
  assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
  assert( nRef>=0 );
  sCheck.db = db;
  sCheck.pBt = pBt;
  sCheck.pPager = pBt->pPager;
  sCheck.nPage = btreePagecount(sCheck.pBt);
  sCheck.mxErr = mxErr;
  sCheck.nErr = 0;
  sCheck.mallocFailed = 0;
  sCheck.zPfx = 0;
  sCheck.v1 = 0;
  sCheck.v2 = 0;
  sCheck.aPgRef = 0;
  sCheck.heap = 0;
  sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
  /* An empty database has nothing to check */
  if( sCheck.nPage==0 ){
    goto integrity_ck_cleanup;
  }

  /* One bit per page, for page numbers 0 through nPage inclusive */
  sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  if( !sCheck.aPgRef ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }
  /* Page-sized scratch buffer used as the min-heap by checkTreePage() */
  sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
  if( sCheck.heap==0 ){
    sCheck.mallocFailed = 1;
    goto integrity_ck_cleanup;
  }

  /* Mark the pending-byte page as referenced up front so that it is not
  ** reported as unused by the final pass. */
  i = PENDING_BYTE_PAGE(pBt);
  if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);

  /* Check the integrity of the freelist.  The first trunk page number and
  ** the total freelist page count are read from offsets 32 and 36 of
  ** page 1.
  */
  sCheck.zPfx = "Main freelist: ";
  checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
            get4byte(&pBt->pPage1->aData[36]));
  sCheck.zPfx = 0;

  /* Check all the tables.
  */
#ifndef SQLITE_OMIT_AUTOVACUUM
  if( pBt->autoVacuum ){
    /* The largest root page in aRoot[] must match the value stored at
    ** offset 52 of page 1. */
    int mx = 0;
    int mxInHdr;
    for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
    mxInHdr = get4byte(&pBt->pPage1->aData[52]);
    if( mx!=mxInHdr ){
      checkAppendMsg(&sCheck,
        "max rootpage (%d) disagrees with header (%d)",
        mx, mxInHdr
      );
    }
  }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
    checkAppendMsg(&sCheck,
      "incremental_vacuum enabled with a max rootpage of zero"
    );
  }
#endif
  /* The SQLITE_CellSizeCk flag is turned off for the duration of the tree
  ** checks and the original flags are restored immediately afterwards.
  ** NOTE(review): presumably so that checkTreePage() reports cell-size
  ** problems itself instead of btreeInitPage() failing first - confirm. */
  testcase( pBt->db->flags & SQLITE_CellSizeCk );
  pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
  for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
    i64 notUsed;
    if( aRoot[i]==0 ) continue;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum && aRoot[i]>1 ){
      checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
    }
#endif
    checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
  }
  pBt->db->flags = savedDbFlags;

  /* Make sure every page in the file is referenced
  */
  for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
#ifdef SQLITE_OMIT_AUTOVACUUM
    if( getPageReferenced(&sCheck, i)==0 ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
#else
    /* If the database supports auto-vacuum, make sure no tables contain
    ** references to pointer-map pages.  Pointer-map pages themselves are
    ** expected to be unreferenced, so they are exempt from the
    ** "never used" report.
    */
    if( getPageReferenced(&sCheck, i)==0 && 
       (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Page %d is never used", i);
    }
    if( getPageReferenced(&sCheck, i)!=0 && 
       (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
      checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
    }
#endif
  }

  /* Clean  up and report errors.
  */
integrity_ck_cleanup:
  sqlite3PageFree(sCheck.heap);
  sqlite3_free(sCheck.aPgRef);
  if( sCheck.mallocFailed ){
    /* After an allocation failure the accumulated text is discarded, but
    ** the error count is bumped so that *pnErr is non-zero. */
    sqlite3_str_reset(&sCheck.errMsg);
    sCheck.nErr++;
  }
  *pnErr = sCheck.nErr;
  if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg);
  /* Make sure this analysis did not leave any unref() pages. */
  assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
  sqlite3BtreeLeave(p);
  return sqlite3StrAccumFinish(&sCheck.errMsg);
}
010220  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
010221  
010222  /*
010223  ** Return the full pathname of the underlying database file.  Return
010224  ** an empty string if the database is in-memory or a TEMP database.
010225  **
010226  ** The pager filename is invariant as long as the pager is
010227  ** open so it is safe to access without the BtShared mutex.
010228  */
010229  const char *sqlite3BtreeGetFilename(Btree *p){
010230    assert( p->pBt->pPager!=0 );
010231    return sqlite3PagerFilename(p->pBt->pPager, 1);
010232  }
010233  
010234  /*
010235  ** Return the pathname of the journal file for this database. The return
010236  ** value of this routine is the same regardless of whether the journal file
010237  ** has been created or not.
010238  **
010239  ** The pager journal filename is invariant as long as the pager is
010240  ** open so it is safe to access without the BtShared mutex.
010241  */
010242  const char *sqlite3BtreeGetJournalname(Btree *p){
010243    assert( p->pBt->pPager!=0 );
010244    return sqlite3PagerJournalname(p->pBt->pPager);
010245  }
010246  
010247  /*
010248  ** Return non-zero if a transaction is active.
010249  */
010250  int sqlite3BtreeIsInTrans(Btree *p){
010251    assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
010252    return (p && (p->inTrans==TRANS_WRITE));
010253  }
010254  
010255  #ifndef SQLITE_OMIT_WAL
010256  /*
010257  ** Run a checkpoint on the Btree passed as the first argument.
010258  **
010259  ** Return SQLITE_LOCKED if this or any other connection has an open 
010260  ** transaction on the shared-cache the argument Btree is connected to.
010261  **
010262  ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
010263  */
010264  int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
010265    int rc = SQLITE_OK;
010266    if( p ){
010267      BtShared *pBt = p->pBt;
010268      sqlite3BtreeEnter(p);
010269      if( pBt->inTransaction!=TRANS_NONE ){
010270        rc = SQLITE_LOCKED;
010271      }else{
010272        rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
010273      }
010274      sqlite3BtreeLeave(p);
010275    }
010276    return rc;
010277  }
010278  #endif
010279  
010280  /*
010281  ** Return non-zero if a read (or write) transaction is active.
010282  */
010283  int sqlite3BtreeIsInReadTrans(Btree *p){
010284    assert( p );
010285    assert( sqlite3_mutex_held(p->db->mutex) );
010286    return p->inTrans!=TRANS_NONE;
010287  }
010288  
010289  int sqlite3BtreeIsInBackup(Btree *p){
010290    assert( p );
010291    assert( sqlite3_mutex_held(p->db->mutex) );
010292    return p->nBackup!=0;
010293  }
010294  
010295  /*
010296  ** This function returns a pointer to a blob of memory associated with
010297  ** a single shared-btree. The memory is used by client code for its own
010298  ** purposes (for example, to store a high-level schema associated with 
010299  ** the shared-btree). The btree layer manages reference counting issues.
010300  **
010301  ** The first time this is called on a shared-btree, nBytes bytes of memory
010302  ** are allocated, zeroed, and returned to the caller. For each subsequent 
010303  ** call the nBytes parameter is ignored and a pointer to the same blob
010304  ** of memory returned. 
010305  **
010306  ** If the nBytes parameter is 0 and the blob of memory has not yet been
010307  ** allocated, a null pointer is returned. If the blob has already been
010308  ** allocated, it is returned as normal.
010309  **
010310  ** Just before the shared-btree is closed, the function passed as the 
010311  ** xFree argument when the memory allocation was made is invoked on the 
010312  ** blob of allocated memory. The xFree function should not call sqlite3_free()
010313  ** on the memory, the btree layer does that.
010314  */
010315  void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
010316    BtShared *pBt = p->pBt;
010317    sqlite3BtreeEnter(p);
010318    if( !pBt->pSchema && nBytes ){
010319      pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
010320      pBt->xFreeSchema = xFree;
010321    }
010322    sqlite3BtreeLeave(p);
010323    return pBt->pSchema;
010324  }
010325  
010326  /*
010327  ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 
010328  ** btree as the argument handle holds an exclusive lock on the 
010329  ** sqlite_master table. Otherwise SQLITE_OK.
010330  */
010331  int sqlite3BtreeSchemaLocked(Btree *p){
010332    int rc;
010333    assert( sqlite3_mutex_held(p->db->mutex) );
010334    sqlite3BtreeEnter(p);
010335    rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
010336    assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
010337    sqlite3BtreeLeave(p);
010338    return rc;
010339  }
010340  
010341  
010342  #ifndef SQLITE_OMIT_SHARED_CACHE
010343  /*
010344  ** Lock the table whose root page is iTab on behalf of handle p: a write
010345  ** lock if isWriteLock is 1, a read lock if it is 0.  When p is not
010346  ** sharable nothing is locked and SQLITE_OK is returned.
010347  */
010348  int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
010349    int rc;                         /* Return code */
010350    u8 eLock;                       /* READ_LOCK or WRITE_LOCK */
010351    assert( p->inTrans!=TRANS_NONE );
010352    if( !p->sharable ) return SQLITE_OK;
010353  
010354    /* The addition below relies on WRITE_LOCK being READ_LOCK+1. */
010355    assert( READ_LOCK+1==WRITE_LOCK );
010356    assert( isWriteLock==0 || isWriteLock==1 );
010357    eLock = READ_LOCK + isWriteLock;
010358  
010359    sqlite3BtreeEnter(p);
010360    rc = querySharedCacheTableLock(p, iTab, eLock);
010361    if( rc==SQLITE_OK ) rc = setSharedCacheTableLock(p, iTab, eLock);
010362    sqlite3BtreeLeave(p);
010363    return rc;
010364  }
010365  #endif
010366  
010367  #ifndef SQLITE_OMIT_INCRBLOB
010368  /*
010369  ** Argument pCsr must be a cursor opened for writing on an 
010370  ** INTKEY table currently pointing at a valid table entry. 
010371  ** This function modifies the data stored as part of that entry.
010372  **
010373  ** Only the data content may only be modified, it is not possible to 
010374  ** change the length of the data stored. If this function is called with
010375  ** parameters that attempt to write past the end of the existing data,
010376  ** no modifications are made and SQLITE_CORRUPT is returned.
010377  */
010378  int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
010379    int rc;
010380    assert( cursorOwnsBtShared(pCsr) );
010381    assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
010382    assert( pCsr->curFlags & BTCF_Incrblob );
010383  
010384    rc = restoreCursorPosition(pCsr);
010385    if( rc!=SQLITE_OK ){
010386      return rc;
010387    }
010388    assert( pCsr->eState!=CURSOR_REQUIRESEEK );
010389    if( pCsr->eState!=CURSOR_VALID ){
010390      return SQLITE_ABORT;
010391    }
010392  
010393    /* Save the positions of all other cursors open on this table. This is
010394    ** required in case any of them are holding references to an xFetch
010395    ** version of the b-tree page modified by the accessPayload call below.
010396    **
010397    ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
010398    ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
010399    ** saveAllCursors can only return SQLITE_OK.
010400    */
010401    VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
010402    assert( rc==SQLITE_OK );
010403  
010404    /* Check some assumptions: 
010405    **   (a) the cursor is open for writing,
010406    **   (b) there is a read/write transaction open,
010407    **   (c) the connection holds a write-lock on the table (if required),
010408    **   (d) there are no conflicting read-locks, and
010409    **   (e) the cursor points at a valid row of an intKey table.
010410    */
010411    if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
010412      return SQLITE_READONLY;
010413    }
010414    assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
010415                && pCsr->pBt->inTransaction==TRANS_WRITE );
010416    assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
010417    assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
010418    assert( pCsr->pPage->intKey );
010419  
010420    return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
010421  }
010422  
010423  /*
010424  ** Tag pCur as an incremental blob cursor and note that on its Btree.
010425  */
010426  void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
010427    pCur->pBtree->hasIncrblobCur = 1;
010428    pCur->curFlags |= BTCF_Incrblob;
010429  }
010430  #endif
010431  
010432  /*
010433  ** Set both file-format version bytes in the database header - the
010434  ** "write version" at byte offset 18 and the "read version" at byte
010435  ** offset 19 - to iVersion, which must be 1 or 2.
010436  */
010437  int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
010438    BtShared *pShared = pBtree->pBt;
010439    const u8 version = (u8)iVersion;  /* Byte value stored in the header */
010440    int rc;                           /* Return code */
010441    assert( iVersion==1 || iVersion==2 );
010442  
010443    /* If setting the version fields to 1, do not automatically open the
010444    ** WAL connection, even if the version fields are currently set to 2. */
010445    if( iVersion==1 ){
010446      pShared->btsFlags |= BTS_NO_WAL;
010447    }else{
010448      pShared->btsFlags &= ~BTS_NO_WAL;
010449    }
010450  
010451    rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
010452    if( rc==SQLITE_OK ){
010453      u8 *aHdr = pShared->pPage1->aData;
010454      if( aHdr[18]!=version || aHdr[19]!=version ){
010455        rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
010456        if( rc==SQLITE_OK ) rc = sqlite3PagerWrite(pShared->pPage1->pDbPage);
010457        if( rc==SQLITE_OK ){
010458          aHdr[18] = version;
010459          aHdr[19] = version;
010460        }
010461      }
010462    }
010463  
010464    pShared->btsFlags &= ~BTS_NO_WAL;
010465    return rc;
010466  }
010467  
010468  /*
010469  ** Return 1 if cursor pCsr has any of the hint bits in mask set, else 0.
010470  ** This routine is only used from within assert() statements.
010471  */
010472  int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
010473    return (mask & pCsr->hints) ? 1 : 0;
010474  }
010475  
010476  /*
010477  ** Return 1 if the BTS_READ_ONLY flag is set for Btree p, otherwise 0.
010478  */
010479  int sqlite3BtreeIsReadonly(Btree *p){
010480    return (p->pBt->btsFlags & BTS_READ_ONLY) ? 1 : 0;
010481  }
010482  
010483  /* Size of the header added to each page by this module. */
010484  int sqlite3HeaderSizeBtree(void){
010485    return ROUND8(sizeof(MemPage));
010486  }
010487  
010488  #if !defined(SQLITE_OMIT_SHARED_CACHE)
010489  /* Return true if the Btree handle p, passed as the only argument,
010490  ** is sharable. */
010491  int sqlite3BtreeSharable(Btree *p){
010492    int isSharable = p->sharable;
010493    return isSharable;
010494  }
010495  
010496  /*
010497  ** Return the number of connections to the BtShared object used by Btree
010498  ** p (its nRef).  Always 1 for a private cache; 1 or more when shared.
010499  */
010500  int sqlite3BtreeConnectionCount(Btree *p){
010501    int nConnection = p->pBt->nRef;
010502    testcase( p->sharable );
010503    return nConnection;
010504  }
010505  #endif