2017-06-10 05:56:41 +08:00
/*
* VersionedBTree . actor . cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013 - 2018 Apple Inc . and the FoundationDB project authors
*
* Licensed under the Apache License , Version 2.0 ( the " License " ) ;
* you may not use this file except in compliance with the License .
* You may obtain a copy of the License at
*
* http : //www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS ,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
* See the License for the specific language governing permissions and
* limitations under the License .
*/
# include "flow/flow.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IVersionedStore.h"
# include "fdbserver/IPager.h"
2017-06-10 05:56:41 +08:00
# include "fdbclient/Tuple.h"
# include "flow/serialize.h"
# include "flow/genericactors.actor.h"
# include "flow/UnitTest.h"
2019-08-07 17:36:33 +08:00
# include "fdbserver/IPager.h"
# include "fdbrpc/IAsyncFile.h"
2019-08-07 19:31:11 +08:00
# include "fdbrpc/crc32c.h"
2019-08-07 17:36:33 +08:00
# include "flow/ActorCollection.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/MemoryPager.h"
# include "fdbserver/IndirectShadowPager.h"
2017-06-10 05:56:41 +08:00
# include <map>
# include <vector>
2017-08-04 15:01:25 +08:00
# include "fdbclient/CommitTransaction.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IKeyValueStore.h"
2019-02-21 18:46:30 +08:00
# include "fdbserver/DeltaTree.h"
2018-07-23 18:09:13 +08:00
# include <string.h>
2018-10-19 11:26:45 +08:00
# include "flow/actorcompiler.h"
2019-05-05 01:52:02 +08:00
# include <cinttypes>
2019-08-07 17:36:33 +08:00
# include <boost/intrusive/list.hpp>
2019-09-28 06:08:05 +08:00
// Some convenience functions for debugging to stringify various structures
2019-10-01 17:06:00 +08:00
// Classes can add compatibility by either specializing toString<T> or implementing
// std::string toString() const;
2019-09-28 06:08:05 +08:00
// Fallback stringifier: forwards to the object's own `std::string toString() const` member.
template <typename T>
std::string toString(const T& o) {
	std::string text = o.toString();
	return text;
}
2019-10-24 00:31:06 +08:00
std : : string toString ( StringRef s ) {
return s . printable ( ) ;
}
2019-09-28 06:08:05 +08:00
std : : string toString ( LogicalPageID id ) {
2019-10-15 18:10:50 +08:00
if ( id = = invalidLogicalPageID ) {
return " LogicalPageID{invalid} " ;
}
return format ( " LogicalPageID{% " PRId64 " } " , id ) ;
2019-09-28 06:08:05 +08:00
}
template < typename T >
std : : string toString ( const Standalone < T > & s ) {
return toString ( ( T ) s ) ;
}
// Stringify the half-open pointer range [begin, end): each element is rendered with
// toString() and the results are joined with a separator inside braces.
template <typename T>
std::string toString(const T* begin, const T* end) {
	std::string out = " { ";
	for (const T* it = begin; it != end; ++it) {
		// Separator goes before every element except the first
		if (it != begin) {
			out += " , ";
		}
		out += toString(*it);
	}
	out += " } ";
	return out;
}
// Stringify a std::vector<T> by delegating to the pointer-range overload.
template <typename T>
std::string toString(const std::vector<T>& v) {
	// The range overload deduces T from `const T*` arguments.  vector iterators are class
	// types on mainstream standard libraries, so passing begin()/end() fails deduction;
	// use pointers into the contiguous storage instead.
	const T* first = v.data();
	return toString(first, first + v.size());
}
template < typename T >
std : : string toString ( const VectorRef < T > & v ) {
return toString ( v . begin ( ) , v . end ( ) ) ;
}
2019-10-15 18:10:50 +08:00
template < typename T >
std : : string toString ( const Optional < T > & o ) {
if ( o . present ( ) ) {
return toString ( o . get ( ) ) ;
}
return " <not present> " ;
}
2019-08-07 17:36:33 +08:00
// A FIFO queue of T stored as a linked list of pages.
2019-10-15 18:10:50 +08:00
// Main operations are pop(), pushBack(), pushFront(), and flush().
//
// flush() will ensure all queue pages are written to the pager and move the unflushed
// pushFront()'d records onto the front of the queue, in FIFO order.
//
2019-10-01 17:06:00 +08:00
// pop() will only return records that have been flushed, and pops
// from the front of the queue.
2019-09-02 14:03:31 +08:00
//
2019-10-15 18:10:50 +08:00
// Each page contains some number of T items and a link to the next page and starting position on that page.
2019-09-02 14:03:31 +08:00
// When the queue is flushed, the last page in the chain is ended and linked to a newly allocated
// but not-yet-written-to pageID, which future writes after the flush will write to.
// Items pushed onto the front of the queue are written to a separate linked list until flushed,
// at which point that list becomes the new front of the queue.
//
2019-10-15 18:10:50 +08:00
// The write pattern is designed such that no page is ever expected to be valid after
// being written to or updated but not fsync'd. This is why a new unused page is added
// to the queue, linked to by the last data page, before commit. The new page can't be
// added and filled with data as part of the next commit because that would mean modifying
// the previous tail page to update its next link, which risks corrupting it and losing
// data that was not yet popped if that write is never fsync'd.
2019-10-01 17:06:00 +08:00
//
// Requirements on T
// - must be trivially copyable
// OR have a specialization for FIFOQueueCodec<T>
// OR have the following methods
// // Deserialize from src into *this, return number of bytes from src consumed
// int readFromBytes(const uint8_t *src);
// // Return the size of *this serialized
// int bytesNeeded() const;
// // Serialize *this to dst, return number of bytes written to dst
// int writeToBytes(uint8_t *dst) const;
2019-10-15 18:10:50 +08:00
// - must be supported by toString(object) (see above)
2019-10-01 17:06:00 +08:00
// Default codec: delegates (de)serialization to T's own readFromBytes() / bytesNeeded() /
// writeToBytes() methods.
template <typename T, typename Enable = void>
struct FIFOQueueCodec {
	// Deserialize one T from src.  bytesRead receives the number of bytes consumed.
	static T readFromBytes(const uint8_t* src, int& bytesRead) {
		T x;
		bytesRead = x.readFromBytes(src);
		return x;
	}
	// Number of bytes x occupies when serialized.
	static int bytesNeeded(const T& x) {
		return x.bytesNeeded();
	}
	// Serialize x to dst and return the number of bytes written.
	static int writeToBytes(uint8_t* dst, const T& x) {
		return x.writeToBytes(dst);
	}
};

// Codec for trivially copyable types: the serialized form is the object's raw bytes.
template <typename T>
struct FIFOQueueCodec<T, typename std::enable_if<std::is_trivially_copyable<T>::value>::type> {
	static_assert(std::is_trivially_copyable<T>::value);

	// Deserialize one T from src.  bytesRead is always set to sizeof(T).
	static T readFromBytes(const uint8_t* src, int& bytesRead) {
		bytesRead = sizeof(T);
		// Items are packed back to back, so src can be at any byte offset.  Copy the bytes
		// rather than dereferencing a possibly misaligned T*.
		T x;
		memcpy(&x, src, sizeof(T));
		return x;
	}
	// A trivially copyable T always serializes to sizeof(T) bytes.
	static int bytesNeeded(const T& x) {
		return sizeof(T);
	}
	// Serialize x to dst (no alignment requirement on dst) and return sizeof(T).
	static int writeToBytes(uint8_t* dst, const T& x) {
		memcpy(dst, &x, sizeof(T));
		return sizeof(T);
	}
};
2019-08-07 17:36:33 +08:00
2019-10-01 17:06:00 +08:00
template < typename T , typename Codec = FIFOQueueCodec < T > >
class FIFOQueue {
2019-08-07 17:36:33 +08:00
public :
# pragma pack(push, 1)
struct QueueState {
2019-10-15 18:10:50 +08:00
bool operator = = ( const QueueState & rhs ) const {
return memcmp ( this , & rhs , sizeof ( QueueState ) ) = = 0 ;
}
2019-08-07 17:36:33 +08:00
LogicalPageID headPageID = invalidLogicalPageID ;
LogicalPageID tailPageID = invalidLogicalPageID ;
2019-10-01 17:06:00 +08:00
uint16_t headOffset ;
2019-08-14 13:41:41 +08:00
// Note that there is no tail index because the tail page is always never-before-written and its index will start at 0
2019-08-07 17:36:33 +08:00
int64_t numPages ;
int64_t numEntries ;
std : : string toString ( ) const {
2019-10-15 18:10:50 +08:00
return format ( " {head: %s:%d tail: %s numPages: % " PRId64 " numEntries: % " PRId64 " } " , : : toString ( headPageID ) . c_str ( ) , ( int ) headOffset , : : toString ( tailPageID ) . c_str ( ) , numPages , numEntries ) ;
2019-08-07 17:36:33 +08:00
}
} ;
# pragma pack(pop)
struct Cursor {
2019-10-15 18:10:50 +08:00
// Operating mode of a cursor, set by init().
enum Mode {
// Uninitialized cursor; readNext() returns an empty result in this mode
NONE ,
2019-11-04 19:04:03 +08:00
// Reads consume items: the queue's entry and page counts are decremented and exhausted pages are freed
POP ,
// Reads advance only this cursor; the queue's counts and pages are left alone
READONLY ,
2019-10-15 18:10:50 +08:00
// Items are appended to page buffers which are then written to the pager
WRITE
} ;
2019-08-07 17:36:33 +08:00
2019-10-15 18:10:50 +08:00
// The current page being read or written to
LogicalPageID pageID ;
2019-08-14 13:41:41 +08:00
2019-10-15 18:10:50 +08:00
// The first page ID to be written to the pager, if this cursor has written anything
LogicalPageID firstPageIDWritten ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// Offset after RawPage header to next read from or write to
int offset ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// A read cursor will not read this page (or beyond)
LogicalPageID endPageID ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// Buffer for the page at pageID; empty until it is loaded (read modes) or allocated (write mode)
Reference < IPage > page ;
// The queue this cursor belongs to; supplies the pager, the debug name and the counters
FIFOQueue * queue ;
// The most recently started async operation; each new read or write waits for it first
Future < Void > operation ;
// Current operating mode, see Mode
Mode mode ;
2019-08-07 17:36:33 +08:00
2019-10-15 18:10:50 +08:00
Cursor ( ) : mode ( NONE ) {
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
// Initialize a cursor.
2019-10-15 18:10:50 +08:00
// (Re)initialize this cursor.
//   q             - queue the cursor operates on
//   m             - operating mode
//   initialPageID - POP/READONLY: page to start reading from; WRITE: page to start writing to (optional)
//   readOffset    - starting offset within the initial page
//   endPage       - POP/READONLY: page at which reading stops
// Any operation still pending on the cursor is cancelled first.
void init ( FIFOQueue * q = nullptr , Mode m = NONE , LogicalPageID initialPageID = invalidLogicalPageID , int readOffset = 0 , LogicalPageID endPage = invalidLogicalPageID ) {
if ( operation . isValid ( ) ) {
operation . cancel ( ) ;
}
2019-08-07 17:36:33 +08:00
queue = q ;
2019-10-15 18:10:50 +08:00
mode = m ;
firstPageIDWritten = invalidLogicalPageID ;
offset = readOffset ;
endPageID = endPage ;
page . clear ( ) ;
2019-11-04 19:04:03 +08:00
if ( mode = = POP | | mode = = READONLY ) {
2019-10-15 18:10:50 +08:00
// If cursor is not pointed at the end page then start loading it.
// The end page will not have been written to disk yet.
pageID = initialPageID ;
operation = ( pageID = = endPageID ) ? Void ( ) : loadPage ( ) ;
}
else {
// WRITE and NONE start with no current page; a NONE cursor must be given only default arguments
pageID = invalidLogicalPageID ;
ASSERT ( mode = = WRITE | | ( initialPageID = = invalidLogicalPageID & & readOffset = = 0 & & endPage = = invalidLogicalPageID ) ) ;
operation = Void ( ) ;
}
2019-08-14 13:41:41 +08:00
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) initialized \n " , toString ( ) . c_str ( ) ) ;
2019-08-14 13:41:41 +08:00
2019-10-15 18:10:50 +08:00
// A write cursor given a page ID starts with a freshly initialized buffer for that page
if ( mode = = WRITE & & initialPageID ! = invalidLogicalPageID ) {
2019-10-18 12:34:17 +08:00
addNewPage ( initialPageID , 0 , true ) ;
2019-10-15 18:10:50 +08:00
}
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
// Since cursors can have async operations pending which modify their state they can't be copied cleanly
2019-10-15 18:10:50 +08:00
// NOTE(review): only copy construction is deleted here; copy assignment is not. Confirm whether that is intentional.
Cursor ( const Cursor & other ) = delete ;
2019-08-07 17:36:33 +08:00
2019-11-04 19:04:03 +08:00
// A read cursor can be initialized from a pop cursor
// Make this cursor a READONLY cursor positioned where reader cursor c currently is
// (same queue, page, offset and end page).
void initReadOnly(const Cursor& c) {
	const bool sourceIsReader = (c.mode == POP) || (c.mode == READONLY);
	ASSERT(sourceIsReader);
	init(c.queue, READONLY, c.pageID, c.offset, c.endPageID);
}
2019-10-15 18:10:50 +08:00
// Cancel whatever operation is still pending on this cursor.
// NOTE(review): init() checks operation.isValid() before cancelling but this does not; presumably cancel() tolerates a default-constructed Future. Confirm.
~ Cursor ( ) {
operation . cancel ( ) ;
2019-08-14 13:41:41 +08:00
}
2019-10-15 18:10:50 +08:00
std : : string toString ( ) const {
2019-10-18 12:34:17 +08:00
if ( mode = = WRITE ) {
return format ( " {WriteCursor %s:%p pos=%s:%d endOffset=%d} " , queue - > name . c_str ( ) , this , : : toString ( pageID ) . c_str ( ) , offset , page ? raw ( ) - > endOffset : - 1 ) ;
}
2019-11-04 19:04:03 +08:00
if ( mode = = POP | | mode = = READONLY ) {
2019-10-18 12:34:17 +08:00
return format ( " {ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s} " , queue - > name . c_str ( ) , this , : : toString ( pageID ) . c_str ( ) , offset , page ? raw ( ) - > endOffset : - 1 , : : toString ( endPageID ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
}
2019-10-18 12:34:17 +08:00
ASSERT ( mode = = NONE ) ;
return format ( " {NullCursor=%p} " , this ) ;
2019-09-02 14:03:31 +08:00
}
2019-08-07 17:36:33 +08:00
# pragma pack(push, 1)
// Header found at the start of every queue page; item bytes follow immediately after it.
// Byte-packed by the surrounding pragma.
struct RawPage {
2019-09-05 15:47:57 +08:00
// Version written into new pages and verified when a page is loaded
static constexpr int FORMAT_VERSION = 1 ;
uint16_t formatVersion ;
2019-09-02 14:03:31 +08:00
// Page that continues the queue after this one
LogicalPageID nextPageID ;
2019-10-01 17:06:00 +08:00
// Offset on nextPageID at which reading continues
uint16_t nextOffset ;
// Offset just past the last item written to this page
uint16_t endOffset ;
// Pointer to the first item byte, directly after this header
uint8_t * begin ( ) {
return ( uint8_t * ) ( this + 1 ) ;
2019-08-07 17:36:33 +08:00
}
} ;
# pragma pack(pop)
2019-10-15 18:10:50 +08:00
// Returns the most recently started operation on this cursor; it is ready once the cursor is idle.
Future < Void > notBusy ( ) {
return operation ;
}
// Returns true if any items have been written to the last page
// True only for a write cursor whose current page has had at least one item written to it.
bool pendingWrites() const {
	if (mode != WRITE) {
		return false;
	}
	return offset != 0;
}
2019-08-14 13:41:41 +08:00
// View the start of the current page buffer as its RawPage header.  Requires a page buffer.
RawPage* raw() const {
	RawPage* header = (RawPage*)(page->begin());
	return header;
}
2019-10-01 17:06:00 +08:00
// Record, in the current page's header, the page and offset where the queue continues.
// Write cursors only.
void setNext(LogicalPageID pageID, int offset) {
	ASSERT(mode == WRITE);
	RawPage* header = raw();
	header->nextOffset = offset;
	header->nextPageID = pageID;
}
2019-08-14 13:41:41 +08:00
// Start reading the page at pageID from the pager.  When the read completes, the buffer is
// installed as the current page and its format version is verified.  Reader cursors only.
Future<Void> loadPage() {
	// Logical OR, as in the other mode checks of this struct (the original used bitwise `|`)
	ASSERT(mode == POP || mode == READONLY);
	debug_printf(" FIFOQueue::Cursor(%s) loadPage \n ", toString().c_str());
	return map(queue->pager->readPage(pageID, true), [=](Reference<IPage> p) {
		page = p;
		ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION);
		debug_printf(" FIFOQueue::Cursor(%s) loadPage done \n ", toString().c_str());
		return Void();
	});
}
// Hand the current page buffer to the pager to be written at pageID, and remember the
// first page ID this cursor ever wrote.  Write cursors only.
void writePage ( ) {
2019-10-15 18:10:50 +08:00
ASSERT ( mode = = WRITE ) ;
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) writePage \n " , toString ( ) . c_str ( ) ) ;
2019-10-02 21:43:11 +08:00
// Mark the written items and the remainder of the data area as defined for valgrind
// NOTE(review): the second range starts at `offset` but is sized from `endOffset`; this assumes the two are equal here. Confirm.
VALGRIND_MAKE_MEM_DEFINED ( raw ( ) - > begin ( ) , offset ) ;
VALGRIND_MAKE_MEM_DEFINED ( raw ( ) - > begin ( ) + offset , queue - > dataBytesPerPage - raw ( ) - > endOffset ) ;
2019-08-14 13:41:41 +08:00
queue - > pager - > updatePage ( pageID , page ) ;
2019-10-15 18:10:50 +08:00
if ( firstPageIDWritten = = invalidLogicalPageID ) {
firstPageIDWritten = pageID ;
}
2019-08-07 17:36:33 +08:00
}
2019-10-18 12:34:17 +08:00
// Link the current page to newPageID:newOffset and then write it to the pager.
// If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized
// as a new tail page.
// Move this write cursor to newPageID:newOffset.
//   newPageID         - page to move to; must not be invalidLogicalPageID
//   newOffset         - offset on the new page; must be 0 when initializeNewPage is true
//   initializeNewPage - allocate and initialize a buffer for the new page, otherwise hold no buffer
void addNewPage ( LogicalPageID newPageID , int newOffset , bool initializeNewPage ) {
ASSERT ( mode = = WRITE ) ;
ASSERT ( newPageID ! = invalidLogicalPageID ) ;
debug_printf ( " FIFOQueue::Cursor(%s) Adding page %s init=%d \n " , toString ( ) . c_str ( ) , : : toString ( newPageID ) . c_str ( ) , initializeNewPage ) ;
2019-08-07 17:36:33 +08:00
2019-10-15 18:10:50 +08:00
// Update existing page and write, if it exists
2019-10-18 12:34:17 +08:00
if ( page ) {
setNext ( newPageID , newOffset ) ;
debug_printf ( " FIFOQueue::Cursor(%s) Linked new page \n " , toString ( ) . c_str ( ) ) ;
writePage ( ) ;
2019-10-01 17:06:00 +08:00
}
2019-10-15 18:10:50 +08:00
2019-10-18 12:34:17 +08:00
pageID = newPageID ;
offset = newOffset ;
2019-10-15 18:10:50 +08:00
if ( initializeNewPage ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) Initializing new page \n " , toString ( ) . c_str ( ) ) ;
// Fresh buffer: no next link yet, current format version, no items
page = queue - > pager - > newPageBuffer ( ) ;
setNext ( 0 , 0 ) ;
auto p = raw ( ) ;
2019-10-15 18:10:50 +08:00
p - > formatVersion = RawPage : : FORMAT_VERSION ;
2019-10-18 12:34:17 +08:00
ASSERT ( newOffset = = 0 ) ;
2019-10-15 18:10:50 +08:00
p - > endOffset = 0 ;
}
2019-10-18 12:34:17 +08:00
else {
// The cursor now points at the new page but holds no buffer for it
page . clear ( ) ;
}
2019-09-02 14:03:31 +08:00
}
2019-10-15 18:10:50 +08:00
// Write item to the next position in the current page or, if it won't fit, add a new page and write it there.
2019-10-18 12:34:17 +08:00
// Actor behind write().
//   self  - the write cursor
//   item  - item to append
//   start - sent by write() after it has stored this actor's future in self->operation
ACTOR static Future < Void > write_impl ( Cursor * self , T item , Future < Void > start ) {
2019-10-15 18:10:50 +08:00
ASSERT ( self - > mode = = WRITE ) ;
2019-10-18 12:34:17 +08:00
// Wait for the previous operation to finish
// `previous` is captured before waiting on `start`, presumably so it still refers to the
// operation that preceded this one rather than to this actor's own future
state Future < Void > previous = self - > operation ;
wait ( start ) ;
2019-10-15 18:10:50 +08:00
wait ( previous ) ;
2019-10-18 12:34:17 +08:00
2019-10-15 18:10:50 +08:00
state int bytesNeeded = Codec : : bytesNeeded ( item ) ;
2019-10-23 08:17:29 +08:00
// A new page is needed if there is no current page or the item does not fit in what is left of it
if ( self - > pageID = = invalidLogicalPageID | | self - > offset + bytesNeeded > self - > queue - > dataBytesPerPage ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) write(%s) page is full, adding new page \n " , self - > toString ( ) . c_str ( ) , : : toString ( item ) . c_str ( ) ) ;
LogicalPageID newPageID = wait ( self - > queue - > pager - > newPageID ( ) ) ;
self - > addNewPage ( newPageID , 0 , true ) ;
2019-10-24 00:31:06 +08:00
+ + self - > queue - > numPages ;
2019-10-15 18:10:50 +08:00
wait ( yield ( ) ) ;
2019-09-02 14:03:31 +08:00
}
2019-10-23 08:17:29 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) before write(%s) \n " , self - > toString ( ) . c_str ( ) , : : toString ( item ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
// Serialize the item at the cursor position and advance both the cursor and the page's end offset
auto p = self - > raw ( ) ;
Codec : : writeToBytes ( p - > begin ( ) + self - > offset , item ) ;
self - > offset + = bytesNeeded ;
p - > endOffset = self - > offset ;
2019-10-15 18:36:22 +08:00
+ + self - > queue - > numEntries ;
2019-10-01 17:06:00 +08:00
return Void ( ) ;
2019-08-07 17:36:33 +08:00
}
2019-10-15 18:10:50 +08:00
// Queue an asynchronous write of item; it becomes this cursor's current operation.
// The promise is sent only after `operation` has been assigned, which releases the actor
// from its wait on `start`.
void write ( const T & item ) {
2019-10-18 12:34:17 +08:00
Promise < Void > p ;
operation = write_impl ( this , item , p . getFuture ( ) ) ;
p . send ( Void ( ) ) ;
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
// Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is exhausted
2019-10-18 12:34:17 +08:00
// Actor behind readNext().
//   self       - the reader cursor (POP or READONLY)
//   upperBound - if present, an item greater than this is not returned and the cursor does not move
//   start      - sent by readNext() after it has stored this actor's future in self->operation
// Returns the item read, or an empty Optional if the cursor is at the end or the next item exceeds upperBound.
ACTOR static Future < Optional < T > > readNext_impl ( Cursor * self , Optional < T > upperBound , Future < Void > start ) {
2019-11-04 19:04:03 +08:00
ASSERT ( self - > mode = = POP | | self - > mode = = READONLY ) ;
2019-10-18 12:34:17 +08:00
// Wait for the previous operation to finish
state Future < Void > previous = self - > operation ;
wait ( start ) ;
2019-10-15 18:10:50 +08:00
wait ( previous ) ;
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) readNext begin \n " , self - > toString ( ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
// Nothing to read if there is no page or the cursor has reached its end page
if ( self - > pageID = = invalidLogicalPageID | | self - > pageID = = self - > endPageID ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) readNext returning nothing \n " , self - > toString ( ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return Optional < T > ( ) ;
2019-08-14 13:41:41 +08:00
}
2019-10-15 18:10:50 +08:00
// We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet.
if ( ! self - > page ) {
wait ( self - > loadPage ( ) ) ;
wait ( yield ( ) ) ;
}
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
auto p = self - > raw ( ) ;
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) readNext reading at current position \n " , self - > toString ( ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
ASSERT ( self - > offset < p - > endOffset ) ;
int bytesRead ;
T result = Codec : : readFromBytes ( p - > begin ( ) + self - > offset , bytesRead ) ;
2019-08-07 17:36:33 +08:00
2019-10-15 18:10:50 +08:00
// The item was decoded but the cursor has not moved yet, so bailing out here leaves it in place
if ( upperBound . present ( ) & & upperBound . get ( ) < result ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s \n " ,
self - > toString ( ) . c_str ( ) , : : toString ( result ) . c_str ( ) , : : toString ( upperBound . get ( ) ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return Optional < T > ( ) ;
}
2019-08-14 13:41:41 +08:00
2019-10-15 18:10:50 +08:00
self - > offset + = bytesRead ;
2019-11-04 19:04:03 +08:00
// Only a POP cursor changes the queue's entry count
if ( self - > mode = = POP ) {
- - self - > queue - > numEntries ;
}
2019-10-23 08:17:29 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) after read of %s \n " , self - > toString ( ) . c_str ( ) , : : toString ( result ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
ASSERT ( self - > offset < = p - > endOffset ) ;
// When the last item on the page has been read, follow the page's next link
if ( self - > offset = = p - > endOffset ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) Page exhausted \n " , self - > toString ( ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
LogicalPageID oldPageID = self - > pageID ;
self - > pageID = p - > nextPageID ;
self - > offset = p - > nextOffset ;
2019-11-04 19:04:03 +08:00
if ( self - > mode = = POP ) {
- - self - > queue - > numPages ;
}
2019-10-15 18:10:50 +08:00
self - > page . clear ( ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page \n " , self - > toString ( ) . c_str ( ) ) ;
if ( self - > mode = = POP ) {
// Freeing the old page must happen after advancing the cursor and clearing the page reference because
// freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this
// very same queue.
// Queue pages are freed at page 0 because they can be reused after the next commit.
// NOTE(review): "page 0" above presumably means the second argument of freePage(); its meaning is not visible here.
self - > queue - > pager - > freePage ( oldPageID , 0 ) ;
}
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
debug_printf ( " FIFOQueue(%s) %s(upperBound=%s) -> %s \n " , self - > queue - > name . c_str ( ) , ( self - > mode = = POP ? " pop " : " peek " ) , : : toString ( upperBound ) . c_str ( ) , : : toString ( result ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return result ;
}
2019-11-04 19:04:03 +08:00
// Read and move past the next item if it is <= upperBound or if upperBound is not present
2019-10-15 18:10:50 +08:00
Future < Optional < T > > readNext ( const Optional < T > & upperBound = { } ) {
// An uninitialized cursor has nothing to read
if ( mode = = NONE ) {
return Optional < T > ( ) ;
}
2019-10-18 12:34:17 +08:00
// Start the read actor, make it this cursor's current operation, and only then release it from its wait on `start`
Promise < Void > p ;
Future < Optional < T > > read = readNext_impl ( this , upperBound , p . getFuture ( ) ) ;
2019-10-15 18:10:50 +08:00
operation = success ( read ) ;
2019-10-18 12:34:17 +08:00
p . send ( Void ( ) ) ;
2019-10-15 18:10:50 +08:00
return read ;
2019-08-07 17:36:33 +08:00
}
} ;
public :
// Construct a queue that is not yet attached to a pager; create() or recover() attaches it.
// The counters are zeroed so that they never hold indeterminate values before then.
FIFOQueue() : pager(nullptr), numPages(0), numEntries(0), dataBytesPerPage(0) {
}
2019-10-15 18:10:50 +08:00
// Cancel any tail page allocation that is still in flight.
~ FIFOQueue ( ) {
newTailPage . cancel ( ) ;
}
2019-08-07 17:36:33 +08:00
// Queues are not copyable; the cursors hold a pointer back to their queue.
FIFOQueue ( const FIFOQueue & other ) = delete ;
void operator = ( const FIFOQueue & rhs ) = delete ;
// Create a new queue at newPageID
2019-08-14 13:41:41 +08:00
//   p         - pager that stores the queue's pages
//   newPageID - page ID that becomes both the head and the (unwritten) tail of the empty queue
//   queueName - name used in debug output
void create ( IPager2 * p , LogicalPageID newPageID , std : : string queueName ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " FIFOQueue(%s) create from page %s \n " , queueName . c_str ( ) , toString ( newPageID ) . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
pager = p ;
name = queueName ;
numPages = 1 ;
numEntries = 0 ;
2019-10-01 17:06:00 +08:00
// Bytes available for items on each page, i.e. the usable page size minus the RawPage header
dataBytesPerPage = pager - > getUsablePageSize ( ) - sizeof ( typename Cursor : : RawPage ) ;
2019-11-04 19:04:03 +08:00
// The reader starts and ends at the same page, so there is nothing to pop yet
headReader . init ( this , Cursor : : POP , newPageID , 0 , newPageID ) ;
2019-10-15 18:10:50 +08:00
tailWriter . init ( this , Cursor : : WRITE , newPageID ) ;
headWriter . init ( this , Cursor : : WRITE ) ;
newTailPage = invalidLogicalPageID ;
debug_printf ( " FIFOQueue(%s) created \n " , queueName . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
}
// Load an existing queue from its queue state
2019-08-14 13:41:41 +08:00
//   p         - pager that stores the queue's pages
//   qs        - queue state, as returned by getState()
//   queueName - name used in debug output
void recover ( IPager2 * p , const QueueState & qs , std : : string queueName ) {
2019-10-15 18:10:50 +08:00
debug_printf ( " FIFOQueue(%s) recover from queue state %s \n " , queueName . c_str ( ) , qs . toString ( ) . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
pager = p ;
name = queueName ;
numPages = qs . numPages ;
numEntries = qs . numEntries ;
2019-10-01 17:06:00 +08:00
// Bytes available for items on each page, i.e. the usable page size minus the RawPage header
dataBytesPerPage = pager - > getUsablePageSize ( ) - sizeof ( typename Cursor : : RawPage ) ;
2019-11-04 19:04:03 +08:00
// The reader resumes at the saved head position and stops at the saved tail page
headReader . init ( this , Cursor : : POP , qs . headPageID , qs . headOffset , qs . tailPageID ) ;
2019-10-15 18:10:50 +08:00
tailWriter . init ( this , Cursor : : WRITE , qs . tailPageID ) ;
headWriter . init ( this , Cursor : : WRITE ) ;
newTailPage = invalidLogicalPageID ;
debug_printf ( " FIFOQueue(%s) recovered \n " , queueName . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
// Actor behind peekAll(): read every item from the head reader's current position onward
// with a separate READONLY cursor, so nothing is popped.
ACTOR static Future < Standalone < VectorRef < T > > > peekAll_impl ( FIFOQueue * self ) {
state Standalone < VectorRef < T > > results ;
state Cursor c ;
c . initReadOnly ( self - > headReader ) ;
// Reserve room for the queue's current entry count
results . reserve ( results . arena ( ) , self - > numEntries ) ;
loop {
// An empty result means the cursor reached its end page
Optional < T > x = wait ( c . readNext ( ) ) ;
if ( ! x . present ( ) ) {
break ;
}
results . push_back ( results . arena ( ) , x . get ( ) ) ;
}
return results ;
}
// Return all items readable from the head of the queue without popping them.
Future < Standalone < VectorRef < T > > > peekAll ( ) {
return peekAll_impl ( this ) ;
}
// Pop the next item on front of queue if it is <= upperBound or if upperBound is not present
2019-08-07 17:36:33 +08:00
// Delegates to the head reader, which is a POP cursor.
Future<Optional<T>> pop(Optional<T> upperBound = {}) {
	Future<Optional<T>> popped = headReader.readNext(upperBound);
	return popped;
}
// Snapshot the queue's head position, tail page and counters into a QueueState.
QueueState getState() const {
	QueueState snapshot;
	snapshot.headPageID = headReader.pageID;
	snapshot.headOffset = headReader.offset;
	snapshot.tailPageID = tailWriter.pageID;
	snapshot.numPages = numPages;
	snapshot.numEntries = numEntries;

	debug_printf(" FIFOQueue(%s) getState(): %s \n ", name.c_str(), snapshot.toString().c_str());
	return snapshot;
}
2019-10-15 18:10:50 +08:00
// Append item to the back of the queue through the tail writer. The write is asynchronous.
void pushBack ( const T & item ) {
debug_printf ( " FIFOQueue(%s) pushBack(%s) \n " , name . c_str ( ) , toString ( item ) . c_str ( ) ) ;
tailWriter . write ( item ) ;
2019-08-07 17:36:33 +08:00
}
2019-10-15 18:10:50 +08:00
// Write item through the head writer; finishFlush() later links what was written in front of the current head.
void pushFront ( const T & item ) {
debug_printf ( " FIFOQueue(%s) pushFront(%s) \n " , name . c_str ( ) , toString ( item ) . c_str ( ) ) ;
headWriter . write ( item ) ;
}
// Wait until the most recently started operations on each cursor as of now are ready
// Combines the current operation of all three cursors with the pending tail page allocation.
Future < Void > notBusy ( ) {
return headWriter . notBusy ( ) & & headReader . notBusy ( ) & & tailWriter . notBusy ( ) & & ready ( newTailPage ) ;
}
// Returns true if any most recently started operations on any cursors are not ready
// True if any cursor's current operation, or the tail page allocation, is not ready yet.
bool busy() {
	const bool everythingReady = headWriter.notBusy().isReady() && headReader.notBusy().isReady() &&
	                             tailWriter.notBusy().isReady() && newTailPage.isReady();
	return !everythingReady;
}
// preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still
// be pushed and popped after this operation. It returns whether or not any operations were pending or
// started during execution.
//
// If one or more queues are used by their pager in newPageID() or freePage() operations, then preFlush()
// must be called on each of them inside a loop that runs until each of the preFlush() calls have returned
// false.
//
// The reason for all this is that:
// - queue pop() can call pager->freePage() which can call push() on the same or another queue
// - queue push() can call pager->newPageID() which can call pop() on the same or another queue
// This creates a circular dependency with 1 or more queues when those queues are used by the pager
// to manage free page IDs.
// Actor behind preFlush(). Returns true if work was still pending or was started here,
// in which case the caller has to call preFlush() again.
ACTOR static Future < bool > preFlush_impl ( FIFOQueue * self ) {
debug_printf ( " FIFOQueue(%s) preFlush begin \n " , self - > name . c_str ( ) ) ;
wait ( self - > notBusy ( ) ) ;
// Completion of the pending operations as of the start of notBusy() could have begun new operations,
// so see if any work is pending now.
bool workPending = self - > busy ( ) ;
if ( ! workPending ) {
// A newly created or flushed queue starts out in a state where its tail page to be written to is empty.
// After pushBack() is called, this is no longer the case and never will be again until the queue is flushed.
// Before the non-empty tail page is written it must be linked to a new empty page for use after the next
// flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only be written
// once because once they contain durable data a second write to link to a new page could corrupt the existing
// data if the subsequent commit never succeeds.)
// Start allocating the new tail page only once: when none has been requested yet and the tail page has items
if ( self - > newTailPage . isReady ( ) & & self - > newTailPage . get ( ) = = invalidLogicalPageID & & self - > tailWriter . pendingWrites ( ) ) {
self - > newTailPage = self - > pager - > newPageID ( ) ;
workPending = true ;
2019-09-02 14:03:31 +08:00
}
}
2019-10-15 18:10:50 +08:00
debug_printf ( " FIFOQueue(%s) preFlush returning %d \n " , self - > name . c_str ( ) , workPending ) ;
return workPending ;
2019-09-02 14:03:31 +08:00
}
2019-10-15 18:10:50 +08:00
// Prepare the queue for flushing; returns true if it must be called again before finishFlush().
Future < bool > preFlush ( ) {
return preFlush_impl ( this ) ;
2019-08-07 17:36:33 +08:00
}
2019-10-15 18:10:50 +08:00
// Complete a flush. The queue must be idle, i.e. preFlush() has returned false.
// Links and writes the last pages of both writers, moves the head reader in front of any
// pushFront()'d items, and resets both write cursors.
void finishFlush ( ) {
debug_printf ( " FIFOQueue(%s) finishFlush start \n " , name . c_str ( ) ) ;
ASSERT ( ! busy ( ) ) ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// If a new tail page was allocated, link the last page of the tail writer to it.
if ( newTailPage . get ( ) ! = invalidLogicalPageID ) {
2019-10-18 12:34:17 +08:00
tailWriter . addNewPage ( newTailPage . get ( ) , 0 , false ) ;
2019-10-15 18:36:22 +08:00
// The flush sequence allocated a page and added it to the queue so increment numPages
+ + numPages ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// addNewPage() should be ready immediately since a pageID is being explicitly passed.
ASSERT ( tailWriter . notBusy ( ) . isReady ( ) ) ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
newTailPage = invalidLogicalPageID ;
2019-08-07 17:36:33 +08:00
}
2019-10-15 18:10:50 +08:00
// If the headWriter wrote anything, link its tail page to the headReader position and point the headReader
// to the start of the headWriter
if ( headWriter . pendingWrites ( ) ) {
2019-10-18 12:34:17 +08:00
headWriter . addNewPage ( headReader . pageID , headReader . offset , false ) ;
2019-10-15 18:10:50 +08:00
headReader . pageID = headWriter . firstPageIDWritten ;
headReader . offset = 0 ;
2019-10-23 08:17:29 +08:00
// Drop the reader's buffer because it belongs to the page the reader just moved away from
headReader . page . clear ( ) ;
2019-09-02 14:03:31 +08:00
}
2019-10-15 18:10:50 +08:00
// Update headReader's end page to the new tail page
headReader . endPageID = tailWriter . pageID ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// Reset the write cursors
tailWriter . init ( this , Cursor : : WRITE , tailWriter . pageID ) ;
headWriter . init ( this , Cursor : : WRITE ) ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
debug_printf ( " FIFOQueue(%s) finishFlush end \n " , name . c_str ( ) ) ;
}
// Actor behind flush(): repeat preFlush() until it reports no pending work, then finishFlush().
ACTOR static Future < Void > flush_impl ( FIFOQueue * self ) {
loop {
bool notDone = wait ( self - > preFlush ( ) ) ;
if ( ! notDone ) {
break ;
}
}
self - > finishFlush ( ) ;
return Void ( ) ;
2019-09-02 14:03:31 +08:00
}
2019-10-15 18:10:50 +08:00
// Flush this queue on its own: run preFlush() to completion and then finishFlush().
Future < Void > flush ( ) {
2019-09-02 14:03:31 +08:00
return flush_impl ( this ) ;
2019-08-07 17:36:33 +08:00
}
// Pager that stores the queue's pages; set by create() or recover()
IPager2 * pager ;
// Number of pages in the queue
int64_t numPages ;
// Number of items in the queue
int64_t numEntries ;
2019-10-01 17:06:00 +08:00
// Bytes available for items on each page (usable page size minus the RawPage header)
int dataBytesPerPage ;
2019-08-07 17:36:33 +08:00
2019-09-02 14:03:31 +08:00
// POP cursor at the front of the queue
Cursor headReader ;
// WRITE cursor used by pushBack()
Cursor tailWriter ;
2019-10-15 18:10:50 +08:00
// WRITE cursor used by pushFront()
Cursor headWriter ;
2019-09-02 14:03:31 +08:00
2019-10-15 18:10:50 +08:00
// Tail page allocated during preFlush(); holds invalidLogicalPageID when none is pending
Future < LogicalPageID > newTailPage ;
2019-08-07 17:36:33 +08:00
// For debugging
std : : string name ;
} ;
// Returns the smallest power of two that is >= x.  Both 0 and 1 yield 1.
// The result is only representable in the int return type for x <= 2^30; callers must
// stay within that range.
int nextPowerOf2(uint32_t x) {
	// The original computed 1 << (32 - clz(x - 1)), which is undefined for x == 1 (count of
	// leading zeros of 0) and for x == 0 (shift by 32), so handle both explicitly.
	if (x <= 1) {
		return 1;
	}
	// Count the significant bits of x - 1; for x >= 2 this equals 32 - clz(x - 1).
	int bits = 0;
	for (uint32_t rest = x - 1; rest != 0; rest >>= 1) {
		++bits;
	}
	// Shift in 64 bits so that the shift itself is always well defined.
	return static_cast<int>(static_cast<uint64_t>(1) << bits);
}
2019-09-28 06:08:05 +08:00
// A reference-counted IPage whose buffer comes from allocateFast().  A Checksum is stored
// at the end of the logical page, directly after the usable bytes.
class FastAllocatedPage : public IPage, public FastAllocated<FastAllocatedPage>, ReferenceCounted<FastAllocatedPage> {
public:
	using Checksum = uint32_t;

	// Create a fast-allocated page with size total bytes INCLUDING checksum
	FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) {
		buffer = (uint8_t*)allocateFast(bufferSize);
		// Bytes of the buffer beyond the logical page are marked as defined for valgrind
		VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize);
	}

	virtual ~FastAllocatedPage() { freeFast(bufferSize, buffer); }

	// Usable size, without checksum
	int size() const {
		const int checksumBytes = sizeof(Checksum);
		return logicalSize - checksumBytes;
	}

	// Read-only access to the page bytes
	uint8_t const* begin() const { return buffer; }

	// Writable access to the page bytes
	uint8_t* mutate() { return buffer; }

	void addref() const { ReferenceCounted<FastAllocatedPage>::addref(); }

	void delref() const { ReferenceCounted<FastAllocatedPage>::delref(); }

	// The checksum stored directly after the usable bytes
	Checksum& getChecksum() {
		uint8_t* checksumAt = buffer + size();
		return *(Checksum*)checksumAt;
	}

	// crc32c over the usable bytes, seeded with the page ID
	Checksum calculateChecksum(LogicalPageID pageID) { return crc32c_append(pageID, buffer, size()); }

	// Store the checksum for pageID into the page
	void updateChecksum(LogicalPageID pageID) {
		const Checksum fresh = calculateChecksum(pageID);
		getChecksum() = fresh;
	}

	// True if the stored checksum matches the one computed for pageID
	bool verifyChecksum(LogicalPageID pageID) {
		const Checksum expected = calculateChecksum(pageID);
		return getChecksum() == expected;
	}

private:
	int logicalSize;
	int bufferSize;
	uint8_t* buffer;
};
// Holds an index of recently used objects.
// ObjectType must have the method
// bool evictable() const;
// indicating if it is safe to evict.
template < class IndexType , class ObjectType >
class ObjectCache {
public :
2019-08-14 18:05:37 +08:00
ObjectCache ( int sizeLimit = 0 ) : sizeLimit ( sizeLimit ) {
2019-08-07 17:36:33 +08:00
}
2019-09-29 04:26:01 +08:00
// Get the object for i if it exists, else return nullptr.
// If the object exists, its eviction order will NOT change as this is not a cache hit.
ObjectType * getIfExists ( const IndexType & index ) {
auto i = cache . find ( index ) ;
if ( i ! = cache . end ( ) ) {
return & i - > second . item ;
}
return nullptr ;
}
2019-08-07 17:36:33 +08:00
// Get the object for i or create a new one.
// After a get(), the object for i is the last in evictionOrder.
ObjectType & get ( const IndexType & index ) {
Entry & entry = cache [ index ] ;
// If entry is linked into evictionOrder then move it to the back of the order
if ( entry . is_linked ( ) ) {
// Move the entry to the back of the eviction order
evictionOrder . erase ( evictionOrder . iterator_to ( entry ) ) ;
evictionOrder . push_back ( entry ) ;
}
else {
// Finish initializing entry
entry . index = index ;
// Insert the newly created Entry at the back of the eviction order
evictionOrder . push_back ( entry ) ;
// If the cache is too big, try to evict the first Entry in the eviction order
if ( cache . size ( ) > sizeLimit ) {
Entry & toEvict = evictionOrder . front ( ) ;
2019-08-16 06:44:54 +08:00
// Don't evict the entry that was just added as then we can't return a reference to it.
if ( toEvict . index ! = index & & toEvict . item . evictable ( ) ) {
2019-10-18 12:34:17 +08:00
debug_printf ( " Evicting %s to make room for %s \n " , toString ( toEvict . index ) . c_str ( ) , toString ( index ) . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
evictionOrder . pop_front ( ) ;
cache . erase ( toEvict . index ) ;
}
}
}
return entry . item ;
}
2019-08-11 18:26:00 +08:00
// Clears the cache and calls destroy() on each ObjectType
void destroy ( ) {
2019-08-07 17:36:33 +08:00
evictionOrder . clear ( ) ;
2019-08-11 18:26:00 +08:00
for ( auto & entry : cache ) {
entry . second . item . destroy ( ) ;
}
2019-08-07 17:36:33 +08:00
cache . clear ( ) ;
}
2019-10-02 21:43:11 +08:00
int count ( ) const {
ASSERT ( evictionOrder . size ( ) = = cache . size ( ) ) ;
return evictionOrder . size ( ) ;
}
2019-08-07 17:36:33 +08:00
private :
struct Entry : public boost : : intrusive : : list_base_hook < > {
IndexType index ;
ObjectType item ;
} ;
int sizeLimit ;
// TODO: Use boost intrusive unordered set instead, with a comparator that only considers entry.index
std : : unordered_map < IndexType , Entry > cache ;
2019-10-02 21:43:11 +08:00
boost : : intrusive : : list < Entry > evictionOrder ;
2019-08-07 17:36:33 +08:00
} ;
2019-08-11 18:26:00 +08:00
// Waits for f and returns its value. If f fails with anything other than actor_cancelled,
// the error is also sent to target (provided target can still be set) before being rethrown.
ACTOR template <class T>
Future<T> forwardError(Future<T> f, Promise<Void> target) {
	try {
		T value = wait(f);
		return value;
	} catch (Error& e) {
		const bool cancelled = (e.code() == error_code_actor_cancelled);
		if (!cancelled && target.canBeSet()) {
			target.sendError(e);
		}
		throw e;
	}
}
2019-08-07 17:36:33 +08:00
2019-11-04 19:04:03 +08:00
class DWALPagerSnapshot ;
// An implementation of IPager2 that supports atomicUpdate() of a page without forcing a change to new page ID.
// It does this internally mapping the original page ID to alternate page IDs by write version.
// The page id remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start.
// To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the
// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue,
// copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory.
// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated
// alternate pages it references basically serve as a write ahead log for pages that will eventually be copied
// back to their original location once the original version is no longer needed.
class DWALPager : public IPager2 {
2019-08-07 17:36:33 +08:00
public :
typedef FastAllocatedPage Page ;
typedef FIFOQueue < LogicalPageID > LogicalPageQueueT ;
2019-11-04 19:04:03 +08:00
# pragma pack(push, 1)
2019-09-02 14:03:31 +08:00
struct DelayedFreePage {
Version version ;
LogicalPageID pageID ;
bool operator < ( const DelayedFreePage & rhs ) const {
return version < rhs . version ;
}
2019-10-01 17:06:00 +08:00
std : : string toString ( ) const {
2019-11-04 19:04:03 +08:00
return format ( " DelayedFreePage{%s @% " PRId64 " } " , : : toString ( pageID ) . c_str ( ) , version ) ;
2019-10-01 17:06:00 +08:00
}
2019-09-02 14:03:31 +08:00
} ;
2019-11-04 19:04:03 +08:00
struct RemappedPage {
Version version ;
LogicalPageID originalPageID ;
LogicalPageID newPageID ;
bool operator < ( const RemappedPage & rhs ) {
return version < rhs . version ;
}
std : : string toString ( ) const {
return format ( " RemappedPage(%s -> %s @% " PRId64 " } " , : : toString ( originalPageID ) . c_str ( ) , : : toString ( newPageID ) . c_str ( ) , version ) ;
}
} ;
# pragma pack(pop)
typedef FIFOQueue < DelayedFreePage > DelayedFreePageQueueT ;
typedef FIFOQueue < RemappedPage > RemapQueueT ;
2019-09-02 14:03:31 +08:00
2019-08-07 17:36:33 +08:00
// If the file already exists, pageSize might be different than desiredPageSize
2019-08-14 18:05:37 +08:00
// Use pageCacheSizeBytes == 0 for default
2019-11-04 19:04:03 +08:00
DWALPager ( int desiredPageSize , std : : string filename , int pageCacheSizeBytes )
2019-10-15 18:10:50 +08:00
: desiredPageSize ( desiredPageSize ) , filename ( filename ) , pHeader ( nullptr ) , pageCacheBytes ( pageCacheSizeBytes )
{
2019-08-14 18:05:37 +08:00
if ( pageCacheBytes = = 0 ) {
pageCacheBytes = g_network - > isSimulated ( ) ? ( BUGGIFY ? FLOW_KNOBS - > BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS - > SIM_PAGE_CACHE_4K ) : FLOW_KNOBS - > PAGE_CACHE_4K ;
}
2019-08-07 17:36:33 +08:00
commitFuture = Void ( ) ;
2019-08-11 18:26:00 +08:00
recoverFuture = forwardError ( recover ( this ) , errorPromise ) ;
2019-08-07 17:36:33 +08:00
}
void setPageSize ( int size ) {
logicalPageSize = size ;
physicalPageSize = smallestPhysicalBlock ;
while ( logicalPageSize > physicalPageSize ) {
physicalPageSize + = smallestPhysicalBlock ;
}
if ( pHeader ! = nullptr ) {
pHeader - > pageSize = logicalPageSize ;
}
2019-10-02 21:43:11 +08:00
ASSERT ( pageCache . count ( ) = = 0 ) ;
pageCache = PageCacheT ( pageCacheBytes / physicalPageSize ) ;
2019-08-07 17:36:33 +08:00
}
2019-08-16 18:24:55 +08:00
// Copies the first smallestPhysicalBlock bytes of the working header page into the
// last-committed header page buffer.
void updateCommittedHeader() {
	auto committedCopy = lastCommittedHeaderPage->mutate();
	auto current = headerPage->begin();
	memcpy(committedCopy, current, smallestPhysicalBlock);
}
2019-11-04 19:04:03 +08:00
ACTOR static Future < Void > recover ( DWALPager * self ) {
2019-08-07 17:36:33 +08:00
ASSERT ( ! self - > recoverFuture . isValid ( ) ) ;
2019-11-04 19:04:03 +08:00
self - > remapUndoFuture = Void ( ) ;
2019-08-07 17:36:33 +08:00
int64_t flags = IAsyncFile : : OPEN_UNCACHED | IAsyncFile : : OPEN_READWRITE | IAsyncFile : : OPEN_LOCK ;
state bool exists = fileExists ( self - > filename ) ;
if ( ! exists ) {
flags | = IAsyncFile : : OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile : : OPEN_CREATE ;
}
wait ( store ( self - > pageFile , IAsyncFileSystem : : filesystem ( ) - > open ( self - > filename , flags , 0644 ) ) ) ;
// Header page is always treated as having a page size of smallestPhysicalBlock
self - > setPageSize ( smallestPhysicalBlock ) ;
2019-08-16 18:24:55 +08:00
self - > lastCommittedHeaderPage = self - > newPageBuffer ( ) ;
self - > pLastCommittedHeader = ( Header * ) self - > lastCommittedHeaderPage - > begin ( ) ;
2019-08-07 17:36:33 +08:00
2019-08-16 18:24:55 +08:00
state int64_t fileSize = 0 ;
2019-08-07 17:36:33 +08:00
if ( exists ) {
2019-08-14 18:01:46 +08:00
wait ( store ( fileSize , self - > pageFile - > size ( ) ) ) ;
}
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) recover exists=%d fileSize=% " PRId64 " \n " , self - > filename . c_str ( ) , exists , fileSize ) ;
2019-08-14 19:41:12 +08:00
// TODO: If the file exists but appears to never have been successfully committed is this an error or
// should recovery proceed with a new pager instance?
2019-08-14 18:01:46 +08:00
2019-08-14 19:41:12 +08:00
// If there are at least 2 pages then try to recover the existing file
if ( exists & & fileSize > = ( self - > smallestPhysicalBlock * 2 ) ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) recovering using existing file \n " ) ;
2019-08-07 17:36:33 +08:00
2019-08-14 19:41:12 +08:00
state bool recoveredHeader = false ;
2019-08-16 18:24:55 +08:00
// Read physical page 0 directly
wait ( store ( self - > headerPage , self - > readHeaderPage ( self , 0 ) ) ) ;
2019-08-14 19:41:12 +08:00
2019-08-16 18:24:55 +08:00
// If the checksum fails for the header page, try to recover committed header backup from page 1
2019-08-16 19:17:29 +08:00
if ( ! self - > headerPage . castTo < Page > ( ) - > verifyChecksum ( 0 ) ) {
2019-11-04 19:04:03 +08:00
TraceEvent ( SevWarn , " DWALPagerRecoveringHeader " ) . detail ( " Filename " , self - > filename ) ;
2019-08-14 19:41:12 +08:00
2019-08-16 18:24:55 +08:00
wait ( store ( self - > headerPage , self - > readHeaderPage ( self , 1 ) ) ) ;
2019-08-14 19:41:12 +08:00
2019-08-14 20:22:08 +08:00
if ( ! self - > headerPage . castTo < Page > ( ) - > verifyChecksum ( 1 ) ) {
2019-08-14 19:41:12 +08:00
if ( g_network - > isSimulated ( ) ) {
// TODO: Detect if process is being restarted and only throw injected if so?
throw io_error ( ) . asInjectedFault ( ) ;
}
2019-08-16 18:24:55 +08:00
Error e = checksum_failed ( ) ;
2019-11-04 19:04:03 +08:00
TraceEvent ( SevError , " DWALPagerRecoveryFailed " )
2019-08-16 18:24:55 +08:00
. detail ( " Filename " , self - > filename )
. error ( e ) ;
throw e ;
2019-08-14 19:41:12 +08:00
}
recoveredHeader = true ;
}
2019-08-07 17:36:33 +08:00
self - > pHeader = ( Header * ) self - > headerPage - > begin ( ) ;
self - > setPageSize ( self - > pHeader - > pageSize ) ;
if ( self - > logicalPageSize ! = self - > desiredPageSize ) {
2019-11-04 19:04:03 +08:00
TraceEvent ( SevWarn , " DWALPagerPageSizeNotDesired " )
2019-08-07 17:36:33 +08:00
. detail ( " Filename " , self - > filename )
. detail ( " ExistingPageSize " , self - > logicalPageSize )
. detail ( " DesiredPageSize " , self - > desiredPageSize ) ;
}
2019-08-14 13:41:41 +08:00
self - > freeList . recover ( self , self - > pHeader - > freeList , " FreeListRecovered " ) ;
2019-09-02 14:03:31 +08:00
self - > delayedFreeList . recover ( self , self - > pHeader - > delayedFreeList , " DelayedFreeListRecovered " ) ;
2019-11-04 19:04:03 +08:00
self - > remapQueue . recover ( self , self - > pHeader - > remapQueue , " RemapQueueRecovered " ) ;
Standalone < VectorRef < RemappedPage > > remaps = wait ( self - > remapQueue . peekAll ( ) ) ;
for ( auto & r : remaps ) {
if ( r . newPageID ! = invalidLogicalPageID ) {
self - > remappedPages [ r . originalPageID ] [ r . version ] = r . newPageID ;
}
}
2019-08-14 19:41:12 +08:00
2019-08-16 18:24:55 +08:00
// If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing.
// If this fails, the backup header is still in tact for the next recovery attempt.
2019-08-14 19:41:12 +08:00
if ( recoveredHeader ) {
// Write the header to page 0
2019-08-16 18:24:55 +08:00
wait ( self - > writeHeaderPage ( 0 , self - > headerPage ) ) ;
2019-08-14 19:41:12 +08:00
// Wait for all outstanding writes to complete
2019-09-02 14:03:31 +08:00
wait ( self - > operations . signalAndCollapse ( ) ) ;
2019-08-14 19:41:12 +08:00
// Sync header
wait ( self - > pageFile - > sync ( ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) Header recovery complete. \n " , self - > filename . c_str ( ) ) ;
2019-08-14 19:41:12 +08:00
}
2019-08-16 18:24:55 +08:00
// Update the last committed header with the one that was recovered (which is the last known committed header)
self - > updateCommittedHeader ( ) ;
2019-09-02 14:03:31 +08:00
self - > addLatestSnapshot ( ) ;
2019-08-07 17:36:33 +08:00
}
else {
2019-08-16 18:24:55 +08:00
// Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully committed.
// A new pager will be created in its place.
// TODO: Is the right behavior?
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) creating new pager \n " ) ;
2019-08-07 17:36:33 +08:00
self - > headerPage = self - > newPageBuffer ( ) ;
self - > pHeader = ( Header * ) self - > headerPage - > begin ( ) ;
// Now that the header page has been allocated, set page size to desired
self - > setPageSize ( self - > desiredPageSize ) ;
// Write new header using desiredPageSize
2019-09-05 15:47:57 +08:00
self - > pHeader - > formatVersion = Header : : FORMAT_VERSION ;
2019-08-07 17:36:33 +08:00
self - > pHeader - > committedVersion = 1 ;
2019-10-15 18:10:50 +08:00
self - > pHeader - > oldestVersion = 1 ;
2019-08-07 17:36:33 +08:00
// No meta key until a user sets one and commits
self - > pHeader - > setMetaKey ( Key ( ) ) ;
2019-08-14 19:41:12 +08:00
// There are 2 reserved pages:
// Page 0 - header
2019-08-16 18:24:55 +08:00
// Page 1 - header backup
2019-08-07 17:36:33 +08:00
self - > pHeader - > pageCount = 2 ;
2019-11-04 19:04:03 +08:00
// Create queues
2019-10-18 16:27:00 +08:00
self - > freeList . create ( self , self - > newLastPageID ( ) , " FreeList " ) ;
self - > delayedFreeList . create ( self , self - > newLastPageID ( ) , " delayedFreeList " ) ;
2019-11-04 19:04:03 +08:00
self - > remapQueue . create ( self , self - > newLastPageID ( ) , " remapQueue " ) ;
2019-08-07 17:36:33 +08:00
2019-09-02 14:03:31 +08:00
// The first commit() below will flush the queues and update the queue states in the header,
// but since the queues will not be used between now and then their states will not change.
// In order to populate lastCommittedHeader, update the header now with the queue states.
2019-08-16 18:24:55 +08:00
self - > pHeader - > freeList = self - > freeList . getState ( ) ;
2019-09-02 14:03:31 +08:00
self - > pHeader - > delayedFreeList = self - > delayedFreeList . getState ( ) ;
2019-11-04 19:04:03 +08:00
self - > pHeader - > remapQueue = self - > remapQueue . getState ( ) ;
2019-08-16 18:24:55 +08:00
// Set remaining header bytes to \xff
memset ( self - > headerPage - > mutate ( ) + self - > pHeader - > size ( ) , 0xff , self - > headerPage - > size ( ) - self - > pHeader - > size ( ) ) ;
// Since there is no previously committed header use the initial header for the initial commit.
self - > updateCommittedHeader ( ) ;
2019-08-07 17:36:33 +08:00
wait ( self - > commit ( ) ) ;
}
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) recovered. committedVersion=% " PRId64 " logicalPageSize=%d physicalPageSize=%d \n " , self - > filename . c_str ( ) , self - > pHeader - > committedVersion , self - > logicalPageSize , self - > physicalPageSize ) ;
2019-08-07 17:36:33 +08:00
return Void ( ) ;
}
2019-09-28 06:08:05 +08:00
// Allocates a new page buffer sized for the pager's current logical and physical page sizes.
Reference<IPage> newPageBuffer() override {
	FastAllocatedPage* fresh = new FastAllocatedPage(logicalPageSize, physicalPageSize);
	return Reference<IPage>(fresh);
}
// Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead).
// For a given pager instance, separate calls to this function must return the same value.
2019-09-28 06:08:05 +08:00
int getUsablePageSize ( ) override {
2019-08-07 17:36:33 +08:00
return logicalPageSize - sizeof ( FastAllocatedPage : : Checksum ) ;
}
// Get a new, previously available page ID. The page will be considered in-use after the next commit
2019-10-15 18:10:50 +08:00
// regardless of whether or not it was written to, until it is returned to the pager via freePage()
2019-11-04 19:04:03 +08:00
ACTOR static Future < LogicalPageID > newPageID_impl ( DWALPager * self ) {
2019-10-15 18:10:50 +08:00
// First try the free list
Optional < LogicalPageID > freePageID = wait ( self - > freeList . pop ( ) ) ;
if ( freePageID . present ( ) ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) newPageID() returning %s from free list \n " , self - > filename . c_str ( ) , toString ( freePageID . get ( ) ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return freePageID . get ( ) ;
}
2019-10-18 16:27:00 +08:00
// Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list
ASSERT ( ! self - > snapshots . empty ( ) ) ;
2019-10-23 08:17:29 +08:00
Optional < DelayedFreePage > delayedFreePageID = wait ( self - > delayedFreeList . pop ( DelayedFreePage { self - > effectiveOldestVersion ( ) , 0 } ) ) ;
2019-10-15 18:10:50 +08:00
if ( delayedFreePageID . present ( ) ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) newPageID() returning %s from delayed free list \n " , self - > filename . c_str ( ) , toString ( delayedFreePageID . get ( ) ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return delayedFreePageID . get ( ) . pageID ;
}
2019-10-18 16:27:00 +08:00
// Lastly, add a new page to the pager
LogicalPageID id = self - > newLastPageID ( ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) newPageID() returning %s at end of file \n " , self - > filename . c_str ( ) , toString ( id ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
return id ;
2019-08-07 17:36:33 +08:00
} ;
2019-10-18 16:27:00 +08:00
// Grow the pager file by one page and return the ID of the added page.
// The new ID is the header's page count before it is incremented.
LogicalPageID newLastPageID() {
	const LogicalPageID added = pHeader->pageCount;
	pHeader->pageCount = pHeader->pageCount + 1;
	return added;
}
2019-10-15 18:10:50 +08:00
// Allocates a page ID asynchronously; any failure is also reported through errorPromise.
Future<LogicalPageID> newPageID() override {
	Future<LogicalPageID> allocation = newPageID_impl(this);
	return forwardError(allocation, errorPromise);
}
2019-08-16 18:24:55 +08:00
// Stamps `page` with the checksum for pageID and writes its first smallestPhysicalBlock bytes at
// the header-sized offset for pageID. The page is held until the write completes.
Future<Void> writeHeaderPage(PhysicalPageID pageID, Reference<IPage> page) {
	debug_printf(" DWALPager(%s) header op=write %s \n ", filename.c_str(), toString(pageID).c_str());

	Page* headerBuffer = (Page*)page.getPtr();
	headerBuffer->updateChecksum(pageID);

	const int64_t offset = (int64_t)pageID * smallestPhysicalBlock;
	return holdWhile(page, pageFile->write(page->begin(), smallestPhysicalBlock, offset));
}
2019-08-07 17:36:33 +08:00
// Stamps `page` with the checksum for pageID and writes physicalPageSize bytes of it at the
// file offset for pageID. The page is held until the write completes.
Future<Void> writePhysicalPage(PhysicalPageID pageID, Reference<IPage> page) {
	debug_printf(" DWALPager(%s) op=write %s \n ", filename.c_str(), toString(pageID).c_str());

	Page* pageBuffer = (Page*)page.getPtr();
	pageBuffer->updateChecksum(pageID);

	const int64_t offset = (int64_t)pageID * physicalPageSize;
	return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, offset));
}
2019-09-28 06:08:05 +08:00
// Replaces the contents of pageID with `data`.
// The write is sequenced after any read or write already in progress on the page's cache entry,
// its future is tracked in `operations`, and the cache entry's contents are updated immediately.
void updatePage(LogicalPageID pageID, Reference<IPage> data) override {
	// Get the cache entry for this page
	PageCacheEntry& cacheEntry = pageCache.get(pageID);
	debug_printf(" DWALPager(%s) op=write %s cached=%d reading=%d writing=%d \n ", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing());

	// Launches the physical write once whatever it was chained onto has finished.
	// NOTE(review): the Future returned by writePhysicalPage() is discarded here - confirm the
	// deferred write is kept alive as intended.
	auto launchWrite = [=](Void) {
		writePhysicalPage(pageID, data);
		return Void();
	};

	if (cacheEntry.reading()) {
		// If the page is still being read then it's not also being written because a write places
		// the new content in the cache entry when the write is launched, not when it is completed.
		// Any waiting readers should not see this write (though this might change).
		// Wait for the read to finish, then start the write.
		cacheEntry.writeFuture = map(success(cacheEntry.readFuture), launchWrite);
	} else if (cacheEntry.writing()) {
		// If the page is being written, wait for this write before issuing the new write
		cacheEntry.writeFuture = map(cacheEntry.writeFuture, launchWrite);
	} else {
		cacheEntry.writeFuture = writePhysicalPage(pageID, data);
	}

	// Report write failures through errorPromise and track the write with the other operations
	cacheEntry.writeFuture = forwardError(cacheEntry.writeFuture, errorPromise);
	operations.add(cacheEntry.writeFuture);

	// Always update the page contents immediately regardless of what happened above.
	cacheEntry.readFuture = data;
}
2019-09-28 06:08:05 +08:00
// Writes `data` as the content of pageID as of version v without changing the page's ID.
// The data is written to a newly allocated page, and the remap (pageID @ v -> new page) is pushed
// onto the remap queue and recorded in remappedPages. The returned future resolves to pageID itself.
Future<LogicalPageID> atomicUpdatePage(LogicalPageID pageID, Reference<IPage> data, Version v) override {
	debug_printf(" DWALPager(%s) op=writeAtomic %s @% " PRId64 " \n ", filename.c_str(), toString(pageID).c_str(), v);

	// No need for forwardError here because newPageID() is already wrapped in forwardError
	return map(newPageID(), [=](LogicalPageID remapTarget) {
		updatePage(remapTarget, data);

		// TODO: Possibly limit size of remap queue since it must be recovered on cold start
		RemappedPage remap{ v, pageID, remapTarget };
		remapQueue.pushBack(remap);
		remappedPages[pageID][v] = remapTarget;
		debug_printf(" DWALPager(%s) pushed %s \n ", filename.c_str(), remap.toString().c_str());

		return pageID;
	});
}
2019-09-28 06:08:05 +08:00
// Releases pageID as of version v. Depending on the page's state it is queued on the remap queue,
// pushed to the free list, or pushed to the delayed free list.
void freePage(LogicalPageID pageID, Version v) override {
	// If pageID has been remapped, then it can't be freed until all existing remaps for that page have been
	// undone, so queue it for later deletion
	const bool remapped = remappedPages.find(pageID) != remappedPages.end();
	if (remapped) {
		debug_printf(" DWALPager(%s) op=freeRemapped %s @% " PRId64 " oldestVersion=% " PRId64 " \n ", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion);
		remapQueue.pushBack(RemappedPage{ v, pageID, invalidLogicalPageID });
		return;
	}

	// If v is older than the oldest version still readable then mark pageID as free as of the next commit
	if (v < effectiveOldestVersion()) {
		debug_printf(" DWALPager(%s) op=freeNow %s @% " PRId64 " oldestVersion=% " PRId64 " \n ", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion);
		freeList.pushBack(pageID);
		return;
	}

	// Otherwise add it to the delayed free list
	debug_printf(" DWALPager(%s) op=freeLater %s @% " PRId64 " oldestVersion=% " PRId64 " \n ", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion);
	delayedFreeList.pushBack({ v, pageID });
}
2019-08-16 18:24:55 +08:00
// Header pages use a page size of smallestPhysicalBlock
// If the user chosen physical page size is larger, then there will be a gap of unused space after
// between the end of page 1 and the start of page 2.
2019-11-04 19:04:03 +08:00
ACTOR static Future < Reference < IPage > > readHeaderPage ( DWALPager * self , PhysicalPageID pageID ) {
2019-10-26 05:52:06 +08:00
if ( g_network - > getCurrentTask ( ) > TaskPriority : : DiskRead ) {
wait ( delay ( 0 , TaskPriority : : DiskRead ) ) ;
}
2019-08-16 18:24:55 +08:00
state Reference < IPage > page ( new FastAllocatedPage ( smallestPhysicalBlock , smallestPhysicalBlock ) ) ;
int readBytes = wait ( self - > pageFile - > read ( page - > mutate ( ) , smallestPhysicalBlock , ( int64_t ) pageID * smallestPhysicalBlock ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) header op=read_complete %s bytes=%d \n " , self - > filename . c_str ( ) , toString ( pageID ) . c_str ( ) , readBytes ) ;
2019-08-16 18:24:55 +08:00
ASSERT ( readBytes = = smallestPhysicalBlock ) ;
return page ;
}
2019-11-04 19:04:03 +08:00
// Reads physical page `pageID` from the page file into a new page buffer and verifies its checksum.
// Throws checksum_failed (after logging a SevError trace event) if verification fails.
ACTOR static Future<Reference<IPage>> readPhysicalPage(DWALPager* self, PhysicalPageID pageID) {
	// Drop to DiskRead priority if currently running at a higher one
	if (g_network->getCurrentTask() > TaskPriority::DiskRead) {
		wait(delay(0, TaskPriority::DiskRead));
	}

	state Reference<IPage> page = self->newPageBuffer();
	debug_printf(" DWALPager(%s) op=read_physical_start %s \n ", self->filename.c_str(), toString(pageID).c_str());

	int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize));
	debug_printf(" DWALPager(%s) op=read_complete %s bytes=%d \n ", self->filename.c_str(), toString(pageID).c_str(), readBytes);

	// A short read is not tolerated
	ASSERT(readBytes == self->physicalPageSize);

	Page* p = (Page*)page.getPtr();
	if (!p->verifyChecksum(pageID)) {
		debug_printf(" DWALPager(%s) checksum failed for %s \n ", self->filename.c_str(), toString(pageID).c_str());
		Error e = checksum_failed();
		// The offset is computed with the same int64_t widening as the read above so that the
		// logged value is the offset that was actually read, even if PhysicalPageID is narrower
		// than 64 bits.
		TraceEvent(SevError, " DWALPagerChecksumFailed ")
		    .detail(" Filename ", self->filename.c_str())
		    .detail(" PageID ", pageID)
		    .detail(" PageSize ", self->physicalPageSize)
		    .detail(" Offset ", (int64_t)pageID * self->physicalPageSize)
		    .detail(" CalculatedChecksum ", p->calculateChecksum(pageID))
		    .detail(" ChecksumInPage ", p->getChecksum())
		    .error(e);
		throw e;
	}

	return page;
}
// Reads the most recent version of pageID either committed or written using updatePage()
2019-09-28 06:08:05 +08:00
Future < Reference < IPage > > readPage ( LogicalPageID pageID , bool cacheable ) override {
2019-09-29 04:26:01 +08:00
// Use cached page if present, without triggering a cache hit.
// Otherwise, read the page and return it but don't add it to the cache
2019-09-28 06:08:05 +08:00
if ( ! cacheable ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) op=read_nocache %s \n " , filename . c_str ( ) , toString ( pageID ) . c_str ( ) ) ;
2019-09-29 04:26:01 +08:00
PageCacheEntry * pCacheEntry = pageCache . getIfExists ( pageID ) ;
if ( pCacheEntry ! = nullptr ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) op=read_nocache_hit %s \n " , filename . c_str ( ) , toString ( pageID ) . c_str ( ) ) ;
2019-10-18 12:34:17 +08:00
return pCacheEntry - > readFuture ;
2019-09-29 04:26:01 +08:00
}
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) op=read_nocache_miss %s \n " , filename . c_str ( ) , toString ( pageID ) . c_str ( ) ) ;
2019-09-29 04:26:01 +08:00
return forwardError ( readPhysicalPage ( this , ( PhysicalPageID ) pageID ) , errorPromise ) ;
2019-09-28 06:08:05 +08:00
}
2019-08-07 17:36:33 +08:00
PageCacheEntry & cacheEntry = pageCache . get ( pageID ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) op=read %s cached=%d reading=%d writing=%d \n " , filename . c_str ( ) , toString ( pageID ) . c_str ( ) , cacheEntry . readFuture . isValid ( ) , cacheEntry . reading ( ) , cacheEntry . writing ( ) ) ;
2019-08-07 17:36:33 +08:00
2019-10-18 12:34:17 +08:00
if ( ! cacheEntry . readFuture . isValid ( ) ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) issuing actual read of %s \n " , filename . c_str ( ) , toString ( pageID ) . c_str ( ) ) ;
2019-10-18 12:34:17 +08:00
cacheEntry . readFuture = readPhysicalPage ( this , ( PhysicalPageID ) pageID ) ;
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
cacheEntry . readFuture = forwardError ( cacheEntry . readFuture , errorPromise ) ;
return cacheEntry . readFuture ;
}
// Reads pageID as of version v. If pageID has remaps, the remap with the greatest version <= v
// (if any) determines which page is actually read; otherwise pageID itself is read.
Future<Reference<IPage>> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable) {
	auto remapsForPage = remappedPages.find(pageID);
	if (remapsForPage == remappedPages.end()) {
		debug_printf(" DWALPager(%s) read %s @% " PRId64 " (not remapped) \n ", filename.c_str(), toString(pageID).c_str(), v);
		return readPage(pageID, cacheable);
	}

	// Find the first remap newer than v; the one just before it, if any, applies to v
	auto& byVersion = remapsForPage->second;
	auto applicable = byVersion.upper_bound(v);
	if (applicable != byVersion.begin()) {
		--applicable;
		debug_printf(" DWALPager(%s) read %s @% " PRId64 " -> %s \n ", filename.c_str(), toString(pageID).c_str(), v, toString(applicable->second).c_str());
		pageID = applicable->second;
	}

	return readPage(pageID, cacheable);
}
// Get snapshot as of the most recent committed version of the pager
2019-09-28 06:08:05 +08:00
Reference < IPagerSnapshot > getReadSnapshot ( Version v ) override ;
2019-10-18 16:27:00 +08:00
void addLatestSnapshot ( ) ;
2019-09-02 14:03:31 +08:00
2019-10-23 08:17:29 +08:00
// Set the pending oldest versiont to keep as of the next commit
2019-09-28 06:08:05 +08:00
void setOldestVersion ( Version v ) override {
2019-10-15 18:10:50 +08:00
ASSERT ( v > = pHeader - > oldestVersion ) ;
ASSERT ( v < = pHeader - > committedVersion ) ;
pHeader - > oldestVersion = v ;
expireSnapshots ( v ) ;
2019-09-02 14:03:31 +08:00
} ;
2019-10-23 08:17:29 +08:00
// Get the oldest version set as of the last commit, read from the last committed header.
Version getOldestVersion() override {
	Version committedOldest = pLastCommittedHeader->oldestVersion;
	return committedOldest;
}
2019-08-07 17:36:33 +08:00
2019-10-23 08:17:29 +08:00
// Calculate the *effective* oldest version, which can be older than the one set in the last commit since we
// are allowing active snapshots to temporarily delay page reuse.
// It is the lesser of the last committed oldestVersion and the version of the first snapshot in `snapshots`.
Version effectiveOldestVersion() {
	const Version committedOldest = pLastCommittedHeader->oldestVersion;
	const Version firstSnapshot = snapshots.front().version;
	return (firstSnapshot < committedOldest) ? firstSnapshot : committedOldest;
}
2019-11-04 19:04:03 +08:00
// Pops remap queue entries bounded by the effective oldest version (captured once, at start) and undoes them.
//
// For an entry that records a deferred free (newPageID == invalidLogicalPageID) the original page is freed.
// For an actual remap, the remapped page's data is copied over the original page, the remap is removed
// from remappedPages, and the remapped page is freed.
//
// The loop ends when remapUndoStop is set or when the queue yields no more eligible entries.
ACTOR static Future<Void> undoRemaps(DWALPager* self) {
	state RemappedPage cutoff;
	cutoff.version = self->effectiveOldestVersion();

	// TODO: Use parallel reads
	// TODO: One run of this actor might write to the same original page more than once, in which case just unmap
	// the latest
	loop {
		if (self->remapUndoStop) {
			break;
		}

		state Optional<RemappedPage> p = wait(self->remapQueue.pop(cutoff));
		if (!p.present()) {
			break;
		}
		debug_printf(" DWALPager(%s) undoRemaps popped %s \n ", self->filename.c_str(), p.get().toString().c_str());

		if (p.get().newPageID == invalidLogicalPageID) {
			debug_printf(" DWALPager(%s) undoRemaps freeing %s \n ", self->filename.c_str(), p.get().toString().c_str());
			self->freePage(p.get().originalPageID, p.get().version);
		} else {
			// Read the data from the page that the original was mapped to
			Reference<IPage> data = wait(self->readPage(p.get().newPageID, false));

			// Some page reads will mark the unused portion of the page as undefined to catch bugs with valgrind.
			// We are blindly copying the page data to a new location regardless of its format so mark all of it
			// defined.
			VALGRIND_MAKE_MEM_DEFINED(data->begin(), data->size());

			// Write the data to the original page so it can be read using its original pageID
			self->updatePage(p.get().originalPageID, data);

			// Remove the remap from this page, deleting the entry for the pageID if its map becomes empty
			auto i = self->remappedPages.find(p.get().originalPageID);
			if (i->second.size() == 1) {
				self->remappedPages.erase(i);
			} else {
				i->second.erase(p.get().version);
			}

			// Now that the remap has been undone nothing will read this page so it can be freed as of the next
			// commit.
			self->freePage(p.get().newPageID, 0);
		}
	}

	// numEntries is a 64-bit count, so it is printed with PRId64 rather than %d
	debug_printf(" DWALPager(%s) undoRemaps stopped, remapQueue size is % " PRId64 " \n ", self->filename.c_str(), self->remapQueue.numEntries);
	return Void();
}
// Actor implementing one commit of the pager. The visible steps, in order, are:
//   1. Queue a write of the last committed header page to page 1.
//   2. Stop the undoRemaps actor and wait for it to finish.
//   3. Flush the remap queue and record its state in the header.
//   4. Flush the free list and delayed free list together until neither reports pending work,
//      then record their states in the header.
//   5. Wait for all outstanding operations, then sync the page file.
//   6. Write the new header to page 0 and sync the page file again.
//   7. Update the in-memory last committed header, add a snapshot for it, expire old snapshots,
//      and start a new undoRemaps actor.
ACTOR static Future < Void > commit_impl ( DWALPager * self ) {
debug_printf ( " DWALPager(%s) commit begin \n " , self - > filename . c_str ( ) ) ;
2019-09-05 15:47:57 +08:00
2019-08-16 18:24:55 +08:00
// Write old committed header to Page 1
2019-09-02 14:03:31 +08:00
self - > operations . add ( self - > writeHeaderPage ( 1 , self - > lastCommittedHeaderPage ) ) ;
2019-11-04 19:04:03 +08:00
// Trigger the remap eraser to stop and then wait for it.
// undoRemaps() checks remapUndoStop at the top of each loop iteration.
self - > remapUndoStop = true ;
wait ( self - > remapUndoFuture ) ;
// Flush remap queue separately, it's not involved in free page management
wait ( self - > remapQueue . flush ( ) ) ;
self - > pHeader - > remapQueue = self - > remapQueue . getState ( ) ;
2019-10-24 00:31:06 +08:00
// Flush the free list and delayed free list queues together as they are used by freePage() and newPageID()
2019-10-15 18:10:50 +08:00
loop {
state bool freeBusy = wait ( self - > freeList . preFlush ( ) ) ;
state bool delayedFreeBusy = wait ( self - > delayedFreeList . preFlush ( ) ) ;
2019-10-18 16:27:00 +08:00
// Once preFlush() returns false for both queues then there are no more operations pending
// on either queue. If preFlush() returns true for either queue in one loop execution then
// it could have generated new work for itself or the other queue.
2019-10-15 18:10:50 +08:00
if ( ! freeBusy & & ! delayedFreeBusy ) {
break ;
}
}
self - > freeList . finishFlush ( ) ;
self - > delayedFreeList . finishFlush ( ) ;
2019-08-07 17:36:33 +08:00
2019-10-15 18:10:50 +08:00
// Record the flushed queue states in the header that is about to be written.
self - > pHeader - > freeList = self - > freeList . getState ( ) ;
self - > pHeader - > delayedFreeList = self - > delayedFreeList . getState ( ) ;
2019-08-14 19:41:12 +08:00
2019-08-07 17:36:33 +08:00
// Wait for all outstanding writes to complete
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) waiting for outstanding writes \n " , self - > filename . c_str ( ) ) ;
2019-09-02 14:03:31 +08:00
wait ( self - > operations . signalAndCollapse ( ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) Syncing \n " , self - > filename . c_str ( ) ) ;
2019-08-07 17:36:33 +08:00
// Sync everything except the header
2019-10-26 05:52:06 +08:00
// If the current task priority is above DiskWrite, yield to DiskWrite priority before syncing.
if ( g_network - > getCurrentTask ( ) > TaskPriority : : DiskWrite ) {
wait ( delay ( 0 , TaskPriority : : DiskWrite ) ) ;
}
2019-08-07 17:36:33 +08:00
wait ( self - > pageFile - > sync ( ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) commit version % " PRId64 " sync 1 \n " , self - > filename . c_str ( ) , self - > pHeader - > committedVersion ) ;
2019-08-07 17:36:33 +08:00
2019-08-08 17:57:23 +08:00
// Update header on disk and sync again.
2019-08-16 18:24:55 +08:00
wait ( self - > writeHeaderPage ( 0 , self - > headerPage ) ) ;
2019-10-26 05:52:06 +08:00
if ( g_network - > getCurrentTask ( ) > TaskPriority : : DiskWrite ) {
wait ( delay ( 0 , TaskPriority : : DiskWrite ) ) ;
}
2019-08-07 17:36:33 +08:00
wait ( self - > pageFile - > sync ( ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) commit version % " PRId64 " sync 2 \n " , self - > filename . c_str ( ) , self - > pHeader - > committedVersion ) ;
2019-08-07 17:36:33 +08:00
2019-08-16 18:24:55 +08:00
// Update the last committed header for use in the next commit.
self - > updateCommittedHeader ( ) ;
2019-09-02 14:03:31 +08:00
// Make the version that was just committed readable via getReadSnapshot().
self - > addLatestSnapshot ( ) ;
2019-08-07 17:36:33 +08:00
2019-10-18 16:27:00 +08:00
// Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use,
// because maybe some are no longer in use.
self - > expireSnapshots ( self - > pHeader - > oldestVersion ) ;
2019-11-04 19:04:03 +08:00
// Start unmapping pages for expired versions
self - > remapUndoStop = false ;
self - > remapUndoFuture = undoRemaps ( self ) ;
2019-08-07 17:36:33 +08:00
return Void ( ) ;
}
2019-09-28 06:08:05 +08:00
// Starts a commit by launching commit_impl(), wrapped in forwardError() with errorPromise, and
// returns the resulting future. The previous commit must already be finished.
Future<Void> commit() override {
    // Can't have more than one commit outstanding.
    ASSERT(commitFuture.isReady());

    Future<Void> pending = forwardError(commit_impl(this), errorPromise);
    commitFuture = pending;
    return commitFuture;
}
2019-09-28 06:08:05 +08:00
// Returns a copy of the meta key stored in the working header (pHeader).
// recoverFuture must be ready before this is called.
Key getMetaKey() const override {
    ASSERT(recoverFuture.isReady());
    Key metaKey = pHeader->getMetaKey();
    return metaKey;
}
2019-10-15 18:10:50 +08:00
// Stores v as committedVersion in the working header (pHeader); commit_impl() later writes that
// header to disk.
void setCommitVersion(Version v) override {
    Header& workingHeader = *pHeader;
    workingHeader.committedVersion = v;
}
2019-09-28 06:08:05 +08:00
// Stores metaKey in the working header (pHeader) via Header::setMetaKey(), which copies the bytes
// into the space directly after the header struct.
void setMetaKey(KeyRef metaKey) override {
    Header* const workingHeader = pHeader;
    workingHeader->setMetaKey(metaKey);
}
2019-11-04 19:04:03 +08:00
// Actor that tears the pager down and finally deletes it.
// self:    the pager to shut down; it is deleted at the end, so it must not be used afterwards.
// dispose: when true, the pager's file is also deleted before closedPromise is fulfilled.
// Cancels the recovery, commit and remap-undo futures, reports actor_cancelled on errorPromise if it
// can still be set, clears pending operations and the page cache, and releases the file reference.
ACTOR void shutdown ( DWALPager * self , bool dispose ) {
2019-08-07 17:36:33 +08:00
self - > recoverFuture . cancel ( ) ;
2019-08-11 18:26:00 +08:00
self - > commitFuture . cancel ( ) ;
2019-11-04 19:04:03 +08:00
self - > remapUndoFuture . cancel ( ) ;
2019-08-07 17:36:33 +08:00
2019-11-04 19:04:03 +08:00
// Only send the error if errorPromise can still be set.
if ( self - > errorPromise . canBeSet ( ) ) {
2019-08-07 17:36:33 +08:00
self - > errorPromise . sendError ( actor_cancelled ( ) ) ; // Ideally this should be shutdown_in_progress
2019-11-04 19:04:03 +08:00
}
2019-08-07 17:36:33 +08:00
2019-11-04 19:04:03 +08:00
self - > operations . clear ( ) ;
2019-08-11 18:26:00 +08:00
// Destroy the cache, cancelling reads and writes in progress
self - > pageCache . destroy ( ) ;
2019-08-07 17:36:33 +08:00
2019-11-04 19:04:03 +08:00
// Unreference the file and clear
2019-08-07 17:36:33 +08:00
self - > pageFile . clear ( ) ;
2019-08-17 20:21:54 +08:00
// The file reference was dropped above; on dispose, delete the file by name.
if ( dispose ) {
wait ( IAsyncFileSystem : : filesystem ( ) - > incrementalDeleteFile ( self - > filename , true ) ) ;
}
2019-08-07 17:36:33 +08:00
// Signal onClosed() waiters before the object is destroyed.
self - > closedPromise . send ( Void ( ) ) ;
delete self ;
}
2019-09-28 06:08:05 +08:00
void dispose ( ) override {
2019-08-07 17:36:33 +08:00
shutdown ( this , true ) ;
}
2019-09-28 06:08:05 +08:00
void close ( ) override {
2019-08-07 17:36:33 +08:00
shutdown ( this , false ) ;
}
2019-09-28 06:08:05 +08:00
// Returns the future side of errorPromise, which commit() forwards errors to and shutdown()
// fails with actor_cancelled.
Future<Void> getError() override {
    Future<Void> errorFuture = errorPromise.getFuture();
    return errorFuture;
}
2019-09-28 06:08:05 +08:00
// Returns the future side of closedPromise, which shutdown() fulfills just before deleting the pager.
Future<Void> onClosed() override {
    Future<Void> closedFuture = closedPromise.getFuture();
    return closedFuture;
}
2019-09-28 06:08:05 +08:00
// Reports storage statistics for this pager: free and total bytes of the disk holding the file's
// parent directory, the bytes occupied by the pager's pages, and the bytes available to the pager
// (disk free space plus pages it could reuse). recoverFuture must be ready.
StorageBytes getStorageBytes() override {
    ASSERT(recoverFuture.isReady());

    int64_t diskFree;
    int64_t diskTotal;
    g_network->getDiskBytes(parentDirectory(filename), diskFree, diskTotal);

    const int64_t pagerBytes = pHeader->pageCount * physicalPageSize;

    // It is not known exactly how many pages on the delayed free list are usable right now. It could
    // be known if each commit moved the freeable entries from the delayed free queue to the free
    // queue, but this doesn't seem necessary most of the time, so all delayed entries are counted.
    const int64_t reusableBytes = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize;

    return StorageBytes(diskFree, diskTotal, pagerBytes, diskFree + reusableBytes);
}
2019-10-24 00:31:06 +08:00
// Get the number of pages in use but not by the pager itself.
2019-11-04 19:04:03 +08:00
Future < int64_t > getUserPageCount ( ) override {
return map ( remapUndoFuture , [ = ] ( Void ) {
int64_t userPages = pHeader - > pageCount - 2 - freeList . numPages - freeList . numEntries - delayedFreeList . numPages - delayedFreeList . numEntries - remapQueue . numPages ;
debug_printf ( " DWALPager(%s) userPages=% " PRId64 " totalPageCount=% " PRId64 " freeQueuePages=% " PRId64 " freeQueueCount=% " PRId64 " delayedFreeQueuePages=% " PRId64 " delayedFreeQueueCount=% " PRId64 " remapQueuePages=% " PRId64 " remapQueueCount=% " PRId64 " \n " ,
filename . c_str ( ) , userPages , pHeader - > pageCount , freeList . numPages , freeList . numEntries , delayedFreeList . numPages , delayedFreeList . numEntries , remapQueue . numPages , remapQueue . numEntries ) ;
return userPages ;
} ) ;
2019-10-24 00:31:06 +08:00
}
2019-10-23 08:17:29 +08:00
// Returns recoverFuture; callers wait on it before using methods that assert it is ready.
Future<Void> init() override {
    Future<Void> recovered = recoverFuture;
    return recovered;
}
// Returns the committedVersion recorded in the last committed header.
Version getLatestVersion() override {
    const Version latest = pLastCommittedHeader->committedVersion;
    return latest;
}
private :
2019-11-04 19:04:03 +08:00
// Private, empty destructor. In the visible code the pager is destroyed only by shutdown(), which
// cancels outstanding work and then runs `delete self`.
~ DWALPager ( ) { }
2019-08-07 17:36:33 +08:00
2019-10-18 16:27:00 +08:00
// Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use.
void expireSnapshots ( Version v ) ;
2019-09-02 14:03:31 +08:00
2019-08-07 17:36:33 +08:00
# pragma pack(push, 1)
// Header is the format of page 0 of the database. commit_impl() also writes the previously
// committed header page to page 1.
// The struct is immediately followed in memory by metaKeySize bytes of meta key data, which is why
// getMetaKey()/setMetaKey() address `this + 1`.
struct Header {
2019-09-05 15:47:57 +08:00
// Format version that setMetaKey() asserts formatVersion is equal to.
static constexpr int FORMAT_VERSION = 1 ;
uint16_t formatVersion ;
2019-08-07 17:36:33 +08:00
uint32_t pageSize ;
// Total page count; used for the pager size in getStorageBytes() and getUserPageCount().
int64_t pageCount ;
// Queue states captured from the corresponding queues' getState() during commit.
FIFOQueue < LogicalPageID > : : QueueState freeList ;
2019-09-02 14:03:31 +08:00
FIFOQueue < DelayedFreePage > : : QueueState delayedFreeList ;
2019-11-04 19:04:03 +08:00
FIFOQueue < RemappedPage > : : QueueState remapQueue ;
2019-08-07 17:36:33 +08:00
Version committedVersion ;
2019-10-15 18:10:50 +08:00
Version oldestVersion ;
2019-08-07 17:36:33 +08:00
// Number of meta key bytes stored directly after this struct.
int32_t metaKeySize ;
2019-09-28 06:08:05 +08:00
// Returns a reference (not a copy) to the meta key bytes that follow the struct.
KeyRef getMetaKey ( ) const {
return KeyRef ( ( const uint8_t * ) ( this + 1 ) , metaKeySize ) ;
2019-08-07 17:36:33 +08:00
}
// Copies key into the space after the struct and records its length.
// The key must be smaller than the space left in a smallestPhysicalBlock after the fixed header.
void setMetaKey ( StringRef key ) {
ASSERT ( key . size ( ) < ( smallestPhysicalBlock - sizeof ( Header ) ) ) ;
metaKeySize = key . size ( ) ;
2019-09-28 06:08:05 +08:00
memcpy ( this + 1 , key . begin ( ) , key . size ( ) ) ;
2019-09-05 15:47:57 +08:00
ASSERT ( formatVersion = = FORMAT_VERSION ) ;
2019-08-07 17:36:33 +08:00
}
// Size in bytes of the fixed header plus the trailing meta key.
int size ( ) const {
return sizeof ( Header ) + metaKeySize ;
}
private :
// Declared private with no definition visible here; presumably a Header is only ever accessed by
// casting page memory rather than being constructed - confirm against the allocation site.
Header ( ) ;
} ;
# pragma pack(pop)
struct PageCacheEntry {
2019-10-18 12:34:17 +08:00
Future < Reference < IPage > > readFuture ;
2019-08-07 19:11:33 +08:00
Future < Void > writeFuture ;
2019-08-07 17:36:33 +08:00
2019-08-07 19:11:33 +08:00
bool reading ( ) const {
2019-10-18 12:34:17 +08:00
return readFuture . isValid ( ) & & ! readFuture . isReady ( ) ;
2019-08-07 19:11:33 +08:00
}
bool writing ( ) const {
return writeFuture . isValid ( ) & & ! writeFuture . isReady ( ) ;
2019-08-07 17:36:33 +08:00
}
2019-08-07 19:11:33 +08:00
bool evictable ( ) const {
// Don't evict if a page is still being read or written
2019-10-18 12:34:17 +08:00
return ! reading ( ) & & ! writing ( ) ;
2019-08-07 17:36:33 +08:00
}
2019-08-11 18:26:00 +08:00
void destroy ( ) {
2019-10-18 12:34:17 +08:00
readFuture . cancel ( ) ;
2019-08-11 18:26:00 +08:00
writeFuture . cancel ( ) ;
}
2019-08-07 17:36:33 +08:00
} ;
// Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires
// this in simulation, and it also makes sense for current SSDs.
// Allowing a smaller 'logical' page size is very useful for testing.
static constexpr int smallestPhysicalBlock = 4096 ;
int physicalPageSize ;
int logicalPageSize ; // In simulation testing it can be useful to use a small logical page size
2019-08-14 18:05:37 +08:00
int64_t pageCacheBytes ;
2019-08-07 17:36:33 +08:00
// The header will be written to / read from disk as a smallestPhysicalBlock sized chunk.
Reference < IPage > headerPage ;
Header * pHeader ;
int desiredPageSize ;
2019-08-16 18:24:55 +08:00
Reference < IPage > lastCommittedHeaderPage ;
Header * pLastCommittedHeader ;
2019-08-07 17:36:33 +08:00
std : : string filename ;
2019-08-14 18:05:37 +08:00
typedef ObjectCache < LogicalPageID , PageCacheEntry > PageCacheT ;
PageCacheT pageCache ;
2019-08-07 17:36:33 +08:00
Promise < Void > closedPromise ;
Promise < Void > errorPromise ;
Future < Void > commitFuture ;
2019-09-02 14:03:31 +08:00
SignalableActorCollection operations ;
2019-08-07 17:36:33 +08:00
Future < Void > recoverFuture ;
2019-11-04 19:04:03 +08:00
Future < Void > remapUndoFuture ;
bool remapUndoStop ;
2019-09-02 14:03:31 +08:00
2019-08-07 17:36:33 +08:00
Reference < IAsyncFile > pageFile ;
LogicalPageQueueT freeList ;
2019-11-04 19:04:03 +08:00
2019-09-05 15:47:57 +08:00
// The delayed free list will be approximately in Version order.
// TODO: Make this an ordered container some day.
2019-11-04 19:04:03 +08:00
DelayedFreePageQueueT delayedFreeList ;
RemapQueueT remapQueue ;
2019-09-02 14:03:31 +08:00
// Bookkeeping for one readable snapshot held in the `snapshots` deque.
struct SnapshotEntry {
// Version of the snapshot; addLatestSnapshot() sets it from pLastCommittedHeader->committedVersion.
Version version ;
// Receives transaction_too_old when expireSnapshots() drops this entry; the snapshot holds its future.
Promise < Void > expired ;
2019-11-04 19:04:03 +08:00
// The snapshot object returned by getReadSnapshot().
Reference < DWALPagerSnapshot > snapshot ;
2019-09-02 14:03:31 +08:00
} ;
struct SnapshotEntryLessThanVersion {
bool operator ( ) ( Version v , const SnapshotEntry & snapshot ) {
return v < snapshot . version ;
}
bool operator ( ) ( const SnapshotEntry & snapshot , Version v ) {
return snapshot . version < v ;
}
} ;
2019-11-04 19:04:03 +08:00
// TODO: Better data structure
std : : unordered_map < LogicalPageID , std : : map < Version , LogicalPageID > > remappedPages ;
2019-09-02 14:03:31 +08:00
std : : deque < SnapshotEntry > snapshots ;
2019-08-07 17:36:33 +08:00
} ;
// Prevents pager from reusing freed pages from version until the snapshot is destroyed
2019-11-04 19:04:03 +08:00
class DWALPagerSnapshot : public IPagerSnapshot , public ReferenceCounted < DWALPagerSnapshot > {
2019-08-07 17:36:33 +08:00
public :
2019-11-04 19:04:03 +08:00
DWALPagerSnapshot ( DWALPager * pager , Key meta , Version version , Future < Void > expiredFuture ) : pager ( pager ) , metaKey ( meta ) , version ( version ) , expired ( expiredFuture ) {
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
virtual ~ DWALPagerSnapshot ( ) {
2019-08-07 17:36:33 +08:00
}
2019-09-28 06:08:05 +08:00
Future < Reference < const IPage > > getPhysicalPage ( LogicalPageID pageID , bool cacheable ) override {
2019-09-02 14:03:31 +08:00
if ( expired . isError ( ) ) {
throw expired . getError ( ) ;
}
2019-11-04 19:04:03 +08:00
return map ( pager - > readPageAtVersion ( pageID , version , cacheable ) , [ = ] ( Reference < IPage > p ) {
2019-08-07 17:36:33 +08:00
return Reference < const IPage > ( p ) ;
} ) ;
}
2019-09-28 06:08:05 +08:00
Key getMetaKey ( ) const override {
2019-08-07 17:36:33 +08:00
return metaKey ;
}
2019-09-28 06:08:05 +08:00
Version getVersion ( ) const override {
2019-08-07 17:36:33 +08:00
return version ;
}
2019-09-28 06:08:05 +08:00
void addref ( ) override {
2019-11-04 19:04:03 +08:00
ReferenceCounted < DWALPagerSnapshot > : : addref ( ) ;
2019-08-07 17:36:33 +08:00
}
2019-09-28 06:08:05 +08:00
void delref ( ) override {
2019-11-04 19:04:03 +08:00
ReferenceCounted < DWALPagerSnapshot > : : delref ( ) ;
2019-08-07 17:36:33 +08:00
}
2019-11-04 19:04:03 +08:00
DWALPager * pager ;
2019-09-02 14:03:31 +08:00
Future < Void > expired ;
2019-08-07 17:36:33 +08:00
Version version ;
Key metaKey ;
} ;
2019-11-04 19:04:03 +08:00
void DWALPager : : expireSnapshots ( Version v ) {
debug_printf ( " DWALPager(%s) expiring snapshots through % " PRId64 " snapshot count %d \n " , filename . c_str ( ) , v , ( int ) snapshots . size ( ) ) ;
2019-10-18 16:27:00 +08:00
while ( snapshots . size ( ) > 1 & & snapshots . front ( ) . version < v & & snapshots . front ( ) . snapshot - > isSoleOwner ( ) ) {
2019-11-04 19:04:03 +08:00
debug_printf ( " DWALPager(%s) expiring snapshot for % " PRId64 " soleOwner=%d \n " , filename . c_str ( ) , snapshots . front ( ) . version , snapshots . front ( ) . snapshot - > isSoleOwner ( ) ) ;
2019-10-18 16:27:00 +08:00
// The snapshot contract could be made such that the expired promise isn't need anymore. In practice it
// probably is already not needed but it will gracefully handle the case where a user begins a page read
// with a snapshot reference, keeps the page read future, and drops the snapshot reference.
snapshots . front ( ) . expired . sendError ( transaction_too_old ( ) ) ;
snapshots . pop_front ( ) ;
}
}
2019-11-04 19:04:03 +08:00
// Returns the snapshot with the greatest version that is not greater than v.
// Throws version_invalid() when every snapshot is newer than v. At least one snapshot must exist.
Reference<IPagerSnapshot> DWALPager::getReadSnapshot(Version v) {
    ASSERT(!snapshots.empty());

    // First entry whose version is greater than v; the wanted entry is the one just before it.
    auto firstNewer = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion());
    if (firstNewer == snapshots.begin()) {
        throw version_invalid();
    }
    return (firstNewer - 1)->snapshot;
}
2019-11-04 19:04:03 +08:00
void DWALPager : : addLatestSnapshot ( ) {
2019-09-02 14:03:31 +08:00
Promise < Void > expired ;
snapshots . push_back ( {
pLastCommittedHeader - > committedVersion ,
expired ,
2019-11-04 19:04:03 +08:00
Reference < DWALPagerSnapshot > ( new DWALPagerSnapshot ( this , pLastCommittedHeader - > getMetaKey ( ) , pLastCommittedHeader - > committedVersion , expired . getFuture ( ) ) )
2019-09-02 14:03:31 +08:00
} ) ;
2019-08-07 17:36:33 +08:00
}
2018-07-23 18:09:13 +08:00
2019-09-02 14:03:31 +08:00
2019-05-29 21:23:32 +08:00
// TODO: Move this to a flow header once it is mature.
struct SplitStringRef {
StringRef a ;
StringRef b ;
SplitStringRef ( StringRef a = StringRef ( ) , StringRef b = StringRef ( ) ) : a ( a ) , b ( b ) {
}
SplitStringRef ( Arena & arena , const SplitStringRef & toCopy )
: a ( toStringRef ( arena ) ) , b ( ) {
}
SplitStringRef prefix ( int len ) const {
if ( len < = a . size ( ) ) {
return SplitStringRef ( a . substr ( 0 , len ) ) ;
}
len - = a . size ( ) ;
return SplitStringRef ( a , b . substr ( 0 , len ) ) ;
}
StringRef toStringRef ( Arena & arena ) const {
StringRef c = makeString ( size ( ) , arena ) ;
memcpy ( mutateString ( c ) , a . begin ( ) , a . size ( ) ) ;
memcpy ( mutateString ( c ) + a . size ( ) , b . begin ( ) , b . size ( ) ) ;
return c ;
}
Standalone < StringRef > toStringRef ( ) const {
Arena a ;
return Standalone < StringRef > ( toStringRef ( a ) , a ) ;
}
int size ( ) const {
return a . size ( ) + b . size ( ) ;
}
int expectedSize ( ) const {
return size ( ) ;
}
std : : string toString ( ) const {
return format ( " %s%s " , a . toString ( ) . c_str ( ) , b . toString ( ) . c_str ( ) ) ;
}
std : : string toHexString ( ) const {
return format ( " %s%s " , a . toHexString ( ) . c_str ( ) , b . toHexString ( ) . c_str ( ) ) ;
}
struct const_iterator {
const uint8_t * ptr ;
const uint8_t * end ;
const uint8_t * next ;
inline bool operator = = ( const const_iterator & rhs ) const {
return ptr = = rhs . ptr ;
}
inline const_iterator & operator + + ( ) {
+ + ptr ;
if ( ptr = = end ) {
ptr = next ;
}
return * this ;
}
inline const_iterator & operator + ( int n ) {
ptr + = n ;
if ( ptr > = end ) {
ptr = next + ( ptr - end ) ;
}
return * this ;
}
inline uint8_t operator * ( ) const {
return * ptr ;
}
} ;
inline const_iterator begin ( ) const {
return { a . begin ( ) , a . end ( ) , b . begin ( ) } ;
}
inline const_iterator end ( ) const {
return { b . end ( ) } ;
}
template < typename StringT >
int compare ( const StringT & rhs ) const {
auto j = begin ( ) ;
auto k = rhs . begin ( ) ;
auto jEnd = end ( ) ;
auto kEnd = rhs . end ( ) ;
while ( j ! = jEnd & & k ! = kEnd ) {
int cmp = * j - * k ;
if ( cmp ! = 0 ) {
return cmp ;
}
}
// If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less than rhs
if ( j = = jEnd ) {
return k = = kEnd ? 0 : - 1 ;
}
return 1 ;
}
} ;
2019-09-28 06:08:05 +08:00
// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together.
// NOTE: Uses host byte order
typedef VectorRef < LogicalPageID > BTreePageID ;
2019-10-15 18:10:50 +08:00
std : : string toString ( BTreePageID id ) {
return std : : string ( " BTreePageID " ) + toString ( id . begin ( ) , id . end ( ) ) ;
}
2019-02-21 18:46:30 +08:00
# define STR(x) LiteralStringRef(x)
struct RedwoodRecordRef {
2019-05-29 21:23:32 +08:00
typedef uint8_t byte ;
2019-02-21 18:46:30 +08:00
2019-04-30 08:00:29 +08:00
// Builds a record from its parts without copying any bytes: key and value keep referencing the
// caller's memory. chunkTotal and chunkStart populate chunk.total and chunk.start.
RedwoodRecordRef(KeyRef key = KeyRef(),
                 Version ver = 0,
                 Optional<ValueRef> value = {},
                 uint32_t chunkTotal = 0,
                 uint32_t chunkStart = 0)
  : key(key), value(value), version(ver), chunk({ chunkTotal, chunkStart }) {}
2019-05-29 21:23:32 +08:00
// Deep-copying constructor: the key, and the value when present, are copied into arena; the integer
// fields are copied by value.
RedwoodRecordRef(Arena& arena, const RedwoodRecordRef& toCopy)
  : key(arena, toCopy.key), version(toCopy.version), chunk(toCopy.chunk) {
    // An absent value stays absent in the copy.
    if (!toCopy.value.present()) {
        return;
    }
    value = ValueRef(arena, toCopy.value.get());
}
2019-05-29 21:23:32 +08:00
// Builds a record from a key, an optional value and the 14-byte big endian integer field array
// produced by serializeIntFields(); version and chunk are decoded from that array.
RedwoodRecordRef(KeyRef key, Optional<ValueRef> value, const byte intFields[14]) : key(key), value(value) {
    this->deserializeIntFields(intFields);
}
2019-06-04 19:03:52 +08:00
// RedwoodRecordRefs are used for both internal and leaf pages of the BTree.
// Boundary records in internal pages are made from leaf records.
// These functions make creating and working with internal page records more convenient.
2019-09-28 06:08:05 +08:00
// Interprets the value bytes as an array of LogicalPageIDs and returns it as a BTreePageID that
// references the value's memory. The value must be present.
inline BTreePageID getChildPage() const {
    ASSERT(value.present());
    const ValueRef raw = value.get();
    const int idCount = raw.size() / sizeof(LogicalPageID);
    return BTreePageID((LogicalPageID*)raw.begin(), idCount);
}
// Points the value at the raw bytes of id's page ID array. No copy is made, so id's memory must
// outlive this record's use of the value.
inline void setChildPage(BTreePageID id) {
    const uint8_t* const bytes = (const uint8_t*)id.begin();
    const int byteCount = id.size() * sizeof(LogicalPageID);
    value = ValueRef(bytes, byteCount);
}
2019-09-28 06:08:05 +08:00
// Sets the value to a copy, allocated from arena, of the raw bytes of id's page ID array.
inline void setChildPage(Arena& arena, BTreePageID id) {
    const uint8_t* const bytes = (const uint8_t*)id.begin();
    const int byteCount = id.size() * sizeof(LogicalPageID);
    value = ValueRef(arena, bytes, byteCount);
}
2019-09-28 06:08:05 +08:00
// Returns a copy of this record whose value references the raw bytes of id's page ID array; key,
// version and chunk fields are carried over unchanged.
inline RedwoodRecordRef withPageID(BTreePageID id) const {
    const ValueRef pageIdBytes((const uint8_t*)id.begin(), id.size() * sizeof(LogicalPageID));
    return RedwoodRecordRef(key, version, pageIdBytes, chunk.total, chunk.start);
}
// Returns a copy of this record with the same key, version and chunk fields but no value.
inline RedwoodRecordRef withoutValue() const {
    RedwoodRecordRef stripped(key, version, {}, chunk.total, chunk.start);
    return stripped;
}
2019-05-29 21:23:32 +08:00
// Returns how many bytes are in common between the integer fields of *this and other, assuming that
// all values are BigEndian, version is 64 bits, chunk total is 24 bits, and chunk start is 24 bits
int getCommonIntFieldPrefix ( const RedwoodRecordRef & other ) const {
if ( version ! = other . version ) {
return clzll ( version ^ other . version ) > > 3 ;
}
if ( chunk . total ! = other . chunk . total ) {
// the -1 is because we are only considering the lower 3 bytes
return 8 + ( clz ( chunk . total ^ other . chunk . total ) > > 3 ) - 1 ;
}
if ( chunk . start ! = other . chunk . start ) {
// the -1 is because we are only considering the lower 3 bytes
return 11 + ( clz ( chunk . start ^ other . chunk . start ) > > 3 ) - 1 ;
}
return 14 ;
}
2019-05-30 17:10:07 +08:00
// Truncate (key, version, chunk.total, chunk.start) tuple to len bytes.
void truncate ( int len ) {
if ( len < = key . size ( ) ) {
key = key . substr ( 0 , len ) ;
version = 0 ;
chunk . total = 0 ;
chunk . start = 0 ;
}
else {
byte fields [ intFieldArraySize ] ;
serializeIntFields ( fields ) ;
int end = len - key . size ( ) ;
for ( int i = intFieldArraySize - 1 ; i > = end ; - - i ) {
fields [ i ] = 0 ;
}
}
}
2019-05-29 21:23:32 +08:00
// Returns the length of the common prefix of two records, where a record is its key bytes followed
// by its integer fields. The caller asserts that the first `skip` bytes are already known to match,
// so the key comparison starts there. Integer fields are only compared when the keys are identical.
inline int getCommonPrefixLen(const RedwoodRecordRef& other, int skip) const {
    const int skipStart = std::min(skip, key.size());
    const int shorterKeyLen = std::min(other.key.size(), key.size());

    int common = skipStart + commonPrefixLength(key.begin() + skipStart, other.key.begin() + skipStart, shorterKeyLen - skipStart);

    const bool keysIdentical = (common == key.size()) && (key.size() == other.key.size());
    if (keysIdentical) {
        common += getCommonIntFieldPrefix(other);
    }
    return common;
}
static const int intFieldArraySize = 14 ;
// Write big endian values of version (64 bits), total (24 bits), and start (24 bits) fields
// to an array of 14 bytes: version in bytes 0-7, total in bytes 8-10, start in bytes 11-13.
// Each 24 bit field is produced by writing a 32 bit big endian value one byte early and letting the
// following write overwrite its most significant byte, so the three writes MUST stay in this order.
// memcpy is used because dst + 7 and dst + 10 are not suitably aligned for direct 32 bit stores.
void serializeIntFields(byte* dst) const {
    const uint32_t beStart = bigEndian32(chunk.start);
    const uint32_t beTotal = bigEndian32(chunk.total);
    const uint64_t beVersion = bigEndian64(version);

    memcpy(dst + 10, &beStart, sizeof(beStart));     // bytes 10-13
    memcpy(dst + 7, &beTotal, sizeof(beTotal));      // bytes 7-10, overwrites start's high byte
    memcpy(dst, &beVersion, sizeof(beVersion));      // bytes 0-7, overwrites total's high byte
}
// Initialize int fields from the array format that serializeIntFields produces
void deserializeIntFields ( const byte * src ) {
version = bigEndian64 ( * ( uint64_t * ) src ) ;
chunk . total = bigEndian32 ( * ( uint32_t * ) ( src + 7 ) ) & 0xffffff ;
chunk . start = bigEndian32 ( * ( uint32_t * ) ( src + 10 ) ) & 0xffffff ;
}
// TODO: Use SplitStringRef (unless it ends up being slower)
2019-02-21 18:46:30 +08:00
KeyRef key ;
Optional < ValueRef > value ;
2019-05-29 21:23:32 +08:00
Version version ;
2019-04-30 08:00:29 +08:00
struct {
uint32_t total ;
2019-09-29 04:27:00 +08:00
// TODO: Change start to chunk number?
2019-04-30 08:00:29 +08:00
uint32_t start ;
} chunk ;
2019-02-21 18:46:30 +08:00
int expectedSize ( ) const {
2019-04-30 08:00:29 +08:00
return key . expectedSize ( ) + value . expectedSize ( ) ;
2019-02-21 18:46:30 +08:00
}
bool isMultiPart ( ) const {
2019-05-29 21:23:32 +08:00
return chunk . total ! = 0 ;
2019-02-21 18:46:30 +08:00
}
// Generate a kv shard from a complete kv: the result has the same key and version, a value
// referencing len bytes of this value beginning at start, chunk.total set to the full value size
// and chunk.start set to start. This record must not itself be a chunk.
RedwoodRecordRef split(int start, int len) {
    ASSERT(!isMultiPart());
    const ValueRef whole = value.get();
    return RedwoodRecordRef(key, version, whole.substr(start, len), whole.size(), start);
}
2019-05-29 21:23:32 +08:00
// Sequential writer over a raw byte buffer; wptr always points at the next byte to be written.
// No bounds checking is performed.
class Writer {
public:
    Writer(byte* ptr) : wptr(ptr) {}

    byte* wptr;

    // Writes the raw in-memory representation of `in` and advances by sizeof(T).
    template <typename T>
    void write(const T& in) {
        T* const slot = (T*)wptr;
        *slot = in;
        wptr += sizeof(T);
    }

    // Write a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit.
    // Values > 15 bits in length are not valid input but this is not checked for.
    void writeVarInt(int x) {
        const bool needsTwoBytes = (x >= 128);
        if (needsTwoBytes) {
            // High byte first, flagged with the extension bit
            *wptr = (uint8_t)((x >> 8) | 0x80);
            ++wptr;
        }
        *wptr = (uint8_t)x;
        ++wptr;
    }

    // Writes the bytes of s with no length prefix.
    void writeString(StringRef s) {
        const int len = s.size();
        memcpy(wptr, s.begin(), len);
        wptr += len;
    }
};
// Sequential reader over a raw byte buffer; rptr always points at the next unread byte.
// No bounds checking is performed.
class Reader {
public:
    Reader(const void* ptr) : rptr((const byte*)ptr) {}

    const byte* rptr;

    // Reads the raw in-memory representation of a T and advances by sizeof(T).
    template <typename T>
    T read() {
        const T* const slot = (const T*)rptr;
        T r = *slot;
        rptr += sizeof(T);
        return r;
    }

    // Read a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit.
    int readVarInt() {
        const int first = *rptr++;
        // Without the high bit the value fits in this single byte
        if ((first & 0x80) == 0) {
            return first;
        }
        // Otherwise the low 7 bits are the high byte and the next byte is the low byte
        const int second = *rptr++;
        return ((first & 0x7f) << 8) | second;
    }

    // Returns a StringRef over the next len bytes (no copy) and advances past them.
    StringRef readString(int len) {
        const byte* const start = rptr;
        rptr += len;
        return StringRef(start, len);
    }

    // Returns a pointer to the next len bytes and advances past them.
    const byte* readBytes(int len) {
        const byte* const start = rptr;
        rptr += len;
        return start;
    }
};
2019-02-21 18:46:30 +08:00
# pragma pack(push,1)
// Delta is the serialized difference between a RedwoodRecordRef and a base record. The struct
// itself only holds the flags byte; the variable length remainder follows it directly in memory
// and is reached through data().
struct Delta {
2019-05-29 21:23:32 +08:00
// Serialized Format (in the order that apply() and size() read it)
//
// 1 byte for Flags + a 4 bit length
// borrow source is prev ancestor - 0 or 1
// has_key_suffix
// has_value
// has_version
// other_fields (int field) suffix len - 4 bits
//
// 1 or 2 bytes for Prefix Borrow Length (hi bit indicates presence of second byte)
//
// IF has_value is set
// 1 byte value length
//
// IF has_key_suffix is set
// 1 or 2 bytes for Key Suffix Length
//
// Key suffix bytes
// Int field (meta) suffix bytes
// Value bytes
//
// For a series of RedwoodRecordRef's containing shards of the same KV pair where the key size is < 104 bytes,
// the overhead per middle chunk is 7 bytes:
// 4 bytes of child pointers in the DeltaTree Node
// 1 flag byte
// 1 prefix borrow length byte
// 1 meta suffix byte describing chunk start position
enum EFlags {
PREFIX_SOURCE = 0x80 ,
HAS_KEY_SUFFIX = 0x40 ,
HAS_VALUE = 0x20 ,
HAS_VERSION = 0x10 ,
INT_FIELD_SUFFIX_BITS = 0x0f
} ;
2019-02-21 18:46:30 +08:00
uint8_t flags ;
2019-07-02 15:58:43 +08:00
// Start of the variable length payload that follows the flags byte.
inline byte * data ( ) {
return ( byte * ) ( this + 1 ) ;
}
inline const byte * data ( ) const {
return ( const byte * ) ( this + 1 ) ;
}
2019-05-29 21:23:32 +08:00
// Sets or clears the PREFIX_SOURCE flag, leaving all other flag bits unchanged.
void setPrefixSource ( bool val ) {
if ( val ) {
flags | = PREFIX_SOURCE ;
2019-04-30 08:00:29 +08:00
}
else {
2019-05-29 21:23:32 +08:00
flags & = ~ PREFIX_SOURCE ;
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
bool getPrefixSource ( ) const {
return flags & PREFIX_SOURCE ;
}
// Reconstructs the full record by applying this delta to base.
// The borrowed prefix is counted over base's key bytes followed by its 14 serialized int field bytes.
// The key is allocated from arena only when there is a key suffix; otherwise it references base's key.
// The returned value, when present, references this delta's memory.
RedwoodRecordRef apply ( const RedwoodRecordRef & base , Arena & arena ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
int prefixLen = r . readVarInt ( ) ;
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
StringRef k ;
2019-09-02 14:03:31 +08:00
// Separate the borrowed key string byte count from the borrowed int field byte count
2019-05-29 21:23:32 +08:00
int keyPrefixLen = std : : min ( prefixLen , base . key . size ( ) ) ;
int intFieldPrefixLen = prefixLen - keyPrefixLen ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
2019-09-02 14:03:31 +08:00
// If there is a key suffix, reconstitute the complete key into a contiguous string
2019-05-29 21:23:32 +08:00
if ( keySuffixLen > 0 ) {
k = makeString ( keyPrefixLen + keySuffixLen , arena ) ;
memcpy ( mutateString ( k ) , base . key . begin ( ) , keyPrefixLen ) ;
memcpy ( mutateString ( k ) + keyPrefixLen , r . readString ( keySuffixLen ) . begin ( ) , keySuffixLen ) ;
}
else {
k = base . key . substr ( 0 , keyPrefixLen ) ;
}
// Now decode the integer fields
const byte * intFieldSuffix = r . readBytes ( intFieldSuffixLen ) ;
// Create big endian array in which to reassemble the integer fields from prefix and suffix bytes
byte intFields [ intFieldArraySize ] ;
// If borrowing any bytes, get the source's integer field array
if ( intFieldPrefixLen > 0 ) {
base . serializeIntFields ( intFields ) ;
}
else {
memset ( intFields , 0 , intFieldArraySize ) ;
}
// Version offset is used to skip the version bytes in the int field array when version is missing (aka 0)
2019-05-30 07:26:58 +08:00
int versionOffset = ( ( intFieldPrefixLen = = 0 ) & & ( ~ flags & HAS_VERSION ) ) ? 8 : 0 ;
2019-05-29 21:23:32 +08:00
// If there are suffix bytes, copy those into place after the prefix
if ( intFieldSuffixLen > 0 ) {
memcpy ( intFields + versionOffset + intFieldPrefixLen , intFieldSuffix , intFieldSuffixLen ) ;
}
// Zero out any remaining bytes if the array was initialized from base
if ( intFieldPrefixLen > 0 ) {
for ( int i = versionOffset + intFieldPrefixLen + intFieldSuffixLen ; i < intFieldArraySize ; + + i ) {
intFields [ i ] = 0 ;
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
return RedwoodRecordRef ( k , flags & HAS_VALUE ? r . readString ( valueLen ) : Optional < ValueRef > ( ) , intFields ) ;
}
// Total serialized size in bytes: the flags byte, the length fields, and the key suffix, int field
// suffix and value bytes.
int size ( ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
2019-06-18 09:55:49 +08:00
r . readVarInt ( ) ; // prefixlen
2019-05-29 21:23:32 +08:00
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
2019-07-02 15:58:43 +08:00
// r.rptr - data() is the number of bytes consumed by the length fields read above.
return sizeof ( Delta ) + r . rptr - data ( ) + intFieldSuffixLen + valueLen + keySuffixLen ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
// Delta can't be determined without the RedwoodRecordRef upon which the Delta is based.
// This therefore only describes the encoded lengths and flags, plus a hex dump of the raw bytes.
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
2019-05-30 17:10:07 +08:00
std : : string flagString = " " ;
2019-05-30 07:26:58 +08:00
if ( flags & PREFIX_SOURCE ) flagString + = " prefixSource " ;
if ( flags & HAS_KEY_SUFFIX ) flagString + = " keySuffix " ;
if ( flags & HAS_VERSION ) flagString + = " Version " ;
if ( flags & HAS_VALUE ) flagString + = " Value " ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
int prefixLen = r . readVarInt ( ) ;
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
2019-05-30 17:10:07 +08:00
return format ( " len: %d flags: %s prefixLen: %d keySuffixLen: %d intFieldSuffix: %d valueLen %d raw: %s " ,
size ( ) , flagString . c_str ( ) , prefixLen , keySuffixLen , intFieldSuffixLen , valueLen , StringRef ( ( const uint8_t * ) this , size ( ) ) . toHexString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
}
} ;
2019-09-02 14:03:31 +08:00
// Using this class as an alternative for Delta enables reading a DeltaTree<RecordRef> while only decoding
// its values, so the Reader does not require the original prev/next ancestors.
struct DeltaValueOnly : Delta {
    // Decodes only the value of this Delta. The returned record has an empty key and
    // version 0; base and arena are not used.
    RedwoodRecordRef apply(const RedwoodRecordRef& base, Arena& arena) const {
        Reader reader(data());
        const bool hasValue = (flags & HAS_VALUE) != 0;

        // Prefix length is not needed, just step over it.
        reader.readVarInt();

        // Value length is stored before the key suffix, while the value bytes come last.
        const int valueLen = hasValue ? reader.read<uint8_t>() : 0;

        // Step over the key suffix length and its bytes when present.
        if (flags & HAS_KEY_SUFFIX) {
            reader.readString(reader.readVarInt());
        }

        // Step over the int field suffix bytes (zero-length when absent).
        reader.readBytes(flags & INT_FIELD_SUFFIX_BITS);

        Optional<ValueRef> decodedValue;
        if (hasValue) {
            decodedValue = reader.readString(valueLen);
        }
        return RedwoodRecordRef(StringRef(), 0, decodedValue);
    }
};
2019-02-21 18:46:30 +08:00
# pragma pack(pop)
2019-06-04 19:03:52 +08:00
// Compares and orders by key, version, chunk.start, chunk.total.
// Value is not considered, as it does not make sense for a container
// to have two records which differ only in value.
2019-02-21 18:46:30 +08:00
int compare ( const RedwoodRecordRef & rhs ) const {
int cmp = key . compare ( rhs . key ) ;
if ( cmp = = 0 ) {
cmp = version - rhs . version ;
if ( cmp = = 0 ) {
2019-05-22 10:16:32 +08:00
// It is assumed that in any data set there will never be more than one
// unique chunk total size for the same key and version, so sort by start, total
// Chunked (represented by chunk.total > 0) sorts higher than whole
cmp = chunk . start - rhs . chunk . start ;
if ( cmp = = 0 ) {
2019-04-30 08:00:29 +08:00
cmp = chunk . total - rhs . chunk . total ;
2019-02-21 18:46:30 +08:00
}
}
}
return cmp ;
}
2019-06-04 19:03:52 +08:00
// Compares key fields and value for equality
// True when rhs matches on every ordering field (see compare()) and also has an equal value.
bool identical(const RedwoodRecordRef& rhs) const {
    if (compare(rhs) != 0) {
        return false;
    }
    return value == rhs.value;
}
2019-02-21 18:46:30 +08:00
bool operator = = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) = = 0 ;
}
2019-06-04 19:03:52 +08:00
bool operator ! = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) ! = 0 ;
}
bool operator < ( const RedwoodRecordRef & rhs ) const {
2019-02-21 18:46:30 +08:00
return compare ( rhs ) < 0 ;
}
bool operator > ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) > 0 ;
}
bool operator < = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) < = 0 ;
}
bool operator > = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) > = 0 ;
}
2019-06-06 11:58:47 +08:00
int deltaSize ( const RedwoodRecordRef & base , bool worstCase = true ) const {
2019-05-29 21:23:32 +08:00
int size = sizeof ( Delta ) ;
2019-02-21 18:46:30 +08:00
if ( value . present ( ) ) {
2019-05-29 21:23:32 +08:00
size + = value . get ( ) . size ( ) ;
+ + size ;
}
int prefixLen = getCommonPrefixLen ( base , 0 ) ;
2019-06-06 11:58:47 +08:00
size + = ( worstCase | | prefixLen > = 128 ) ? 2 : 1 ;
2019-05-29 21:23:32 +08:00
int intFieldPrefixLen ;
// Currently using a worst-guess guess where int fields in suffix are stored in their entirety if nonzero.
if ( prefixLen < key . size ( ) ) {
int keySuffixLen = key . size ( ) - prefixLen ;
2019-06-06 11:58:47 +08:00
size + = ( worstCase | | keySuffixLen > = 128 ) ? 2 : 1 ;
2019-05-29 21:23:32 +08:00
size + = keySuffixLen ;
intFieldPrefixLen = 0 ;
}
else {
intFieldPrefixLen = prefixLen - key . size ( ) ;
2019-06-06 11:58:47 +08:00
if ( worstCase ) {
size + = 2 ;
}
2019-05-29 21:23:32 +08:00
}
if ( version = = 0 & & chunk . total = = 0 & & chunk . start = = 0 ) {
// No int field suffix needed
}
else {
byte fields [ intFieldArraySize ] ;
serializeIntFields ( fields ) ;
const byte * end = fields + intFieldArraySize - 1 ;
int trailingNulls = 0 ;
while ( * end - - = = 0 ) {
+ + trailingNulls ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
size + = std : : max ( 0 , intFieldArraySize - intFieldPrefixLen - trailingNulls ) ;
2019-05-30 09:06:11 +08:00
if ( intFieldPrefixLen = = 0 & & version = = 0 ) {
size - = 8 ;
}
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
return size ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
// commonPrefix between *this and base can be passed if known
// Encodes *this into d as a Delta relative to base and returns the number of bytes written,
// including the fixed Delta header. A negative commonPrefix means "compute it here".
// Stored order: prefix length, [value length], [key suffix length + bytes],
// [int field suffix], [value bytes].
int writeDelta(Delta& d, const RedwoodRecordRef& base, int commonPrefix = -1) const {
    d.flags = (version == 0) ? 0 : Delta::HAS_VERSION;

    if (commonPrefix < 0) {
        commonPrefix = getCommonPrefixLen(base, 0);
    }

    Writer writer(d.data());

    // prefixLen
    writer.writeVarInt(commonPrefix);

    // valueLen (single byte)
    if (value.present()) {
        d.flags |= Delta::HAS_VALUE;
        writer.write<uint8_t>(value.get().size());
    }

    // keySuffixLen followed by the key suffix itself
    if (key.size() > commonPrefix) {
        d.flags |= Delta::HAS_KEY_SUFFIX;
        StringRef keyTail = key.substr(commonPrefix);
        writer.writeVarInt(keyTail.size());
        writer.writeString(keyTail);
    }

    // In the common case all int fields are zero: the suffix length bits in flags are
    // already zero and there is nothing to write.
    const bool allIntFieldsZero = (version == 0 && chunk.total == 0 && chunk.start == 0);
    if (!allIntFieldsZero) {
        byte intFields[intFieldArraySize];
        serializeIntFields(intFields);

        // Find the position just past the last non-null byte.
        // No lower bound check is needed because the array is known to contain a non-null byte.
        int endPos = intFieldArraySize;
        while (intFields[endPos - 1] == 0) {
            --endPos;
        }

        // Start copying after any prefix bytes that matched the int fields of the base
        const int intFieldPrefixLen = std::max(0, commonPrefix - key.size());
        // With no borrowed int field bytes and version 0, the first 8 bytes are skipped.
        const int startPos = intFieldPrefixLen + ((intFieldPrefixLen == 0 && version == 0) ? 8 : 0);
        const int suffixLen = std::max(0, endPos - startPos);

        if (suffixLen > 0) {
            writer.writeString(StringRef(intFields + startPos, suffixLen));
            // The suffix length is stored in the low bits of flags.
            d.flags |= suffixLen;
        }
    }

    if (value.present()) {
        writer.writeString(value.get());
    }

    return writer.wptr - d.data() + sizeof(Delta);
}
2019-05-29 21:23:32 +08:00
// Formats s for debug output: returned verbatim when every byte is printable, otherwise as a
// hex string (hexLimit is passed through to toHexString).
template <typename StringRefT>
static std::string kvformat(StringRefT s, int hexLimit = -1) {
    for (auto it = s.begin(); it != s.end(); ++it) {
        if (!isprint(*it)) {
            // One unprintable byte is enough to switch to hex.
            return s.toHexString(hexLimit);
        }
    }
    return s.toString();
}
std : : string toString ( int hexLimit = 15 ) const {
std : : string r ;
2019-06-04 19:03:52 +08:00
r + = format ( " '%s'@% " PRId64 , kvformat ( key , hexLimit ) . c_str ( ) , version ) ;
2019-06-06 11:58:47 +08:00
r + = format ( " [%u/%u]-> " , chunk . start , chunk . total ) ;
2019-02-21 18:46:30 +08:00
if ( value . present ( ) ) {
2019-06-04 19:03:52 +08:00
// Assume that values the size of a page ID are page IDs. It's not perfect but it's just for debugging.
if ( value . get ( ) . size ( ) = = sizeof ( LogicalPageID ) ) {
2019-10-15 18:10:50 +08:00
r + = format ( " [%s] " , : : toString ( getChildPage ( ) ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
}
else {
r + = format ( " '%s' " , kvformat ( value . get ( ) , hexLimit ) . c_str ( ) ) ;
}
2019-02-21 18:46:30 +08:00
}
else {
2019-06-04 19:03:52 +08:00
r + = " null " ;
2019-02-21 18:46:30 +08:00
}
return r ;
}
} ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
struct BTreePage {
2019-02-21 18:46:30 +08:00
2018-06-08 18:32:34 +08:00
enum EPageFlags { IS_LEAF = 1 } ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
typedef DeltaTree < RedwoodRecordRef > BinaryTree ;
2019-10-15 18:10:50 +08:00
typedef DeltaTree < RedwoodRecordRef , RedwoodRecordRef : : DeltaValueOnly > ValueTree ;
2019-02-21 18:46:30 +08:00
2019-09-05 15:47:57 +08:00
static constexpr int FORMAT_VERSION = 1 ;
2019-02-21 18:46:30 +08:00
# pragma pack(push,1)
struct {
2019-09-05 15:47:57 +08:00
uint16_t formatVersion ;
2019-02-21 18:46:30 +08:00
uint8_t flags ;
2019-09-05 15:47:57 +08:00
uint8_t height ;
uint16_t itemCount ;
2019-02-21 18:46:30 +08:00
uint32_t kvBytes ;
} ;
2018-10-19 11:26:45 +08:00
# pragma pack(pop)
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
int size ( ) const {
const BinaryTree * t = & tree ( ) ;
return ( uint8_t * ) t - ( uint8_t * ) this + t - > size ( ) ;
2018-09-19 15:32:39 +08:00
}
2019-02-21 18:46:30 +08:00
bool isLeaf ( ) const {
return flags & IS_LEAF ;
}
BinaryTree & tree ( ) {
2019-09-28 06:08:05 +08:00
return * ( BinaryTree * ) ( this + 1 ) ;
2019-02-21 18:46:30 +08:00
}
const BinaryTree & tree ( ) const {
2019-09-28 06:08:05 +08:00
return * ( const BinaryTree * ) ( this + 1 ) ;
2018-09-19 15:32:39 +08:00
}
2019-10-15 18:10:50 +08:00
const ValueTree & valueTree ( ) const {
return * ( const ValueTree * ) ( this + 1 ) ;
}
2019-09-28 06:08:05 +08:00
std : : string toString ( bool write , BTreePageID id , Version ver , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound ) const {
2018-06-08 18:32:34 +08:00
std : : string r ;
2019-10-15 18:10:50 +08:00
r + = format ( " BTreePage op=%s %s @% " PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d \n lowerBound: %s \n upperBound: %s \n " ,
2019-09-28 06:08:05 +08:00
write ? " write " : " read " , : : toString ( id ) . c_str ( ) , ver , this , ( int ) flags , ( int ) itemCount , ( int ) kvBytes ,
2019-02-21 18:46:30 +08:00
lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
try {
2019-09-05 15:47:57 +08:00
if ( itemCount > 0 ) {
2019-02-21 18:46:30 +08:00
// This doesn't use the cached reader for the page but it is only for debugging purposes
BinaryTree : : Reader reader ( & tree ( ) , lowerBound , upperBound ) ;
BinaryTree : : Cursor c = reader . getCursor ( ) ;
2018-08-29 04:46:14 +08:00
c . moveFirst ( ) ;
ASSERT ( c . valid ( ) ) ;
2019-06-06 11:58:47 +08:00
bool anyOutOfRange = false ;
2018-08-29 04:46:14 +08:00
do {
2019-02-21 18:46:30 +08:00
r + = " " ;
2019-06-04 19:03:52 +08:00
r + = c . get ( ) . toString ( ) ;
2017-06-10 05:56:41 +08:00
2019-06-06 11:58:47 +08:00
bool tooLow = c . get ( ) . key < lowerBound - > key ;
bool tooHigh = c . get ( ) . key > upperBound - > key ;
if ( tooLow | | tooHigh ) {
anyOutOfRange = true ;
if ( tooLow ) {
r + = " (too low) " ;
}
if ( tooHigh ) {
r + = " (too high) " ;
}
}
r + = " \n " ;
2019-05-22 10:16:32 +08:00
2018-08-29 04:46:14 +08:00
} while ( c . moveNext ( ) ) ;
2019-06-06 11:58:47 +08:00
ASSERT ( ! anyOutOfRange ) ;
2018-08-29 04:46:14 +08:00
}
2019-04-18 03:57:23 +08:00
} catch ( Error & e ) {
2018-08-29 04:46:14 +08:00
debug_printf ( " BTreePage::toString ERROR: %s \n " , e . what ( ) ) ;
debug_printf ( " BTreePage::toString partial result: %s \n " , r . c_str ( ) ) ;
throw ;
2018-06-08 18:32:34 +08:00
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
return r ;
2017-06-10 05:56:41 +08:00
}
2018-10-19 11:26:45 +08:00
} ;
2018-06-08 18:32:34 +08:00
2019-10-26 05:52:06 +08:00
// Initializes page as an empty leaf root: format version set, height 1, no items,
// and an empty tree built in place after the header.
static void makeEmptyRoot(Reference<IPage> page) {
    BTreePage& root = *(BTreePage*)page->begin();

    root.formatVersion = BTreePage::FORMAT_VERSION;
    root.flags = BTreePage::IS_LEAF;
    root.height = 1;
    root.itemCount = 0;
    root.kvBytes = 0;

    root.tree().build(nullptr, nullptr, nullptr, nullptr);

    // Tell valgrind the rest of the page buffer is defined.
    VALGRIND_MAKE_MEM_DEFINED(page->begin() + root.tree().size(), page->size() - root.tree().size());
}
// Returns the page's userData pointer interpreted as a BinaryTree reader.
BTreePage::BinaryTree::Reader* getReader(Reference<const IPage> page) {
    typedef BTreePage::BinaryTree::Reader ReaderT;
    ReaderT* cachedReader = (ReaderT*)page->userData;
    return cachedReader;
}
2017-06-10 05:56:41 +08:00
2019-09-28 06:08:05 +08:00
struct BoundaryRefAndPage {
2019-02-21 18:46:30 +08:00
Standalone < RedwoodRecordRef > lowerBound ;
2018-09-19 15:32:39 +08:00
Reference < IPage > firstPage ;
std : : vector < Reference < IPage > > extPages ;
2019-08-07 17:36:33 +08:00
std : : string toString ( ) const {
return format ( " [%s, %d pages] " , lowerBound . toString ( ) . c_str ( ) , extPages . size ( ) + ( firstPage ? 1 : 0 ) ) ;
}
2018-08-29 04:46:14 +08:00
} ;
2019-09-28 06:08:05 +08:00
// Function body for methods that are declared but not supported: it expands to a block
// whose only statement is an UNSTOPPABLE_ASSERT(false), so calling such a method fails
// that assertion.
# define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); }
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
# pragma pack(push, 1)
template < typename T , typename SizeT = int8_t >
struct InPlaceArray {
SizeT count ;
2019-02-21 18:46:30 +08:00
2019-09-28 06:08:05 +08:00
const T * begin ( ) const {
return ( T * ) ( this + 1 ) ;
}
T * begin ( ) {
return ( T * ) ( this + 1 ) ;
}
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
const T * end ( ) const {
return begin ( ) + count ;
}
T * end ( ) {
return begin ( ) + count ;
}
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
VectorRef < T > get ( ) {
return VectorRef < T > ( begin ( ) , count ) ;
2017-06-10 05:56:41 +08:00
}
2019-09-28 06:08:05 +08:00
void set ( VectorRef < T > v , int availableSpace ) {
ASSERT ( sizeof ( T ) * v . size ( ) < = availableSpace ) ;
count = v . size ( ) ;
memcpy ( begin ( ) , v . begin ( ) , sizeof ( T ) * v . size ( ) ) ;
}
2017-06-10 05:56:41 +08:00
2019-09-28 06:08:05 +08:00
int extraSize ( ) const {
return count * sizeof ( T ) ;
}
} ;
# pragma pack(pop)
2017-06-10 05:56:41 +08:00
class VersionedBTree : public IVersionedStore {
public :
2018-07-15 04:37:52 +08:00
// The first possible internal record in the tree
2019-02-21 18:46:30 +08:00
static RedwoodRecordRef dbBegin ;
2018-07-15 04:37:52 +08:00
// A record which is greater than the last possible record in the tree
2019-02-21 18:46:30 +08:00
static RedwoodRecordRef dbEnd ;
2017-06-10 05:56:41 +08:00
2019-08-07 17:36:33 +08:00
struct LazyDeleteQueueEntry {
Version version ;
2019-10-01 17:06:00 +08:00
Standalone < BTreePageID > pageID ;
2019-10-15 18:10:50 +08:00
bool operator < ( const LazyDeleteQueueEntry & rhs ) const {
2019-10-01 17:06:00 +08:00
return version < rhs . version ;
}
int readFromBytes ( const uint8_t * src ) {
version = * ( Version * ) src ;
src + = sizeof ( Version ) ;
int count = * src + + ;
pageID = BTreePageID ( ( LogicalPageID * ) src , count ) ;
return bytesNeeded ( ) ;
}
int bytesNeeded ( ) const {
return sizeof ( Version ) + 1 + ( pageID . size ( ) * sizeof ( LogicalPageID ) ) ;
}
int writeToBytes ( uint8_t * dst ) const {
* ( Version * ) dst = version ;
dst + = sizeof ( Version ) ;
* dst + + = pageID . size ( ) ;
memcpy ( dst , pageID . begin ( ) , pageID . size ( ) * sizeof ( LogicalPageID ) ) ;
return bytesNeeded ( ) ;
}
std : : string toString ( ) const {
2019-10-15 18:10:50 +08:00
return format ( " {%s @% " PRId64 " } " , : : toString ( pageID ) . c_str ( ) , version ) ;
2019-10-01 17:06:00 +08:00
}
2019-08-07 17:36:33 +08:00
} ;
typedef FIFOQueue < LazyDeleteQueueEntry > LazyDeleteQueueT ;
2019-09-05 15:47:57 +08:00
# pragma pack(push, 1)
2019-08-07 17:36:33 +08:00
struct MetaKey {
2019-09-05 15:47:57 +08:00
static constexpr int FORMAT_VERSION = 1 ;
uint16_t formatVersion ;
uint8_t height ;
2019-08-07 17:36:33 +08:00
LazyDeleteQueueT : : QueueState lazyDeleteQueue ;
2019-09-28 06:08:05 +08:00
InPlaceArray < LogicalPageID > root ;
2019-09-05 15:47:57 +08:00
2019-08-07 17:36:33 +08:00
KeyRef asKeyRef ( ) const {
2019-09-28 06:08:05 +08:00
return KeyRef ( ( uint8_t * ) this , sizeof ( MetaKey ) + root . extraSize ( ) ) ;
2019-08-07 17:36:33 +08:00
}
2019-09-05 15:47:57 +08:00
2019-08-07 17:36:33 +08:00
void fromKeyRef ( KeyRef k ) {
memcpy ( this , k . begin ( ) , k . size ( ) ) ;
2019-09-05 15:47:57 +08:00
ASSERT ( formatVersion = = FORMAT_VERSION ) ;
2019-08-07 17:36:33 +08:00
}
2019-10-15 18:10:50 +08:00
std : : string toString ( ) {
return format ( " {height=%d formatVersion=%d root=%s lazyDeleteQueue=%s} " , ( int ) height , ( int ) formatVersion , : : toString ( root . get ( ) ) . c_str ( ) , lazyDeleteQueue . toString ( ) . c_str ( ) ) ;
}
2019-08-07 17:36:33 +08:00
} ;
2019-09-05 15:47:57 +08:00
# pragma pack(pop)
2019-08-07 17:36:33 +08:00
2018-12-06 14:41:04 +08:00
struct Counts {
Counts ( ) {
memset ( this , 0 , sizeof ( Counts ) ) ;
}
void clear ( ) {
* this = Counts ( ) ;
}
2019-03-15 15:46:09 +08:00
int64_t pageReads ;
int64_t extPageReads ;
int64_t setBytes ;
2018-12-06 14:41:04 +08:00
int64_t pageWrites ;
2019-03-15 15:46:09 +08:00
int64_t extPageWrites ;
2018-12-06 14:41:04 +08:00
int64_t sets ;
int64_t clears ;
int64_t commits ;
2019-03-15 15:46:09 +08:00
int64_t gets ;
int64_t getRanges ;
int64_t commitToPage ;
int64_t commitToPageStart ;
2018-12-06 14:41:04 +08:00
2019-03-15 15:46:09 +08:00
std : : string toString ( bool clearAfter = false ) {
2019-05-29 21:23:32 +08:00
std : : string s = format ( " set=% " PRId64 " clear=% " PRId64 " get=% " PRId64 " getRange=% " PRId64 " commit=% " PRId64 " pageRead=% " PRId64 " extPageRead=% " PRId64 " pageWrite=% " PRId64 " extPageWrite=% " PRId64 " commitPage=% " PRId64 " commitPageStart=% " PRId64 " " ,
2019-03-15 15:46:09 +08:00
sets , clears , gets , getRanges , commits , pageReads , extPageReads , pageWrites , extPageWrites , commitToPage , commitToPageStart ) ;
if ( clearAfter ) {
clear ( ) ;
}
2018-12-06 14:41:04 +08:00
return s ;
}
} ;
2019-03-15 15:46:09 +08:00
// Using a static for metrics because a single process shouldn't normally have multiple storage engines
static Counts counts ;
2018-12-06 14:41:04 +08:00
2018-10-25 06:57:06 +08:00
// All async opts on the btree are based on pager reads, writes, and commits, so
// we can mostly forward these next few functions to the pager
2018-10-15 18:43:43 +08:00
// Forwards to the pager: returns the pager's error future.
virtual Future<Void> getError() {
    Future<Void> pagerError = m_pager->getError();
    return pagerError;
}
// Forwards to the pager: returns the pager's onClosed future.
virtual Future<Void> onClosed() {
    Future<Void> pagerClosed = m_pager->onClosed();
    return pagerClosed;
}
2018-10-25 06:57:06 +08:00
void close_impl ( bool dispose ) {
2019-08-07 17:36:33 +08:00
auto * pager = m_pager ;
2018-10-25 06:57:06 +08:00
delete this ;
if ( dispose )
pager - > dispose ( ) ;
else
pager - > close ( ) ;
2018-10-15 18:43:43 +08:00
}
2018-10-25 06:57:06 +08:00
// Deletes this object and disposes of the pager (see close_impl).
virtual void dispose() {
    close_impl(true);
}
virtual void close ( ) {
2018-10-25 06:57:06 +08:00
return close_impl ( false ) ;
2018-10-15 18:43:43 +08:00
}
2017-06-10 05:56:41 +08:00
// Not supported by this class: both bodies are the NOT_IMPLEMENTED macro.
virtual KeyValueStoreType getType ( ) NOT_IMPLEMENTED
virtual bool supportsMutation ( int op ) NOT_IMPLEMENTED
2018-10-25 06:57:06 +08:00
// Forwards to the pager: returns the pager's storage byte statistics.
virtual StorageBytes getStorageBytes() {
    StorageBytes pagerBytes = m_pager->getStorageBytes();
    return pagerBytes;
}
2017-06-10 05:56:41 +08:00
// Writes are provided in an ordered stream.
// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
// Buffers a set of keyValue.key to keyValue.value in the current mutation buffer.
virtual void set(KeyValueRef keyValue) {
    ++counts.sets;

    SingleKeyMutationsByVersion& keyChanges = insertMutationBoundary(keyValue.key)->second.startKeyMutations;

    if (!singleVersion) {
        // Add the set unless the most recent entry is already a set to exactly the same value
        const bool repeatsLastSet = !keyChanges.empty() && keyChanges.rbegin()->second.equalToSet(keyValue.value);
        if (!repeatsLastSet) {
            keyChanges[m_writeVersion] = SingleKeyMutation(keyValue.value);
        }
        return;
    }

    // Single version mode: a new key gets its entry under version 0, otherwise the
    // existing first entry is overwritten.
    if (keyChanges.empty()) {
        keyChanges[0] = SingleKeyMutation(keyValue.value);
    } else {
        keyChanges.begin()->second = SingleKeyMutation(keyValue.value);
    }
}
// Buffers a clear of the key range in the current mutation buffer.
virtual void clear(KeyRangeRef range) {
    ++counts.clears;

    // Make sure both ends of the range exist as boundaries (begin is inserted first).
    MutationBufferT::iterator cur = insertMutationBoundary(range.begin);
    MutationBufferT::iterator last = insertMutationBoundary(range.end);

    // In single version mode, clear all pending updates in the affected range
    if (singleVersion) {
        RangeMutation& first = cur->second;
        first.startKeyMutations.clear();
        first.startKeyMutations[0] = SingleKeyMutation();
        first.rangeClearVersion = 0;

        // Drop every boundary strictly between the begin boundary and the end boundary.
        ++cur;
        m_pBuffer->erase(cur, last);
        return;
    }

    // For each boundary in the cleared range
    for (; cur != last; ++cur) {
        RangeMutation& boundary = cur->second;

        // Set the rangeClearedVersion if not set
        if (!boundary.rangeClearVersion.present()) {
            boundary.rangeClearVersion = m_writeVersion;
        }

        // Add a clear to the startKeyMutations map if it's empty or the last item is not a clear
        if (boundary.startKeyMutations.empty() || !boundary.startKeyMutations.rbegin()->second.isClear()) {
            boundary.startKeyMutations[m_writeVersion] = SingleKeyMutation();
        }
    }
}
2017-08-22 13:29:57 +08:00
2017-06-10 05:56:41 +08:00
// Not supported by this class: the body is the NOT_IMPLEMENTED macro.
virtual void mutate ( int op , StringRef param1 , StringRef param2 ) NOT_IMPLEMENTED
2019-10-18 16:27:00 +08:00
// Records v as the new oldest version; this only stores the value in m_newOldestVersion.
virtual void setOldestVersion(Version v) {
    this->m_newOldestVersion = v;
}
// Returns the oldest version as reported by the pager.
virtual Version getOldestVersion() {
    const Version pagerOldest = m_pager->getOldestVersion();
    return pagerOldest;
}
2017-06-10 05:56:41 +08:00
2019-10-23 08:17:29 +08:00
// Returns the current write version when one has been set, otherwise the pager's latest version.
virtual Version getLatestVersion() {
    const bool haveWriteVersion = (m_writeVersion != invalidVersion);
    return haveWriteVersion ? m_writeVersion : m_pager->getLatestVersion();
}
2017-09-23 08:18:28 +08:00
// Returns the current write version (invalidVersion until setWriteVersion() is called).
Version getWriteVersion() {
    return this->m_writeVersion;
}
2017-09-21 19:43:49 +08:00
// Returns the stored last committed version.
Version getLastCommittedVersion() {
    return this->m_lastCommittedVersion;
}
2019-08-07 17:36:33 +08:00
// Constructs a tree on top of pager and immediately starts initialization (init_impl).
// Until a commit is made, the latest commit future is the initialization future.
VersionedBTree(IPager2* pager, std::string name, bool singleVersion = false)
  : m_pager(pager),
    m_writeVersion(invalidVersion),
    m_lastCommittedVersion(invalidVersion),
    m_pBuffer(nullptr),
    m_name(name),
    singleVersion(singleVersion)
{
    this->m_init = init_impl(this);
    this->m_latestCommit = this->m_init;
}
2019-11-04 19:04:03 +08:00
// Pops entries from the lazy delete queue and frees the pages they describe. For each popped
// page, children are freed directly when the page has height 2 and are otherwise pushed to
// the front of the queue; the popped page itself is then freed.
// Stops when the queue is empty, when *pStop is set and at least minPages have been freed,
// or when maxPages have been freed. Returns the number of pages freed.
ACTOR static Future<int> incrementalSubtreeClear(VersionedBTree* self, bool* pStop = nullptr, unsigned int minPages = 0, int maxPages = std::numeric_limits<int>::max()) {
    // TODO: Is it contractually okay to always read at the latest version?
    state Reference<IPagerSnapshot> snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion());
    state int freedPages = 0;

    loop {
        // take a page from front of queue
        state Optional<LazyDeleteQueueEntry> entry = wait(self->m_lazyDeleteQueue.pop());
        debug_printf(" LazyDelete: popped %s \n ", toString(entry).c_str());
        if (!entry.present()) {
            break;
        }

        // Read the page without caching
        Reference<const IPage> page = wait(self->readPage(snapshot, entry.get().pageID, nullptr, nullptr, true));
        const BTreePage& btPage = *(BTreePage*)page->begin();

        // Level 1 (leaf) nodes should never be in the lazy delete queue
        ASSERT(btPage.height > 1);

        // Iterate over page entries using BTreePage::ValueTree, whose delta type
        // (RedwoodRecordRef::DeltaValueOnly) skips key decoding
        BTreePage::ValueTree::Reader reader(&btPage.valueTree(), &dbBegin, &dbEnd);
        auto cursor = reader.getCursor();
        ASSERT(cursor.moveFirst());

        Version entryVersion = entry.get().version;
        while (true) {
            if (cursor.get().value.present()) {
                BTreePageID childID = cursor.get().getChildPage();
                // If this page is height 2, then the children are leaves so free
                if (btPage.height == 2) {
                    debug_printf(" LazyDelete: freeing child %s \n ", toString(childID).c_str());
                    self->freeBtreePage(childID, entryVersion);
                    freedPages += childID.size();
                }
                else {
                    // Otherwise, queue them for lazy delete.
                    debug_printf(" LazyDelete: queuing child %s \n ", toString(childID).c_str());
                    self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{entryVersion, childID});
                }
            }
            if (!cursor.moveNext()) {
                break;
            }
        }

        // Free the page, now that its children have either been freed or queued
        debug_printf(" LazyDelete: freeing queue entry %s \n ", toString(entry.get().pageID).c_str());
        self->freeBtreePage(entry.get().pageID, entryVersion);
        freedPages += entry.get().pageID.size();

        // If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return.
        if ((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) {
            break;
        }
    }

    return freedPages;
}
2017-09-21 19:43:49 +08:00
// Initializes the tree on top of the pager. After the pager is ready, an empty meta key
// means a new tree: a root page and the lazy delete queue are created and committed at
// latest + 1. Otherwise the header and lazy delete queue are recovered from the meta key.
// On return m_lastCommittedVersion holds the recovered (or newly committed) version.
ACTOR static Future<Void> init_impl(VersionedBTree* self) {
    wait(self->m_pager->init());

    state Version latest = self->m_pager->getLatestVersion();
    self->m_newOldestVersion = self->m_pager->getOldestVersion();

    // The format has two 64-bit conversions, so both the latest and the oldest version must
    // be passed (previously only the oldest version was supplied).
    debug_printf(" Recovered pager to version % " PRId64 " , oldest version is % " PRId64 " \n ", latest, self->m_newOldestVersion);

    state Key meta = self->m_pager->getMetaKey();
    if (meta.size() == 0) {
        // No meta key: build a new, empty tree.
        self->m_header.formatVersion = MetaKey::FORMAT_VERSION;
        LogicalPageID id = wait(self->m_pager->newPageID());
        BTreePageID newRoot((LogicalPageID*)&id, 1);
        debug_printf(" new root %s \n ", toString(newRoot).c_str());
        self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header));
        self->m_header.height = 1;
        ++latest;

        // Write an empty leaf as the root page.
        Reference<IPage> page = self->m_pager->newPageBuffer();
        makeEmptyRoot(page);
        self->m_pager->updatePage(id, page);
        self->m_pager->setCommitVersion(latest);

        // Create the lazy delete queue and store its state in the header.
        LogicalPageID newQueuePage = wait(self->m_pager->newPageID());
        self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, " LazyDeleteQueue ");
        self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState();
        self->m_pager->setMetaKey(self->m_header.asKeyRef());
        wait(self->m_pager->commit());
        debug_printf(" Committed initial commit. \n ");
    }
    else {
        // Existing tree: load the header and reattach the lazy delete queue.
        self->m_header.fromKeyRef(meta);
        self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, " LazyDeleteQueueRecovered ");
    }

    debug_printf(" Recovered btree at version % " PRId64 " : %s \n ", latest, self->m_header.toString().c_str());

    self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5);
    self->m_lastCommittedVersion = latest;
    return Void();
}
2019-10-23 08:17:29 +08:00
// Returns the future for the initialization started by the constructor.
Future<Void> init() override {
    return this->m_init;
}
2017-06-10 05:56:41 +08:00
2017-08-22 13:29:57 +08:00
// Cancels the initialization and latest commit futures.
virtual ~VersionedBTree() {
    // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
    // it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
    // uncommitted writes so it should not be committed.
    this->m_init.cancel();
    this->m_latestCommit.cancel();
}
2017-06-10 05:56:41 +08:00
2019-10-18 16:27:00 +08:00
// readAtVersion() may only be called on a committed v which has previously been passed to setWriteVersion() and never previously passed
// to setOldestVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
2017-06-10 05:56:41 +08:00
// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
// than or equal to the given version.
2019-10-18 16:27:00 +08:00
// v must be a committed version.
2017-06-10 05:56:41 +08:00
// Creates a read cursor over a pager snapshot at version v. v must not exceed the last
// committed version, and in single version mode must equal it. In single version mode the
// cursor is given record version 0, otherwise v.
virtual Reference<IStoreCursor> readAtVersion(Version v) {
    // Only committed versions can be read.
    ASSERT(v <= m_lastCommittedVersion);
    if (singleVersion) {
        ASSERT(v == m_lastCommittedVersion);
    }

    Version recordVersion = v;
    if (singleVersion) {
        recordVersion = 0;
    }

    Reference<IPagerSnapshot> snapshot = m_pager->getReadSnapshot(v);

    // The root page ID is read from the snapshot's meta key.
    Key metaKey = snapshot->getMetaKey();
    MetaKey* header = (MetaKey*)metaKey.begin();

    return Reference<IStoreCursor>(new Cursor(snapshot, header->root.get(), recordVersion));
}
// Must be nondecreasing
// Sets the version for subsequent writes. v must be greater than the last committed version.
// When no mutation buffer is active, a new one is created for v.
virtual void setWriteVersion(Version v) {
    ASSERT(v > m_lastCommittedVersion);

    if (m_pBuffer != nullptr) {
        // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null
        ASSERT(v >= m_writeVersion);
    }
    else {
        // When starting a new mutation buffer its start version must be greater than the last write version
        ASSERT(v > m_writeVersion);
        m_pBuffer = &m_mutationBuffers[v];

        // Create range representing the entire keyspace. This reduces edge cases to applying mutations
        // because now all existing keys are within some range in the mutation map.
        (*m_pBuffer)[dbBegin.key] = RangeMutation();

        // Setting the dbEnd key to be cleared prevents having to treat a range clear to dbEnd as a special
        // case in order to avoid traversing down the rightmost edge of the tree.
        (*m_pBuffer)[dbEnd.key].startKeyMutations[0] = SingleKeyMutation();
    }

    m_writeVersion = v;
}
// Starts a commit via commit_impl() when a mutation buffer is open; otherwise returns
// the stored m_latestCommit future unchanged.
virtual Future<Void> commit() {
    return (m_pBuffer == nullptr) ? m_latestCommit : commit_impl(this);
}
2019-10-24 00:31:06 +08:00
// Simulation-only sanity check: clears the whole keyspace, repeatedly runs incrementalSubtreeClear()
// and commit() until the lazy delete queue reports no entries, discards old versions, then asserts
// that the tree, the lazy delete queue and the pager are back to their minimal page counts.
// Asserts (rather than returns an error) if any of those expectations do not hold.
ACTOR static Future < Void > destroyAndCheckSanity_impl ( VersionedBTree * self ) {
// This check is only permitted under simulation.
ASSERT ( g_network - > isSimulated ( ) ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " Clearing tree. \n " ) ;
2019-10-24 00:31:06 +08:00
// Clear everything from dbBegin to dbEnd at a new write version.
self - > setWriteVersion ( self - > getLatestVersion ( ) + 1 ) ;
self - > clear ( KeyRangeRef ( dbBegin . key , dbEnd . key ) ) ;
// Each pass clears more of the deferred subtrees and commits; stop once the queue has no entries left.
loop {
2019-11-04 19:04:03 +08:00
int freedPages = wait ( self - > incrementalSubtreeClear ( self ) ) ;
debug_printf ( " incrementalSubtreeClear freed %d \n " , freedPages ) ;
2019-10-24 00:31:06 +08:00
wait ( self - > commit ( ) ) ;
if ( self - > m_lazyDeleteQueue . numEntries = = 0 ) {
break ;
}
// A new write version is needed before the next pass can commit.
self - > setWriteVersion ( self - > getLatestVersion ( ) + 1 ) ;
}
2019-11-04 19:04:03 +08:00
// Forget all but the latest version of the tree.
debug_printf ( " Discarding all old versions. \n " ) ;
self - > setOldestVersion ( self - > getLastCommittedVersion ( ) ) ;
self - > setWriteVersion ( self - > getLatestVersion ( ) + 1 ) ;
wait ( self - > commit ( ) ) ;
2019-10-28 19:00:37 +08:00
// The lazy delete queue should now be empty and contain only the new page to start writing to
// on the next commit.
2019-10-24 00:31:06 +08:00
LazyDeleteQueueT : : QueueState s = self - > m_lazyDeleteQueue . getState ( ) ;
ASSERT ( s . numEntries = = 0 ) ;
ASSERT ( s . numPages = = 1 ) ;
2019-10-28 19:00:37 +08:00
// The btree should now be a single non-oversized root page.
2019-10-24 00:31:06 +08:00
ASSERT ( self - > m_header . height = = 1 ) ;
2019-10-28 19:00:37 +08:00
ASSERT ( self - > m_header . root . count = = 1 ) ;
// From the pager's perspective the only pages that should be in use are the btree root and
// the previously mentioned lazy delete queue page.
2019-11-04 19:04:03 +08:00
int64_t userPageCount = wait ( self - > m_pager - > getUserPageCount ( ) ) ;
ASSERT ( userPageCount = = 2 ) ;
2019-10-24 00:31:06 +08:00
return Void ( ) ;
}
// Non-static entry point: runs destroyAndCheckSanity_impl() on this tree and returns its future.
Future<Void> destroyAndCheckSanity() {
    Future<Void> sanityCheck = destroyAndCheckSanity_impl(this);
    return sanityCheck;
}
2019-04-30 08:00:29 +08:00
// Reports the value of the singleVersion flag.
bool isSingleVersion() const { return singleVersion; }
2017-06-10 05:56:41 +08:00
private :
2019-09-28 06:08:05 +08:00
struct VersionAndChildrenRef {
VersionAndChildrenRef ( Version v , VectorRef < RedwoodRecordRef > children , RedwoodRecordRef upperBound )
: version ( v ) , children ( children ) , upperBound ( upperBound ) {
}
VersionAndChildrenRef ( Arena & arena , const VersionAndChildrenRef & toCopy )
: version ( toCopy . version ) , children ( arena , toCopy . children ) , upperBound ( arena , toCopy . upperBound ) {
}
int expectedSize ( ) const {
return children . expectedSize ( ) + upperBound . expectedSize ( ) ;
}
std : : string toString ( ) const {
2019-10-15 18:10:50 +08:00
return format ( " {version=% " PRId64 " children=%s upperbound=%s} " , version , : : toString ( children ) . c_str ( ) , upperBound . toString ( ) . c_str ( ) ) ;
2019-09-28 06:08:05 +08:00
}
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
Version version ;
2019-09-28 06:08:05 +08:00
VectorRef < RedwoodRecordRef > children ;
RedwoodRecordRef upperBound ;
2019-06-04 19:03:52 +08:00
} ;
2017-08-04 15:01:25 +08:00
2019-09-28 06:08:05 +08:00
typedef VectorRef < VersionAndChildrenRef > VersionedChildrenT ;
2019-06-04 19:03:52 +08:00
// Utility class for building a vector of internal page entries.
// Entries must be added in version order. Modified will be set to true
// if any entries differ from the original ones. Additional entries will be
// added when necessary to reconcile differences between the upper and lower
// boundaries of consecutive entries.
struct InternalPageBuilder {
// Cursor must be at first entry in page
InternalPageBuilder ( const BTreePage : : BinaryTree : : Cursor & c )
: cursor ( c ) , modified ( false ) , childPageCount ( 0 )
{
}
2019-09-28 06:08:05 +08:00
private :
// This must be called internally, on records whose arena has already been added to the entries arena
2019-06-04 19:03:52 +08:00
inline void addEntry ( const RedwoodRecordRef & rec ) {
if ( rec . value . present ( ) ) {
+ + childPageCount ;
}
// If no modification detected yet then check that this record is identical to the next
// record from the original page which is at the current cursor position.
if ( ! modified ) {
if ( cursor . valid ( ) ) {
if ( ! rec . identical ( cursor . get ( ) ) ) {
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Found internal page difference. new: %s old: %s \n " , rec . toString ( ) . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
else {
cursor . moveNext ( ) ;
}
}
else {
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Found internal page difference. new: %s old: <end> \n " , rec . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
}
2019-09-28 06:08:05 +08:00
entries . push_back ( entries . arena ( ) , rec ) ;
2019-06-04 19:03:52 +08:00
}
2019-09-28 06:08:05 +08:00
public :
// Add the child entries from newSet into entries
void addEntries ( VersionAndChildrenRef newSet ) {
2019-06-04 19:03:52 +08:00
// If there are already entries, the last one links to a child page, and its upper bound is not the same
// as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater)
// then add the upper bound of the previous set as a value-less record so that on future reads
// the previous child page can be decoded correctly.
if ( ! entries . empty ( ) & & entries . back ( ) . value . present ( )
& & ( newSet . children . empty ( ) | | newSet . children . front ( ) ! = lastUpperBound ) )
{
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Added placeholder %s \n " , lastUpperBound . withoutValue ( ) . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
addEntry ( lastUpperBound . withoutValue ( ) ) ;
}
for ( auto & child : newSet . children ) {
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Adding child entry %s \n " , child . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
addEntry ( child ) ;
}
2019-04-30 08:00:29 +08:00
2019-06-04 19:03:52 +08:00
lastUpperBound = newSet . upperBound ;
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: New upper bound: %s \n " , lastUpperBound . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
2019-06-04 19:03:52 +08:00
// Finish comparison to existing data if necesary.
// Handle possible page upper bound changes.
// If modified is set (see below) and our rightmost entry has a child page and its upper bound
// (currently in lastUpperBound) does not match the new desired page upper bound, passed as newUpperBound,
// then write lastUpperBound with no value to allow correct decoding of the rightmost entry.
// This is only done if modified is set to avoid rewriting this page for this purpose only.
//
// After this call, lastUpperBound is internal page's upper bound.
2019-06-06 11:58:47 +08:00
void finalize ( const RedwoodRecordRef & upperBound , const RedwoodRecordRef & decodeUpperBound ) {
debug_printf ( " InternalPageBuilder::end modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s \n " , modified , upperBound . toString ( ) . c_str ( ) , decodeUpperBound . toString ( ) . c_str ( ) , lastUpperBound . toString ( ) . c_str ( ) ) ;
modified = modified | | cursor . valid ( ) ;
debug_printf ( " InternalPageBuilder::end modified=%d after cursor check \n " , modified ) ;
// If there are boundary key entries and the last one has a child page then the
// upper bound for this internal page must match the required upper bound for
// the last child entry.
if ( ! entries . empty ( ) & & entries . back ( ) . value . present ( ) ) {
debug_printf ( " InternalPageBuilder::end last entry is not null \n " ) ;
// If the page contents were not modified so far and the upper bound required
// for the last child page (lastUpperBound) does not match what the page
// was encoded with then the page must be modified.
if ( ! modified & & lastUpperBound ! = decodeUpperBound ) {
debug_printf ( " InternalPageBuilder::end modified set true because lastUpperBound does not match decodeUpperBound \n " ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
2019-06-06 11:58:47 +08:00
if ( modified & & lastUpperBound ! = upperBound ) {
debug_printf ( " InternalPageBuilder::end Modified is true but lastUpperBound does not match upperBound so adding placeholder \n " ) ;
2019-06-04 19:03:52 +08:00
addEntry ( lastUpperBound . withoutValue ( ) ) ;
2019-06-06 11:58:47 +08:00
lastUpperBound = upperBound ;
2019-06-04 19:03:52 +08:00
}
}
2019-06-06 11:58:47 +08:00
debug_printf ( " InternalPageBuilder::end exit. modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s \n " , modified , upperBound . toString ( ) . c_str ( ) , decodeUpperBound . toString ( ) . c_str ( ) , lastUpperBound . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
}
BTreePage : : BinaryTree : : Cursor cursor ;
2019-09-28 06:08:05 +08:00
Standalone < VectorRef < RedwoodRecordRef > > entries ;
RedwoodRecordRef lastUpperBound ;
2019-06-04 19:03:52 +08:00
bool modified ;
int childPageCount ;
} ;
2017-08-25 08:25:53 +08:00
// Represents a change to a single key - set, clear, or atomic op
struct SingleKeyMutation {
// Clear
SingleKeyMutation ( ) : op ( MutationRef : : ClearRange ) { }
// Set
SingleKeyMutation ( Value val ) : op ( MutationRef : : SetValue ) , value ( val ) { }
// Atomic Op
SingleKeyMutation ( MutationRef : : Type op , Value val ) : op ( op ) , value ( val ) { }
2017-08-22 13:29:57 +08:00
MutationRef : : Type op ;
2017-08-25 08:25:53 +08:00
Value value ;
2017-08-04 15:01:25 +08:00
2017-08-22 13:29:57 +08:00
inline bool isClear ( ) const { return op = = MutationRef : : ClearRange ; }
inline bool isSet ( ) const { return op = = MutationRef : : SetValue ; }
2017-08-25 08:25:53 +08:00
inline bool isAtomicOp ( ) const { return ! isSet ( ) & & ! isClear ( ) ; }
inline bool equalToSet ( ValueRef val ) { return isSet ( ) & & value = = val ; }
2019-02-21 18:46:30 +08:00
inline RedwoodRecordRef toRecord ( KeyRef userKey , Version version ) const {
2017-09-06 07:59:31 +08:00
// No point in serializing an atomic op, it needs to be coalesced to a real value.
2017-08-25 08:25:53 +08:00
ASSERT ( ! isAtomicOp ( ) ) ;
if ( isClear ( ) )
2019-02-21 18:46:30 +08:00
return RedwoodRecordRef ( userKey , version ) ;
2017-08-22 13:29:57 +08:00
2019-02-21 18:46:30 +08:00
return RedwoodRecordRef ( userKey , version , value ) ;
2017-08-25 08:25:53 +08:00
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
std : : string toString ( ) const {
2017-08-26 06:48:32 +08:00
return format ( " op=%d val='%s' " , op , printable ( value ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-04 15:01:25 +08:00
} ;
2017-08-25 08:25:53 +08:00
// Represents mutations on a single key and a possible clear to a range that begins
// immediately after that key
typedef std : : map < Version , SingleKeyMutation > SingleKeyMutationsByVersion ;
struct RangeMutation {
// Mutations for exactly the start key
SingleKeyMutationsByVersion startKeyMutations ;
// A clear range version, if cleared, for the range starting immediately AFTER the start key
Optional < Version > rangeClearVersion ;
2019-10-29 16:31:59 +08:00
bool keyCleared ( ) const {
return startKeyMutations . size ( ) = = 1 & & startKeyMutations . begin ( ) - > second . isClear ( ) ;
}
bool keyChanged ( ) const {
return ! startKeyMutations . empty ( ) ;
}
bool rangeCleared ( ) const {
return rangeClearVersion . present ( ) ;
}
2017-08-25 08:25:53 +08:00
// Returns true if this RangeMutation doesn't actually mutate anything
bool noChanges ( ) const {
return ! rangeClearVersion . present ( ) & & startKeyMutations . empty ( ) ;
}
std : : string toString ( ) const {
std : : string result ;
result . append ( " rangeClearVersion: " ) ;
if ( rangeClearVersion . present ( ) )
2019-05-29 21:23:32 +08:00
result . append ( format ( " % " PRId64 " " , rangeClearVersion . get ( ) ) ) ;
2017-08-25 08:25:53 +08:00
else
result . append ( " <not present> " ) ;
result . append ( " startKeyMutations: " ) ;
for ( SingleKeyMutationsByVersion : : value_type const & m : startKeyMutations )
2019-05-29 21:23:32 +08:00
result . append ( format ( " [% " PRId64 " => %s] " , m . first , m . second . toString ( ) . c_str ( ) ) ) ;
2017-08-25 08:25:53 +08:00
return result ;
}
} ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
typedef std : : map < Key , RangeMutation > MutationBufferT ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
/* Mutation Buffer Overview
2017-08-22 13:29:57 +08:00
*
2019-04-30 08:00:29 +08:00
* This structure ' s organization is meant to put pending updates for the btree in an order
* that makes it efficient to query all pending mutations across all pending versions which are
* relevant to a particular subtree of the btree .
*
* At the top level , it is a map of the start of a range being modified to a RangeMutation .
* The end of the range is the next map key ( which is the start of the next range in the map ) .
*
2018-07-15 04:37:52 +08:00
* - The buffer starts out with the boundary keys dbBegin . key and dbEnd . key already populated .
2017-08-25 08:25:53 +08:00
*
* - When a new key is inserted into the buffer map , it is by definition
* splitting an existing range so it should take on the rangeClearVersion of
* the immediately preceding key which is the start of that range
2017-08-22 13:29:57 +08:00
*
* - Keys are inserted into the buffer map for every individual operation ( set / clear / atomic )
* key and for both the start and end of a range clear .
2017-08-25 08:25:53 +08:00
*
2017-08-22 13:29:57 +08:00
* - To apply a single clear , add it to the individual ops only if the last entry is not also a clear .
*
2017-08-25 08:25:53 +08:00
* - To apply a range clear , after inserting the new range boundaries do the following to the start
* boundary and all successive boundaries < end
* - set the range clear version if not already set
* - add a clear to the startKeyMutations if the final entry is not a clear .
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* - Note that there are actually TWO valid ways to represent
* set c = val1 at version 1
* clear c \ x00 to z at version 2
* with this model . Either
* c = { rangeClearVersion = 2 , startKeyMutations = { 1 = > val1 }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
* OR
* c = { rangeClearVersion = < not present > , startKeyMutations = { 1 = > val1 }
* c \ x00 = { rangeClearVersion = 2 , startKeyMutations = { 2 = > < not present > }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* This is because the rangeClearVersion applies to a range beginning with the first
* key AFTER the start key , so that the logic for reading the start key is more simple
* as it only involves consulting startKeyMutations . When adding a clear range , the
* boundary key insert / split described above is valid , and is what is currently done ,
* but it would also be valid to see if the last key before startKey is equal to
* keyBefore ( startKey ) , and if so that mutation buffer boundary key can be used instead
* without adding an additional key to the buffer .
2019-04-30 08:00:29 +08:00
* TODO : A possible optimization here could be to only use existing btree leaf page boundaries as keys ,
* with mutation point keys being stored in an unsorted structure under those boundary map keys ,
* to be sorted later just before being merged into the existing leaf page .
*/
2017-08-22 13:29:57 +08:00
2019-08-07 17:36:33 +08:00
// Pager that supplies read snapshots, page buffers and page IDs, and receives page updates/frees.
IPager2 * m_pager ;
2018-10-15 18:43:43 +08:00
// The mutation buffer currently being written to, or nullptr if none has been started.
// When set it points at an entry of m_mutationBuffers (see setWriteVersion()).
MutationBufferT * m_pBuffer ;
// Mutation buffers keyed by the write version at which each one was started.
std : : map < Version , MutationBufferT > m_mutationBuffers ;
// The version most recently passed to setWriteVersion().
Version m_writeVersion ;
// Reads are only allowed at versions <= this; new write versions must be greater than it.
Version m_lastCommittedVersion ;
2019-10-18 16:27:00 +08:00
// NOTE(review): presumably the pending value from setOldestVersion() - confirm against its definition.
Version m_newOldestVersion ;
2018-10-15 18:43:43 +08:00
// Returned by commit() when there is no mutation buffer to commit.
Future < Void > m_latestCommit ;
Future < Void > m_init ;
std : : string m_name ;
2019-04-30 08:00:29 +08:00
// When true, readAtVersion() only accepts the last committed version and cursors use record version 0.
bool singleVersion ;
2018-10-15 18:43:43 +08:00
2019-09-28 06:08:05 +08:00
// MetaKey changes size so allocate space for it to expand into
// headerSpace reserves sizeof(MetaKey) plus room for 20 LogicalPageIDs; m_header overlays it.
union {
uint8_t headerSpace [ sizeof ( MetaKey ) + sizeof ( LogicalPageID ) * 20 ] ;
MetaKey m_header ;
} ;
2019-08-07 17:36:33 +08:00
// Queue inspected via numEntries / getState() by destroyAndCheckSanity_impl().
LazyDeleteQueueT m_lazyDeleteQueue ;
int m_maxPartSize ;
2017-09-23 08:18:28 +08:00
// Find or create a mutation buffer boundary for bound and return an iterator to it
2017-08-25 08:25:53 +08:00
MutationBufferT : : iterator insertMutationBoundary ( Key boundary ) {
2017-09-23 08:18:28 +08:00
ASSERT ( m_pBuffer ! = nullptr ) ;
2017-08-25 08:25:53 +08:00
// Find the first split point in buffer that is >= key
2017-09-23 08:18:28 +08:00
MutationBufferT : : iterator ib = m_pBuffer - > lower_bound ( boundary ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Since the initial state of the mutation buffer contains the range '' through
// the maximum possible key, our search had to have found something.
2017-09-23 08:18:28 +08:00
ASSERT ( ib ! = m_pBuffer - > end ( ) ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// If we found the boundary we are looking for, return its iterator
if ( ib - > first = = boundary )
return ib ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// ib is our insert hint. Insert the new boundary and set ib to its entry
2017-09-23 08:18:28 +08:00
ib = m_pBuffer - > insert ( ib , { boundary , RangeMutation ( ) } ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// ib is certainly > begin() because it is guaranteed that the empty string
// boundary exists and the only way to have found that is to look explicitly
// for it in which case we would have returned above.
MutationBufferT : : iterator iPrevious = ib ;
2017-08-26 06:48:32 +08:00
- - iPrevious ;
2017-08-28 16:57:01 +08:00
if ( iPrevious - > second . rangeClearVersion . present ( ) ) {
ib - > second . rangeClearVersion = iPrevious - > second . rangeClearVersion ;
ib - > second . startKeyMutations [ iPrevious - > second . rangeClearVersion . get ( ) ] = SingleKeyMutation ( ) ;
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
return ib ;
2017-08-22 13:29:57 +08:00
}
2017-07-26 07:10:19 +08:00
2019-09-28 06:08:05 +08:00
// Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s)
ACTOR static Future < Standalone < VectorRef < RedwoodRecordRef > > > writePages ( VersionedBTree * self , bool minimalBoundaries , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound , VectorRef < RedwoodRecordRef > entries , uint8_t newFlags , int height , Version v , BTreePageID previousID ) {
ASSERT ( entries . size ( ) > 0 ) ;
state Standalone < VectorRef < RedwoodRecordRef > > records ;
2019-08-07 17:36:33 +08:00
2019-09-28 06:08:05 +08:00
// This is how much space for the binary tree exists in the page, after the header
state int blockSize = self - > m_pager - > getUsablePageSize ( ) ;
state int pageSize = blockSize - sizeof ( BTreePage ) ;
state int blockCount = 1 ;
2017-07-14 02:32:14 +08:00
2019-09-28 06:08:05 +08:00
state int kvBytes = 0 ;
state int compressedBytes = BTreePage : : BinaryTree : : GetTreeOverhead ( ) ;
2017-07-14 02:32:14 +08:00
2019-09-28 06:08:05 +08:00
state int start = 0 ;
state int i = 0 ;
state bool end ;
2019-08-07 17:36:33 +08:00
2019-09-28 06:08:05 +08:00
// For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor
state int minimumEntries = minimalBoundaries ? 1 : 4 ;
// Lower bound of the page being added to
state RedwoodRecordRef pageLowerBound = lowerBound - > withoutValue ( ) ;
state RedwoodRecordRef pageUpperBound ;
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
while ( i < = entries . size ( ) ) {
end = i = = entries . size ( ) ;
bool flush = end ;
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
// If not the end, add i to the page if necessary
if ( end ) {
pageUpperBound = upperBound - > withoutValue ( ) ;
}
else {
// Get delta from previous record
const RedwoodRecordRef & entry = entries [ i ] ;
int deltaSize = entry . deltaSize ( ( i = = start ) ? pageLowerBound : entries [ i - 1 ] ) ;
int keySize = entry . key . size ( ) ;
int valueSize = entry . value . present ( ) ? entry . value . get ( ) . size ( ) : 0 ;
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
int spaceNeeded = sizeof ( BTreePage : : BinaryTree : : Node ) + deltaSize ;
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
debug_printf ( " Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s \n " ,
i + 1 , entries . size ( ) , i , keySize , valueSize , deltaSize ,
spaceNeeded , compressedBytes , pageSize , entry . toString ( ) . c_str ( ) ) ;
2017-07-14 02:32:14 +08:00
2019-09-28 06:08:05 +08:00
int spaceAvailable = pageSize - compressedBytes ;
2018-09-19 15:32:39 +08:00
2019-09-28 06:08:05 +08:00
// Does it fit?
bool fits = spaceAvailable > = spaceNeeded ;
// If it doesn't fit, either end the current page or increase the page size
if ( ! fits ) {
int count = i - start ;
// If not enough entries or page less than half full, increase page size to make the entry fit
if ( count < minimumEntries | | spaceAvailable > pageSize / 2 ) {
// Figure out how many additional whole or partial blocks are needed
// newBlocks = ceil ( additional space needed / block size)
int newBlocks = 1 + ( spaceNeeded - spaceAvailable - 1 ) / blockSize ;
int newPageSize = pageSize + ( newBlocks * blockSize ) ;
if ( newPageSize < = BTreePage : : BinaryTree : : MaximumTreeSize ( ) ) {
blockCount + = newBlocks ;
pageSize = newPageSize ;
fits = true ;
}
}
if ( ! fits ) {
pageUpperBound = entry . withoutValue ( ) ;
}
2018-09-19 15:32:39 +08:00
}
2019-09-28 06:08:05 +08:00
// If the record fits then add it to the page set
if ( fits ) {
kvBytes + = keySize + valueSize ;
compressedBytes + = spaceNeeded ;
+ + i ;
}
flush = ! fits ;
2018-09-19 15:32:39 +08:00
}
2019-09-28 06:08:05 +08:00
// If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above.
if ( flush ) {
end = i = = entries . size ( ) ; // i could have been moved above
int count = i - start ;
// If not writing the final page, reduce entry count of page by a third
if ( ! end ) {
i - = count / 3 ;
pageUpperBound = entries [ i ] . withoutValue ( ) ;
}
// If this isn't the final page, shorten the upper boundary
if ( ! end & & minimalBoundaries ) {
int commonPrefix = pageUpperBound . getCommonPrefixLen ( entries [ i - 1 ] , 0 ) ;
pageUpperBound . truncate ( commonPrefix + 1 ) ;
}
state std : : vector < Reference < IPage > > pages ;
BTreePage * btPage ;
if ( blockCount = = 1 ) {
Reference < IPage > page = self - > m_pager - > newPageBuffer ( ) ;
VALGRIND_MAKE_MEM_DEFINED ( page - > begin ( ) , page - > size ( ) ) ;
btPage = ( BTreePage * ) page - > mutate ( ) ;
pages . push_back ( std : : move ( page ) ) ;
}
else {
ASSERT ( blockCount > 1 ) ;
int size = blockSize * blockCount ;
btPage = ( BTreePage * ) new uint8_t [ size ] ;
2019-10-02 21:43:11 +08:00
VALGRIND_MAKE_MEM_DEFINED ( btPage , size ) ;
2019-09-28 06:08:05 +08:00
}
btPage - > formatVersion = BTreePage : : FORMAT_VERSION ;
btPage - > flags = newFlags ;
btPage - > height = height ;
btPage - > kvBytes = kvBytes ;
btPage - > itemCount = i - start ;
int written = btPage - > tree ( ) . build ( & entries [ start ] , & entries [ i ] , & pageLowerBound , & pageUpperBound ) ;
if ( written > pageSize ) {
fprintf ( stderr , " ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d \n " , written , pageSize , blockCount , i - start , kvBytes , compressedBytes ) ;
ASSERT ( false ) ;
}
// Create chunked pages
// TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled.
if ( blockCount ! = 1 ) {
const uint8_t * rptr = ( const uint8_t * ) btPage ;
for ( int b = 0 ; b < blockCount ; + + b ) {
Reference < IPage > page = self - > m_pager - > newPageBuffer ( ) ;
VALGRIND_MAKE_MEM_DEFINED ( page - > begin ( ) , page - > size ( ) ) ;
memcpy ( page - > mutate ( ) , rptr , blockSize ) ;
rptr + = blockSize ;
pages . push_back ( std : : move ( page ) ) ;
}
2019-11-04 19:04:03 +08:00
delete [ ] ( uint8_t * ) btPage ;
2019-09-28 06:08:05 +08:00
}
// Write this btree page, which is made of 1 or more pager pages.
state int p ;
state BTreePageID childPageID ;
2019-10-01 17:06:00 +08:00
// If we are only writing 1 page and it has the same BTreePageID size as the original they try to reuse the
// LogicalPageIDs in previousID and try to update them atomically.
2019-09-28 06:08:05 +08:00
if ( end & & records . empty ( ) & & previousID . size ( ) = = pages . size ( ) ) {
for ( p = 0 ; p < pages . size ( ) ; + + p ) {
LogicalPageID id = wait ( self - > m_pager - > atomicUpdatePage ( previousID [ p ] , pages [ p ] , v ) ) ;
childPageID . push_back ( records . arena ( ) , id ) ;
}
}
else {
2019-10-01 17:06:00 +08:00
// Either the original page is being split, or it's not but it has changed BTreePageID size.
// Either way, there is no point in reusing any of the original page IDs because the parent
// must be rewritten anyway to count for the change in child count or child links.
// Free the old IDs, but only once (before the first output record is added).
2019-09-28 06:08:05 +08:00
if ( records . empty ( ) ) {
2019-10-01 17:06:00 +08:00
self - > freeBtreePage ( previousID , v ) ;
2019-09-28 06:08:05 +08:00
}
for ( p = 0 ; p < pages . size ( ) ; + + p ) {
LogicalPageID id = wait ( self - > m_pager - > newPageID ( ) ) ;
self - > m_pager - > updatePage ( id , pages [ p ] ) ;
childPageID . push_back ( records . arena ( ) , id ) ;
}
}
2019-10-26 05:52:06 +08:00
wait ( yield ( ) ) ;
2019-09-28 06:08:05 +08:00
// Update activity counts
+ + counts . pageWrites ;
if ( pages . size ( ) > 1 ) {
counts . extPageWrites + = pages . size ( ) - 1 ;
}
2019-10-15 18:10:50 +08:00
debug_printf ( " Flushing %s original=%s start=%d i=%d count=%d \n lower: %s \n upper: %s \n " , toString ( childPageID ) . c_str ( ) , toString ( previousID ) . c_str ( ) , start , i , i - start , pageLowerBound . toString ( ) . c_str ( ) , pageUpperBound . toString ( ) . c_str ( ) ) ;
2019-09-28 06:08:05 +08:00
if ( REDWOOD_DEBUG ) {
for ( int j = start ; j < i ; + + j ) {
debug_printf ( " %3d: %s \n " , j , entries [ j ] . toString ( ) . c_str ( ) ) ;
}
ASSERT ( pageLowerBound . key < = pageUpperBound . key ) ;
}
// Push a new record onto the results set, without the child page, copying it into the records arena
records . push_back_deep ( records . arena ( ) , pageLowerBound . withoutValue ( ) ) ;
// Set the child page value of the inserted record to childPageID, which has already been allocated in records.arena() above
records . back ( ) . setChildPage ( childPageID ) ;
if ( end ) {
break ;
}
start = i ;
kvBytes = 0 ;
compressedBytes = BTreePage : : BinaryTree : : GetTreeOverhead ( ) ;
pageLowerBound = pageUpperBound . withoutValue ( ) ;
2018-09-19 15:32:39 +08:00
}
}
2017-07-14 02:32:14 +08:00
2019-09-28 06:08:05 +08:00
return records ;
}
// Collapses records into a single root record. As long as more than one record remains, the
// records are written out with writePages() as a new internal level one higher than the last
// (spanning dbBegin..dbEnd), the header height is updated, and the records returned for that
// level become the input for the next round.
ACTOR static Future<Standalone<VectorRef<RedwoodRecordRef>>> buildNewRoot(VersionedBTree* self, Version version, Standalone<VectorRef<RedwoodRecordRef>> records, int height) {
    debug_printf(" buildNewRoot start version % " PRId64 " , %lu records \n ", version, records.size());

    // While there are multiple child pages for this version we must write new tree levels.
    loop {
        if (records.size() <= 1) {
            break;
        }

        ++height;
        self->m_header.height = height;

        Standalone<VectorRef<RedwoodRecordRef>> nextLevel = wait(writePages(self, false, &dbBegin, &dbEnd, records, 0, height, version, BTreePageID()));
        debug_printf(" Wrote a new root level at version % " PRId64 " height %d size %lu pages \n ", version, height, nextLevel.size());
        records = nextLevel;
    }

    return records;
}
// An IPage whose contents are the concatenation of several equally sized pages, copied into
// one contiguous heap buffer owned by this object.
class SuperPage : public IPage, ReferenceCounted<SuperPage> {
public:
    // Copies the bytes of every page in pages, in order, into a single buffer.
    // pages must be non-empty and all pages must have the same size.
    SuperPage(std::vector<Reference<const IPage>> pages) {
        // front() on an empty vector is undefined behavior, so reject that up front.
        ASSERT(!pages.empty());
        const int blockSize = pages.front()->size();

        // Validate every page before allocating: a failing ASSERT throws, and a throw from a
        // constructor skips the destructor, which would leak an already allocated buffer.
        for (const auto& p : pages) {
            ASSERT(p->size() == blockSize);
        }

        m_size = blockSize * pages.size();
        m_data = new uint8_t[m_size];
        uint8_t* wptr = m_data;
        for (const auto& p : pages) {
            memcpy(wptr, p->begin(), blockSize);
            wptr += blockSize;
        }
    }

    virtual ~SuperPage() {
        delete[] m_data;
    }

    virtual void addref() const {
        ReferenceCounted<SuperPage>::addref();
    }

    virtual void delref() const {
        ReferenceCounted<SuperPage>::delref();
    }

    // Total size in bytes of the concatenated buffer.
    virtual int size() const {
        return m_size;
    }

    virtual uint8_t const* begin() const {
        return m_data;
    }

    virtual uint8_t* mutate() {
        return m_data;
    }

private:
    uint8_t* m_data; // owned; allocated with new[] in the constructor
    int m_size;
};
2019-10-15 18:10:50 +08:00
// Reads the btree page identified by id from snapshot and returns it as a single IPage.
// A BTreePageID with one entry is read directly; one with several entries is read as multiple
// physical pages which are then concatenated into a SuperPage.
//   lowerBound / upperBound - passed to the BinaryTree::Reader that is attached to the page as
//                             userData (only when forLazyDelete is false and no userData exists yet)
//   forLazyDelete           - when true, no Reader is created, the bounds are not dereferenced, and
//                             the second argument given to getPhysicalPage() is false instead of true
// Asserts that id is not empty (multi-page case) and that the page has the expected format version.
ACTOR static Future < Reference < const IPage > > readPage ( Reference < IPagerSnapshot > snapshot , BTreePageID id , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound , bool forLazyDelete = false ) {
if ( ! forLazyDelete ) {
debug_printf ( " readPage() op=read %s @% " PRId64 " lower=%s upper=%s \n " , toString ( id ) . c_str ( ) , snapshot - > getVersion ( ) , lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
}
else {
debug_printf ( " readPage() op=readForDeferredClear %s @% " PRId64 " \n " , toString ( id ) . c_str ( ) , snapshot - > getVersion ( ) ) ;
2018-09-19 15:32:39 +08:00
}
2019-10-26 05:52:06 +08:00
wait ( yield ( ) ) ;
2018-09-19 15:32:39 +08:00
2019-10-15 18:10:50 +08:00
state Reference < const IPage > page ;
2019-09-28 06:08:05 +08:00
2019-10-15 18:10:50 +08:00
// One btree page read is counted regardless of how many physical pages it spans.
+ + counts . pageReads ;
if ( id . size ( ) = = 1 ) {
wait ( store ( page , snapshot - > getPhysicalPage ( id . front ( ) , ! forLazyDelete ) ) ) ;
2019-09-28 06:08:05 +08:00
}
else {
2019-10-15 18:10:50 +08:00
ASSERT ( ! id . empty ( ) ) ;
// Physical pages beyond the first are counted separately as extension page reads.
counts . extPageReads + = ( id . size ( ) - 1 ) ;
// Start all physical page reads first, then wait for all of them together.
std : : vector < Future < Reference < const IPage > > > reads ;
for ( auto & pageID : id ) {
reads . push_back ( snapshot - > getPhysicalPage ( pageID , ! forLazyDelete ) ) ;
}
std : : vector < Reference < const IPage > > pages = wait ( getAll ( reads ) ) ;
2019-09-28 06:08:05 +08:00
// TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager.
page = Reference < const IPage > ( new SuperPage ( pages ) ) ;
2017-07-14 02:32:14 +08:00
}
2018-09-19 15:32:39 +08:00
2019-10-15 18:10:50 +08:00
debug_printf ( " readPage() op=readComplete %s @% " PRId64 " \n " , toString ( id ) . c_str ( ) , snapshot - > getVersion ( ) ) ;
2019-09-28 06:08:05 +08:00
const BTreePage * pTreePage = ( const BTreePage * ) page - > begin ( ) ;
ASSERT ( pTreePage - > formatVersion = = BTreePage : : FORMAT_VERSION ) ;
2019-10-15 18:10:50 +08:00
// Attach a Reader to the page the first time it is read for normal (non lazy delete) use;
// userDataDestructor deletes it again.
if ( ! forLazyDelete & & page - > userData = = nullptr ) {
debug_printf ( " readPage() Creating Reader for %s @% " PRId64 " lower=%s upper=%s \n " , toString ( id ) . c_str ( ) , snapshot - > getVersion ( ) , lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
2019-09-28 06:08:05 +08:00
page - > userData = new BTreePage : : BinaryTree : : Reader ( & pTreePage - > tree ( ) , lowerBound , upperBound ) ;
page - > userDataDestructor = [ ] ( void * ptr ) { delete ( BTreePage : : BinaryTree : : Reader * ) ptr ; } ;
2019-02-21 18:46:30 +08:00
}
2018-09-19 15:32:39 +08:00
2019-10-15 18:10:50 +08:00
if ( ! forLazyDelete ) {
debug_printf ( " readPage() %s \n " , pTreePage - > toString ( false , id , snapshot - > getVersion ( ) , lowerBound , upperBound ) . c_str ( ) ) ;
}
2019-02-21 18:46:30 +08:00
// Nothing should attempt to read bytes in the page outside the BTreePage structure
2019-10-02 21:43:11 +08:00
VALGRIND_MAKE_MEM_UNDEFINED ( page - > begin ( ) + pTreePage - > size ( ) , page - > size ( ) - pTreePage - > size ( ) ) ;
2019-02-21 18:46:30 +08:00
2019-09-28 06:08:05 +08:00
return page ;
}
// Frees every individual page that makes up btPageID, as of version v, via the pager.
void freeBtreePage(BTreePageID btPageID, Version v) {
	for(auto it = btPageID.begin(); it != btPageID.end(); ++it) {
		LogicalPageID id = *it;
		m_pager->freePage(id, v);
	}
}
2019-09-29 04:27:00 +08:00
// Applies the slice of mutationBuffer relevant to [lowerBound->key, upperBound->key] to the subtree rooted at rootID,
// reading existing pages from snapshot, and reports to the caller what should replace the subtree.
// Returns list of (version, internal page records, required upper bound)
//  - An empty result means the page/subtree was deleted (freed, or queued on m_lazyDeleteQueue for internal subtrees
//    that are dropped without being read).
//  - A single entry with version 0 holding only *decodeLowerBound and *decodeUpperBound means the existing page is kept as-is.
//  - Otherwise the entry holds the records returned by writePages() for the rewritten page(s).
// lowerBound/upperBound select the mutation buffer slice and are the bounds passed to writePages().
// decodeLowerBound/decodeUpperBound are the bounds passed to readPage() for decoding the existing page.
// isLeaf is the caller's expectation of the page type and is asserted against the page's IS_LEAF flag once it is read.
2019-10-01 17:06:00 +08:00
ACTOR static Future < Standalone < VersionedChildrenT > > commitSubtree ( VersionedBTree * self , MutationBufferT * mutationBuffer , Reference < IPagerSnapshot > snapshot , BTreePageID rootID , bool isLeaf , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound , const RedwoodRecordRef * decodeLowerBound , const RedwoodRecordRef * decodeUpperBound ) {
2019-06-04 19:03:52 +08:00
state std : : string context ;
if ( REDWOOD_DEBUG ) {
2019-09-28 06:08:05 +08:00
context = format ( " CommitSubtree(root=%s): " , toString ( rootID ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
}
2019-09-28 06:08:05 +08:00
state Standalone < VersionedChildrenT > results ;
debug_printf ( " %s lower=%s upper=%s \n " , context . c_str ( ) , lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
debug_printf ( " %s decodeLower=%s decodeUpper=%s \n " , context . c_str ( ) , decodeLowerBound - > toString ( ) . c_str ( ) , decodeUpperBound - > toString ( ) . c_str ( ) ) ;
2019-03-15 15:46:09 +08:00
self - > counts . commitToPageStart + + ;
2017-08-25 08:25:53 +08:00
2019-03-15 15:46:09 +08:00
// Find the slice of the mutation buffer that is relevant to this subtree
// TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key while iterating
2019-05-22 10:16:32 +08:00
state MutationBufferT : : const_iterator iMutationBoundary = mutationBuffer - > upper_bound ( lowerBound - > key ) ;
// Step back from the first boundary > lowerBound->key to the greatest boundary <= lowerBound->key.
// NOTE(review): presumably the buffer always contains a boundary at or before any lower bound, otherwise this
// would decrement begin() - confirm against where the mutation buffer is created.
- - iMutationBoundary ;
2019-03-15 15:46:09 +08:00
state MutationBufferT : : const_iterator iMutationBoundaryEnd = mutationBuffer - > lower_bound ( upperBound - > key ) ;
2019-05-22 10:16:32 +08:00
if ( REDWOOD_DEBUG ) {
2019-10-28 19:00:37 +08:00
debug_printf ( " %s ---------MUTATION BUFFER SLICE --------------------- \n " , context . c_str ( ) ) ;
auto begin = iMutationBoundary ;
// Prints every boundary in the slice, including the end boundary itself.
while ( 1 ) {
debug_printf ( " %s Mutation: '%s': %s \n " , context . c_str ( ) , printable ( begin - > first ) . c_str ( ) , begin - > second . toString ( ) . c_str ( ) ) ;
if ( begin = = iMutationBoundaryEnd ) {
break ;
}
+ + begin ;
}
debug_printf ( " %s ------------------------------------- \n " , context . c_str ( ) ) ;
2017-08-28 21:28:49 +08:00
}
2019-05-22 10:16:32 +08:00
2019-10-29 16:31:59 +08:00
// iMutationBoundary is greatest boundary <= lowerBound->key
// iMutationBoundaryEnd is least boundary >= upperBound->key
// If the boundary range iterators are the same then this subtree only has one unique key, which is the same key as the boundary
// record the iterators are pointing to. There are only two outcomes possible: Clearing the subtree or leaving it alone.
// If there are any changes to the one key then the entire subtree should be deleted as the changes for the key
// do not go into this subtree.
2019-05-22 10:16:32 +08:00
if ( iMutationBoundary = = iMutationBoundaryEnd ) {
2019-10-29 16:31:59 +08:00
if ( iMutationBoundary - > second . keyChanged ( ) ) {
2019-10-15 18:10:50 +08:00
debug_printf ( " %s lower and upper bound key/version match and key is modified so deleting page, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
2019-10-01 17:06:00 +08:00
// NOTE(review): the non-singleVersion branch dereferences startKeyMutations.begin(), so it presumably relies on
// keyChanged() implying startKeyMutations is non-empty - confirm.
Version firstKeyChangeVersion = self - > singleVersion ? self - > getLastCommittedVersion ( ) + 1 : iMutationBoundary - > second . startKeyMutations . begin ( ) - > first ;
// A leaf is freed right away; an internal subtree is queued on m_lazyDeleteQueue instead.
if ( isLeaf ) {
self - > freeBtreePage ( rootID , firstKeyChangeVersion ) ;
}
else {
self - > m_lazyDeleteQueue . pushBack ( LazyDeleteQueueEntry { firstKeyChangeVersion , rootID } ) ;
}
2019-09-28 06:08:05 +08:00
return results ;
2019-05-22 10:16:32 +08:00
}
2019-10-23 08:17:29 +08:00
// Otherwise, no changes to this subtree
results . push_back_deep ( results . arena ( ) , VersionAndChildrenRef ( 0 , VectorRef < RedwoodRecordRef > ( ( RedwoodRecordRef * ) decodeLowerBound , 1 ) , * decodeUpperBound ) ) ;
debug_printf ( " %s page contains a single key '%s' which is not changing, returning %s \n " , context . c_str ( ) , lowerBound - > key . toString ( ) . c_str ( ) , toString ( results ) . c_str ( ) ) ;
return results ;
2017-08-25 08:25:53 +08:00
}
2019-10-28 19:00:37 +08:00
// If one mutation range covers the entire subtree, then check if the entire subtree is modified,
// unmodified, or possibly/partially modified.
2017-08-25 08:25:53 +08:00
MutationBufferT : : const_iterator iMutationBoundaryNext = iMutationBoundary ;
+ + iMutationBoundaryNext ;
2019-10-23 08:17:29 +08:00
if ( iMutationBoundaryNext = = iMutationBoundaryEnd ) {
2019-10-29 16:31:59 +08:00
// Cleared means the entire range covering the subtree was cleared. It is assumed true
// if the range starting after the lower mutation boundary was cleared, and then proven false
// below if possible.
bool cleared = iMutationBoundary - > second . rangeCleared ( ) ;
// Unchanged means the entire range covering the subtree was unchanged, it is assumed to be the
// opposite of cleared() and then proven false below if possible.
bool unchanged = ! cleared ;
debug_printf ( " %s cleared=%d unchanged=%d \n " , context . c_str ( ) , cleared , unchanged ) ;
// If the lower mutation boundary key is the same as the subtree lower bound then whether or not
// that key is being changed or cleared affects this subtree.
if ( iMutationBoundary - > first = = lowerBound - > key ) {
// If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not cleared
if ( cleared & & ! iMutationBoundary - > second . keyCleared ( ) ) {
cleared = false ;
debug_printf ( " %s cleared=%d unchanged=%d \n " , context . c_str ( ) , cleared , unchanged ) ;
}
// If the subtree looked unchanged (so far) but the lower boundary is changed then the subtree is changed
if ( unchanged & & iMutationBoundary - > second . keyChanged ( ) ) {
unchanged = false ;
debug_printf ( " %s cleared=%d unchanged=%d \n " , context . c_str ( ) , cleared , unchanged ) ;
}
}
// If the higher mutation boundary key is the same as the subtree upper bound key then whether
// or not it is being changed or cleared affects this subtree.
if ( ( cleared | | unchanged ) & & iMutationBoundaryEnd - > first = = upperBound - > key ) {
// If the key is being changed then the records in this subtree with the same key must be removed
// so the subtree is definitely not unchanged, though it may be cleared to achieve the same effect.
if ( iMutationBoundaryEnd - > second . keyChanged ( ) ) {
unchanged = false ;
debug_printf ( " %s cleared=%d unchanged=%d \n " , context . c_str ( ) , cleared , unchanged ) ;
}
else {
// If the key is not being changed then the records in this subtree can't be removed so the
// subtree is not being cleared.
cleared = false ;
debug_printf ( " %s cleared=%d unchanged=%d \n " , context . c_str ( ) , cleared , unchanged ) ;
}
}
// The subtree cannot be both cleared and unchanged.
ASSERT ( ! ( cleared & & unchanged ) ) ;
// If no changes in subtree
if ( unchanged ) {
2019-10-23 08:17:29 +08:00
results . push_back_deep ( results . arena ( ) , VersionAndChildrenRef ( 0 , VectorRef < RedwoodRecordRef > ( ( RedwoodRecordRef * ) decodeLowerBound , 1 ) , * decodeUpperBound ) ) ;
debug_printf ( " %s no changes on this subtree, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
return results ;
}
2019-10-29 16:31:59 +08:00
// If subtree is cleared
if ( cleared ) {
2019-10-23 08:17:29 +08:00
debug_printf ( " %s %s cleared, deleting it, returning %s \n " , context . c_str ( ) , isLeaf ? " Page " : " Subtree " , toString ( results ) . c_str ( ) ) ;
Version clearVersion = self - > singleVersion ? self - > getLastCommittedVersion ( ) + 1 : iMutationBoundary - > second . rangeClearVersion . get ( ) ;
if ( isLeaf ) {
self - > freeBtreePage ( rootID , clearVersion ) ;
}
else {
self - > m_lazyDeleteQueue . pushBack ( LazyDeleteQueueEntry { clearVersion , rootID } ) ;
}
return results ;
}
2017-08-25 08:25:53 +08:00
}
2019-03-15 15:46:09 +08:00
// None of the early-outs above applied, so the page must be read and examined.
self - > counts . commitToPage + + ;
2019-09-28 06:08:05 +08:00
state Reference < const IPage > rawPage = wait ( readPage ( snapshot , rootID , decodeLowerBound , decodeUpperBound ) ) ;
2018-09-19 15:32:39 +08:00
state BTreePage * page = ( BTreePage * ) rawPage - > begin ( ) ;
2019-09-28 06:08:05 +08:00
debug_printf ( " %s commitSubtree(): %s \n " , context . c_str ( ) , page - > toString ( false , rootID , snapshot - > getVersion ( ) , decodeLowerBound , decodeUpperBound ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
2019-06-04 19:03:52 +08:00
state BTreePage : : BinaryTree : : Cursor cursor = getReader ( rawPage ) - > getCursor ( ) ;
2019-02-21 18:46:30 +08:00
cursor . moveFirst ( ) ;
2017-06-10 05:56:41 +08:00
2019-08-07 17:36:33 +08:00
state Version writeVersion ;
2018-08-29 04:46:14 +08:00
// Leaf Page
2018-06-08 18:32:34 +08:00
if ( page - > flags & BTreePage : : IS_LEAF ) {
2019-10-01 17:06:00 +08:00
ASSERT ( isLeaf ) ;
2019-09-28 06:08:05 +08:00
// The new contents of the leaf: existing records interleaved with mutation records, in the order they are visited.
state Standalone < VectorRef < RedwoodRecordRef > > merged ;
2017-06-10 05:56:41 +08:00
2019-10-28 19:00:37 +08:00
debug_printf ( " %s Leaf page, merging changes. \n " , context . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
2017-08-28 16:57:01 +08:00
// minVersion staying at invalidVersion is also how "nothing changed" is detected after the merge.
Version minVersion = invalidVersion ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Now, process each mutation range and merge changes with existing data.
2019-10-29 07:05:11 +08:00
bool firstMutationBoundary = true ;
2017-08-25 08:25:53 +08:00
while ( iMutationBoundary ! = iMutationBoundaryEnd ) {
2019-06-04 19:03:52 +08:00
debug_printf ( " %s New mutation boundary: '%s': %s \n " , context . c_str ( ) , printable ( iMutationBoundary - > first ) . c_str ( ) , iMutationBoundary - > second . toString ( ) . c_str ( ) ) ;
2017-08-23 02:30:44 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutations ;
2017-08-22 13:29:57 +08:00
2019-10-29 07:05:11 +08:00
// For the first mutation boundary only, if the boundary key is less than the lower bound for the page
// then skip startKeyMutations for this boundary, we're only processing this mutation range here to apply
// a possible clear to existing data.
if ( firstMutationBoundary & & iMutationBoundary - > first < lowerBound - > key ) {
2017-08-28 21:28:49 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2019-04-30 08:00:29 +08:00
}
else {
2017-08-25 08:25:53 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . begin ( ) ;
2019-04-30 08:00:29 +08:00
}
2019-10-29 07:05:11 +08:00
firstMutationBoundary = false ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutationsEnd = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2019-04-30 08:00:29 +08:00
// Iterate over old versions of the mutation boundary key, outputting if necessary
2019-10-29 16:31:59 +08:00
bool boundaryKeyWritten = false ;
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) & & cursor . get ( ) . key = = iMutationBoundary - > first ) {
2019-04-30 08:00:29 +08:00
// If not in single version mode or there were no changes to the key
if ( ! self - > singleVersion | | iMutationBoundary - > second . noChanges ( ) ) {
2019-09-28 06:08:05 +08:00
merged . push_back ( merged . arena ( ) , cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, boundary start] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-10-29 16:31:59 +08:00
boundaryKeyWritten = true ;
2019-04-30 08:00:29 +08:00
}
else {
ASSERT ( self - > singleVersion ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Skipped %s [existing, boundary start, singleVersion mode] \n " , context . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
// Any value other than invalidVersion marks the page as changed. The exact value is not used in
// singleVersion mode because writeVersion is taken from getLastCommittedVersion() + 1 below.
minVersion = 0 ;
}
2019-02-21 18:46:30 +08:00
cursor . moveNext ( ) ;
2017-08-25 08:25:53 +08:00
}
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
// TODO: If a mutation set is equal to the previous existing value of the key, maybe don't write it.
2017-08-25 08:25:53 +08:00
// Output mutations for the mutation boundary start key
2017-08-22 13:29:57 +08:00
while ( iMutations ! = iMutationsEnd ) {
2017-09-06 07:59:31 +08:00
const SingleKeyMutation & m = iMutations - > second ;
2019-08-07 17:36:33 +08:00
// Clears and values that fit within m_maxPartSize become a single record; larger values are split below.
if ( m . isClear ( ) | | m . value . size ( ) < = self - > m_maxPartSize ) {
2019-10-29 16:31:59 +08:00
// If the boundary key was not yet written to the merged list then clears can be skipped.
// Note that in a more complex scenario where there are multiple sibling pages for the same key, with different
// versions and/or part numbers, this is still a valid thing to do. This is because a changing boundary
// key (set or clear) will result in any instances (different versions, split parts) of this key
// on sibling pages to the left of this page to be removed, so an explicit clear need only be stored
// if a record with the mutation boundary key was already written to this page.
if ( ! boundaryKeyWritten & & iMutations - > second . isClear ( ) ) {
debug_printf ( " %s Skipped %s [mutation, unnecessary boundary key clear] \n " , context . c_str ( ) , m . toRecord ( iMutationBoundary - > first , iMutations - > first ) . toString ( ) . c_str ( ) ) ;
}
else {
merged . push_back ( merged . arena ( ) , m . toRecord ( iMutationBoundary - > first , iMutations - > first ) ) ;
debug_printf ( " %s Added non-split %s [mutation, boundary start] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
boundaryKeyWritten = true ;
}
2017-09-06 07:59:31 +08:00
}
else {
2018-07-23 18:09:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2017-09-06 07:59:31 +08:00
int bytesLeft = m . value . size ( ) ;
2018-07-23 18:09:13 +08:00
int start = 0 ;
2019-02-21 18:46:30 +08:00
RedwoodRecordRef whole ( iMutationBoundary - > first , iMutations - > first , m . value ) ;
2017-09-06 07:59:31 +08:00
// Emit the value as consecutive parts of at most m_maxPartSize bytes each.
while ( bytesLeft > 0 ) {
2019-08-07 17:36:33 +08:00
int partSize = std : : min ( bytesLeft , self - > m_maxPartSize ) ;
2018-07-23 18:09:13 +08:00
// Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it
2019-09-28 06:08:05 +08:00
merged . push_back ( merged . arena ( ) , whole . split ( start , partSize ) ) ;
2018-07-25 17:29:17 +08:00
bytesLeft - = partSize ;
start + = partSize ;
2019-08-07 17:36:33 +08:00
debug_printf ( " %s Added split %s [mutation, boundary start] bytesLeft %d \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) , bytesLeft ) ;
2017-09-06 07:59:31 +08:00
}
2019-10-29 16:31:59 +08:00
boundaryKeyWritten = true ;
2017-09-06 07:59:31 +08:00
}
2017-08-22 13:29:57 +08:00
+ + iMutations ;
}
2017-06-10 05:56:41 +08:00
2017-08-25 08:25:53 +08:00
// Get the clear version for this range, which is the last thing that we need from it,
Optional < Version > clearRangeVersion = iMutationBoundary - > second . rangeClearVersion ;
// Advance to the next boundary because we need to know the end key for the current range.
+ + iMutationBoundary ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Mutation range end: '%s' \n " , context . c_str ( ) , printable ( iMutationBoundary - > first ) . c_str ( ) ) ;
2017-08-29 08:26:53 +08:00
2017-08-25 08:25:53 +08:00
// Write existing keys which are less than the next mutation boundary key, clearing if needed.
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) & & cursor . get ( ) . key < iMutationBoundary - > first ) {
2019-04-30 08:00:29 +08:00
// TODO: Remove old versions that are too old
2019-02-21 18:46:30 +08:00
2019-04-30 08:00:29 +08:00
bool remove = self - > singleVersion & & clearRangeVersion . present ( ) ;
if ( ! remove ) {
2019-09-28 06:08:05 +08:00
merged . push_back ( merged . arena ( ) , cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, middle] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
else {
ASSERT ( self - > singleVersion ) ;
2019-06-04 19:03:52 +08:00
// NOTE(review): the message below says "boundary start" although this is the middle-of-range loop;
// it looks copied from the boundary key loop above.
debug_printf ( " %s Skipped %s [existing, boundary start, singleVersion mode] \n " , context . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2017-08-28 16:57:01 +08:00
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
2017-08-22 13:29:57 +08:00
}
2019-04-30 08:00:29 +08:00
// If keeping version history, write clears for records that exist in this range if the range was cleared
if ( ! self - > singleVersion ) {
// Write a clear of this key if needed. A clear is required if clearRangeVersion is set and the next cursor
// key is different than the current one. If the last cursor key in the page is different from the
// first key in the right sibling page then the page's upper bound will reflect that.
auto nextCursor = cursor ;
nextCursor . moveNext ( ) ;
if ( clearRangeVersion . present ( ) & & cursor . get ( ) . key ! = nextCursor . getOrUpperBound ( ) . key ) {
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
2019-09-28 06:08:05 +08:00
merged . push_back ( merged . arena ( ) , RedwoodRecordRef ( cursor . get ( ) . key , clearVersion ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, middle clear] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
cursor = nextCursor ;
}
else {
cursor . moveNext ( ) ;
}
2017-08-22 13:29:57 +08:00
}
2017-08-26 06:48:32 +08:00
}
2017-06-10 05:56:41 +08:00
2017-08-26 06:48:32 +08:00
// Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range.
2019-10-29 16:31:59 +08:00
bool upperMutationBoundaryKeyChanged = iMutationBoundaryEnd - > second . keyChanged ( ) ;
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) ) {
2019-10-29 16:31:59 +08:00
// If the upper mutation boundary is being changed and the cursor's key matches it then stop because none of the earlier
// versions or fragments of that key should be written.
if ( upperMutationBoundaryKeyChanged & & cursor . get ( ) . key = = iMutationBoundaryEnd - > first ) {
debug_printf ( " %s Skipped %s and beyond [existing, matches changed upper mutation boundary] \n " , context . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
// NOTE(review): presumably keyChanged() implies startKeyMutations is non-empty, making begin() safe - confirm.
Version changedVersion = iMutationBoundaryEnd - > second . startKeyMutations . begin ( ) - > first ;
if ( changedVersion < minVersion | | minVersion = = invalidVersion )
minVersion = changedVersion ;
break ;
}
2019-09-28 06:08:05 +08:00
merged . push_back ( merged . arena ( ) , cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, tail] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
cursor . moveNext ( ) ;
2017-06-10 05:56:41 +08:00
}
2017-08-25 08:25:53 +08:00
2019-03-15 15:46:09 +08:00
// No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records.
2019-10-23 08:17:29 +08:00
if ( minVersion = = invalidVersion ) {
2019-09-28 06:08:05 +08:00
results . push_back_deep ( results . arena ( ) , VersionAndChildrenRef ( 0 , VectorRef < RedwoodRecordRef > ( ( RedwoodRecordRef * ) decodeLowerBound , 1 ) , * decodeUpperBound ) ) ;
debug_printf ( " %s No changes were made during mutation merge, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
return results ;
2017-08-28 18:53:29 +08:00
}
2019-11-04 19:04:03 +08:00
else {
debug_printf ( " %s Changes were made, writing. \n " , context . c_str ( ) ) ;
}
2017-08-28 16:57:01 +08:00
2018-09-19 15:32:39 +08:00
// TODO: Make version and key splits based on contents of merged list, if keeping history
2017-06-10 05:56:41 +08:00
2019-10-15 18:10:50 +08:00
writeVersion = self - > singleVersion ? self - > getLastCommittedVersion ( ) + 1 : minVersion ;
2019-04-30 08:00:29 +08:00
// If everything in the page was deleted then this page should be deleted as of the new version
// Note that if a single range clear covered the entire page then we should not get this far
2019-10-28 19:00:37 +08:00
if ( merged . empty ( ) ) {
2019-09-28 06:08:05 +08:00
debug_printf ( " %s All leaf page contents were cleared, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
self - > freeBtreePage ( rootID , writeVersion ) ;
2019-09-28 06:08:05 +08:00
return results ;
2017-07-05 14:41:48 +08:00
}
2017-06-10 05:56:41 +08:00
2019-09-28 06:08:05 +08:00
state Standalone < VectorRef < RedwoodRecordRef > > entries = wait ( writePages ( self , true , lowerBound , upperBound , merged , BTreePage : : IS_LEAF , page - > height , writeVersion , rootID ) ) ;
// entries is pushed without a deep copy, so results must keep the entries arena alive.
results . arena ( ) . dependsOn ( entries . arena ( ) ) ;
results . push_back ( results . arena ( ) , VersionAndChildrenRef ( writeVersion , entries , * upperBound ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Merge complete, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
return results ;
}
else {
2018-08-29 04:46:14 +08:00
// Internal Page
2019-10-01 17:06:00 +08:00
ASSERT ( ! isLeaf ) ;
2019-09-28 06:08:05 +08:00
// One recursive commitSubtree() future per linked child, in cursor order.
state std : : vector < Future < Standalone < VersionedChildrenT > > > futureChildren ;
2019-05-22 10:16:32 +08:00
2018-06-15 08:52:25 +08:00
bool first = true ;
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) ) {
// The lower bound for the first child is the lowerBound arg
const RedwoodRecordRef & childLowerBound = first ? * lowerBound : cursor . get ( ) ;
2019-05-22 10:16:32 +08:00
first = false ;
// Skip over any children that do not link to a page. They exist to preserve the ancestors from
// which adjacent children can borrow prefix bytes.
// If there are any, then the first valid child page will incur a boundary change to move
// its lower bound to the left so we can delete the non-linking entry from this page to free up space.
while ( ! cursor . get ( ) . value . present ( ) ) {
2019-06-04 19:03:52 +08:00
// There should never be an internal page written that has no valid child pages. This loop will find
2019-05-22 10:16:32 +08:00
// the first valid child link, and if there are no more then execution will not return to this loop.
ASSERT ( cursor . moveNext ( ) ) ;
}
2018-06-08 18:32:34 +08:00
2019-05-22 10:16:32 +08:00
ASSERT ( cursor . valid ( ) ) ;
const RedwoodRecordRef & decodeChildLowerBound = cursor . get ( ) ;
2019-09-28 06:08:05 +08:00
BTreePageID pageID = cursor . get ( ) . getChildPage ( ) ;
ASSERT ( ! pageID . empty ( ) ) ;
2018-06-08 18:32:34 +08:00
2019-06-04 19:55:09 +08:00
// The decode upper bound is the very next record (linking or not), or this page's decode upper bound if there is none.
const RedwoodRecordRef & decodeChildUpperBound = cursor . moveNext ( ) ? cursor . get ( ) : * decodeUpperBound ;
2019-05-22 10:16:32 +08:00
// Skip over any next-children which do not actually link to child pages
while ( cursor . valid ( ) & & ! cursor . get ( ) . value . present ( ) ) {
cursor . moveNext ( ) ;
}
// The child's upper bound is the next linking record, or this page's upper bound if there is none.
const RedwoodRecordRef & childUpperBound = cursor . valid ( ) ? cursor . get ( ) : * upperBound ;
2017-08-28 16:57:01 +08:00
2019-10-15 18:10:50 +08:00
debug_printf ( " %s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s \n " ,
2019-09-28 06:08:05 +08:00
context . c_str ( ) , toString ( pageID ) . c_str ( ) , childLowerBound . toString ( ) . c_str ( ) , childUpperBound . toString ( ) . c_str ( ) , decodeChildLowerBound . toString ( ) . c_str ( ) , decodeChildUpperBound . toString ( ) . c_str ( ) ) ;
2017-08-28 18:53:29 +08:00
2019-04-30 08:00:29 +08:00
/*
// TODO: If lower bound and upper bound have the same key, do something intelligent if possible
//
if ( childLowerBound . key = = childUpperBound . key ) {
if ( key is modified or cleared ) {
if ( self - > singleVersion ) {
// In single version mode, don't keep any records with the old key if the key is modified, so return
// an empty page set to replace the child page
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { } } } ) ) ;
}
else {
// In versioned mode, there is no need to recurse to this page because new versions of key
// will go in the right most page that has the same lowerBound key, but since the key is
// being changed the new version of this page should exclude the old subtree
}
else {
// Return the child page as-is, no need to visit it
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { { childLowerBound , pageID } } } } ) ) ;
}
}
else {
// No changes
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { { childLowerBound , pageID } } } } ) ) ;
}
}
else {
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , & childLowerBound , & childUpperBound ) ) ;
}
*/
2019-10-01 17:06:00 +08:00
// If this page has height of 2 then its children are leaf nodes
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , page - > height = = 2 , & childLowerBound , & childUpperBound , & decodeChildLowerBound , & decodeChildUpperBound ) ) ;
2017-06-10 05:56:41 +08:00
}
2019-03-15 15:46:09 +08:00
// Waiting one at a time makes debugging easier
// TODO: Is it better to use waitForAll()?
2019-02-21 18:46:30 +08:00
state int k ;
for ( k = 0 ; k < futureChildren . size ( ) ; + + k ) {
wait ( success ( futureChildren [ k ] ) ) ;
}
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
if ( REDWOOD_DEBUG ) {
2019-09-28 06:08:05 +08:00
debug_printf ( " %s Subtree update results \n " , context . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
2019-06-18 09:55:49 +08:00
debug_printf ( " %s subtree result %s \n " , context . c_str ( ) , toString ( futureChildren [ i ] . get ( ) ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
}
2019-09-28 06:08:05 +08:00
// TODO: Either handle multi-versioned results or change commitSubtree interface to return a single child set.
2019-06-04 19:03:52 +08:00
ASSERT ( self - > singleVersion ) ;
2019-10-01 17:06:00 +08:00
writeVersion = self - > getLastCommittedVersion ( ) + 1 ;
2019-06-04 19:03:52 +08:00
cursor . moveFirst ( ) ;
2019-09-28 06:08:05 +08:00
// All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound
2019-06-04 19:03:52 +08:00
InternalPageBuilder pageBuilder ( cursor ) ;
2019-04-30 08:00:29 +08:00
2019-06-04 19:03:52 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
2019-09-28 06:08:05 +08:00
VersionedChildrenT versionedChildren = futureChildren [ i ] . get ( ) ;
2019-06-04 19:03:52 +08:00
ASSERT ( versionedChildren . size ( ) < = 1 ) ;
2019-05-22 10:16:32 +08:00
2019-06-04 19:03:52 +08:00
// An empty child result means that child subtree was deleted, so it contributes no entries.
if ( ! versionedChildren . empty ( ) ) {
pageBuilder . addEntries ( versionedChildren . front ( ) ) ;
2017-06-10 05:56:41 +08:00
}
}
2019-06-06 11:58:47 +08:00
pageBuilder . finalize ( * upperBound , * decodeUpperBound ) ;
2019-06-04 19:03:52 +08:00
// If page contents have changed
if ( pageBuilder . modified ) {
// If the page now has no children
if ( pageBuilder . childPageCount = = 0 ) {
2019-10-01 17:06:00 +08:00
debug_printf ( " %s All internal page children were deleted so deleting this page too, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
2019-10-15 18:10:50 +08:00
self - > freeBtreePage ( rootID , writeVersion ) ;
2019-09-28 06:08:05 +08:00
return results ;
2017-06-10 05:56:41 +08:00
}
2019-06-04 19:03:52 +08:00
else {
2019-09-28 06:08:05 +08:00
debug_printf ( " %s Internal page modified, creating replacements. \n " , context . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s newChildren=%s lastUpperBound=%s upperBound=%s \n " , context . c_str ( ) , toString ( pageBuilder . entries ) . c_str ( ) , pageBuilder . lastUpperBound . toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
ASSERT ( pageBuilder . lastUpperBound = = * upperBound ) ;
2017-06-10 05:56:41 +08:00
2019-09-28 06:08:05 +08:00
// pageBuilder is not a state variable, so holdWhile() keeps its entries alive for the duration of writePages().
Standalone < VectorRef < RedwoodRecordRef > > childEntries = wait ( holdWhile ( pageBuilder . entries , writePages ( self , false , lowerBound , upperBound , pageBuilder . entries , 0 , page - > height , writeVersion , rootID ) ) ) ;
2017-06-10 05:56:41 +08:00
2019-09-28 06:08:05 +08:00
results . arena ( ) . dependsOn ( childEntries . arena ( ) ) ;
// NOTE(review): the leaf path reports writeVersion in its result while this path reports 0 even though the
// page was written at writeVersion - confirm this is intentional.
results . push_back ( results . arena ( ) , VersionAndChildrenRef ( 0 , childEntries , * upperBound ) ) ;
debug_printf ( " %s Internal modified, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
return results ;
2019-06-04 19:03:52 +08:00
}
}
else {
2019-09-28 06:08:05 +08:00
results . push_back_deep ( results . arena ( ) , VersionAndChildrenRef ( 0 , VectorRef < RedwoodRecordRef > ( ( RedwoodRecordRef * ) decodeLowerBound , 1 ) , * decodeUpperBound ) ) ;
debug_printf ( " %s Page has no changes, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
return results ;
2017-06-10 05:56:41 +08:00
}
}
}
// Commits the mutation buffer that is current at the time of the call, at the write version
// that is current at the time of the call.
//
// Sequence (order matters because of the wait points):
//   1. Detach self->m_pBuffer and snapshot m_writeVersion so later writers cannot affect this commit.
//   2. Swap m_latestCommit for this commit's future and wait for the previously started commit.
//   3. Push the new oldest version to the pager and start incrementalSubtreeClear() in the background.
//   4. Run commitSubtree() on the root; write a new empty root if nothing came back, or build new
//      root level(s) if more than one child record came back.
//   5. Stop and drain the lazy delete work, flush the lazy delete queue, store the header as the
//      pager meta key and commit the pager.
//   6. Drop the committed mutation buffer, record m_lastCommittedVersion and fulfil the promise
//      the next commit is waiting on.
//
// self: the tree being committed.
// Returns: Void once the pager commit has finished and the `committed` promise has been sent.
ACTOR static Future<Void> commit_impl(VersionedBTree *self) {
	state MutationBufferT *mutations = self->m_pBuffer;

	// No more mutations are allowed to be written to this mutation buffer we will commit
	// at m_writeVersion, which we must save locally because it could change during commit.
	self->m_pBuffer = nullptr;
	state Version writeVersion = self->m_writeVersion;

	// The latest mutation buffer start version is the one we will now (or eventually) commit.
	state Version mutationBufferStartVersion = self->m_mutationBuffers.rbegin()->first;

	// Replace the lastCommit future with a new one and then wait on the old one
	state Promise<Void> committed;
	Future<Void> previousCommit = self->m_latestCommit;
	self->m_latestCommit = committed.getFuture();

	// Wait for the latest commit that started to be finished.
	wait(previousCommit);

	self->m_pager->setOldestVersion(self->m_newOldestVersion);
	debug_printf(" %s: Beginning commit of version % " PRId64 " , new oldest version set to % " PRId64 " \n ", self->m_name.c_str(), writeVersion, self->m_newOldestVersion);

	// Start lazy subtree deletion; it is told to stop through lazyDeleteStop once the new root is known.
	state bool lazyDeleteStop = false;
	state Future<int> lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop);

	// Get the latest version from the pager, which is what we will read at
	state Version latestVersion = self->m_pager->getLatestVersion();
	debug_printf(" %s: pager latestVersion % " PRId64 " \n ", self->m_name.c_str(), latestVersion);

	state Standalone<BTreePageID> rootPageID = self->m_header.root.get();
	state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID);
	Standalone<VersionedChildrenT> versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd));
	debug_printf(" CommitSubtree(root %s) returned %s \n ", toString(rootPageID).c_str(), toString(versionedRoots).c_str());

	// CommitSubtree on the root can only return 1 child at most because the pager interface only supports writing
	// one meta record (which contains the root page) per commit.
	ASSERT(versionedRoots.size() <= 1);

	// If the old root was deleted, write a new empty tree root node and free the old roots
	if(versionedRoots.empty()) {
		debug_printf(" Writing new empty root. \n ");
		LogicalPageID newRootID = wait(self->m_pager->newPageID());
		Reference<IPage> page = self->m_pager->newPageBuffer();
		makeEmptyRoot(page);
		self->m_header.height = 1;
		self->m_pager->updatePage(newRootID, page);
		rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1);
	}
	else {
		Standalone<VectorRef<RedwoodRecordRef>> newRootLevel(versionedRoots.front().children, versionedRoots.arena());
		if(newRootLevel.size() == 1) {
			rootPageID = newRootLevel.front().getChildPage();
		}
		else {
			// If the new root level's size is not 1 then build new root level(s)
			Standalone<VectorRef<RedwoodRecordRef>> newRootPage = wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height));
			rootPageID = newRootPage.front().getChildPage();
		}
	}

	self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header));

	// Signal the background clear to stop and wait for it before the queue state is captured.
	lazyDeleteStop = true;
	wait(success(lazyDelete));
	// lazyDelete yields an int, so it is printed with %d.
	debug_printf(" Lazy delete freed %d pages \n ", lazyDelete.get());

	self->m_pager->setCommitVersion(writeVersion);

	wait(self->m_lazyDeleteQueue.flush());
	self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState();

	debug_printf(" Setting metakey \n ");
	self->m_pager->setMetaKey(self->m_header.asKeyRef());

	debug_printf(" %s: Committing pager % " PRId64 " \n ", self->m_name.c_str(), writeVersion);
	wait(self->m_pager->commit());
	debug_printf(" %s: Committed version % " PRId64 " \n ", self->m_name.c_str(), writeVersion);

	// Now that everything is committed we must delete the mutation buffer.
	// Our buffer's start version should be the oldest mutation buffer version in the map.
	ASSERT(mutationBufferStartVersion == self->m_mutationBuffers.begin()->first);
	self->m_mutationBuffers.erase(self->m_mutationBuffers.begin());

	self->m_lastCommittedVersion = writeVersion;
	++counts.commits;
	committed.send(Void());

	return Void();
}
2019-02-21 18:46:30 +08:00
// InternalCursor is for seeking to and iterating over the 'internal' records (not user-visible) in the Btree.
// These records are versioned and they can represent deletedness or partial values.
struct InternalCursor {
private :
// Each InternalCursor's position is represented by a reference counted PageCursor, which links
// to its parent PageCursor, up to a PageCursor representing a cursor on the root page.
// PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead
struct PageCursor : ReferenceCounted < PageCursor > , FastAllocated < PageCursor > {
Reference < PageCursor > parent ;
2019-09-28 06:08:05 +08:00
BTreePageID pageID ; // Only needed for debugging purposes
2019-02-21 18:46:30 +08:00
Reference < const IPage > page ;
BTreePage : : BinaryTree : : Cursor cursor ;
2017-09-15 20:19:39 +08:00
2019-09-28 06:08:05 +08:00
// id will normally reference memory owned by the parent, which is okay because a reference to the parent
// will be held in the cursor
PageCursor ( BTreePageID id , Reference < const IPage > page , Reference < PageCursor > parent = { } )
2019-02-21 18:46:30 +08:00
: pageID ( id ) , page ( page ) , parent ( parent ) , cursor ( getReader ( ) . getCursor ( ) )
{
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
PageCursor ( const PageCursor & toCopy ) : parent ( toCopy . parent ) , pageID ( toCopy . pageID ) , page ( toCopy . page ) , cursor ( toCopy . cursor ) {
}
// Convenience method for copying a PageCursor
Reference < PageCursor > copy ( ) const {
return Reference < PageCursor > ( new PageCursor ( * this ) ) ;
}
// Multiple InternalCursors can share a Page
BTreePage : : BinaryTree : : Reader & getReader ( ) const {
return * ( BTreePage : : BinaryTree : : Reader * ) page - > userData ;
}
bool isLeaf ( ) const {
const BTreePage * p = ( ( const BTreePage * ) page - > begin ( ) ) ;
return p - > isLeaf ( ) ;
}
2019-08-07 17:36:33 +08:00
Future < Reference < PageCursor > > getChild ( Reference < IPagerSnapshot > pager ) {
2019-02-21 18:46:30 +08:00
ASSERT ( ! isLeaf ( ) ) ;
BTreePage : : BinaryTree : : Cursor next = cursor ;
next . moveNext ( ) ;
const RedwoodRecordRef & rec = cursor . get ( ) ;
2019-09-28 06:08:05 +08:00
BTreePageID id = rec . getChildPage ( ) ;
2019-08-07 17:36:33 +08:00
Future < Reference < const IPage > > child = readPage ( pager , id , & rec , & next . getOrUpperBound ( ) ) ;
2019-02-21 18:46:30 +08:00
return map ( child , [ = ] ( Reference < const IPage > page ) {
return Reference < PageCursor > ( new PageCursor ( id , page , Reference < PageCursor > : : addRef ( this ) ) ) ;
} ) ;
}
std : : string toString ( ) const {
2019-10-15 18:10:50 +08:00
return format ( " %s, %s " , : : toString ( pageID ) . c_str ( ) , cursor . valid ( ) ? cursor . get ( ) . toString ( ) . c_str ( ) : " <invalid> " ) ;
2019-02-21 18:46:30 +08:00
}
} ;
2019-09-28 06:08:05 +08:00
Standalone < BTreePageID > rootPageID ;
2019-02-21 18:46:30 +08:00
Reference < IPagerSnapshot > pager ;
Reference < PageCursor > pageCursor ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
public :
InternalCursor ( ) {
2017-09-15 20:19:39 +08:00
}
2019-09-28 06:08:05 +08:00
InternalCursor ( Reference < IPagerSnapshot > pager , BTreePageID root )
2019-08-07 17:36:33 +08:00
: pager ( pager ) , rootPageID ( root ) {
2019-02-21 18:46:30 +08:00
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2018-06-12 16:43:19 +08:00
std : : string r ;
2019-04-30 08:00:29 +08:00
2019-02-21 18:46:30 +08:00
Reference < PageCursor > c = pageCursor ;
2019-04-30 08:00:29 +08:00
int maxDepth = 0 ;
2019-02-21 18:46:30 +08:00
while ( c ) {
2019-04-30 08:00:29 +08:00
c = c - > parent ;
+ + maxDepth ;
}
c = pageCursor ;
int depth = maxDepth ;
while ( c ) {
r = format ( " [%d/%d: %s] " , depth - - , maxDepth , c - > toString ( ) . c_str ( ) ) + r ;
2019-02-21 18:46:30 +08:00
c = c - > parent ;
2018-06-12 16:43:19 +08:00
}
return r ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is a valid leaf page record
bool valid ( ) const {
return pageCursor & & pageCursor - > isLeaf ( ) & & pageCursor - > cursor . valid ( ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is valid() and has a present record value
bool present ( ) {
return valid ( ) & & pageCursor - > cursor . get ( ) . value . present ( ) ;
}
2018-07-15 04:37:52 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is present() and has an effective version <= v
bool presentAtVersion ( Version v ) {
return present ( ) & & pageCursor - > cursor . get ( ) . version < = v ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is present() and has an effective version <= v
bool validAtVersion ( Version v ) {
return valid ( ) & & pageCursor - > cursor . get ( ) . version < = v ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
const RedwoodRecordRef & get ( ) const {
return pageCursor - > cursor . get ( ) ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// Ensure that pageCursor is not shared with other cursors so we can modify it
void ensureUnshared ( ) {
if ( ! pageCursor - > isSoleOwner ( ) ) {
pageCursor = pageCursor - > copy ( ) ;
2017-09-15 20:19:39 +08:00
}
}
2019-02-21 18:46:30 +08:00
Future < Void > moveToRoot ( ) {
// If pageCursor exists follow parent links to the root
if ( pageCursor ) {
while ( pageCursor - > parent ) {
pageCursor = pageCursor - > parent ;
}
return Void ( ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// Otherwise read the root page
2019-08-07 17:36:33 +08:00
Future < Reference < const IPage > > root = readPage ( pager , rootPageID , & dbBegin , & dbEnd ) ;
2019-02-21 18:46:30 +08:00
return map ( root , [ = ] ( Reference < const IPage > p ) {
pageCursor = Reference < PageCursor > ( new PageCursor ( rootPageID , p ) ) ;
return Void ( ) ;
} ) ;
}
2018-06-12 16:43:19 +08:00
2019-02-21 18:46:30 +08:00
ACTOR Future < bool > seekLessThanOrEqual_impl ( InternalCursor * self , RedwoodRecordRef query ) {
Future < Void > f = self - > moveToRoot ( ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// f will almost always be ready
if ( ! f . isReady ( ) ) {
wait ( f ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
self - > ensureUnshared ( ) ;
loop {
2019-05-22 10:16:32 +08:00
bool success = self - > pageCursor - > cursor . seekLessThanOrEqual ( query ) ;
// Skip backwards over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move again
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = self - > pageCursor - > cursor . movePrev ( ) ;
}
}
if ( success ) {
2019-02-21 18:46:30 +08:00
// If we found a record <= query at a leaf page then return success
if ( self - > pageCursor - > isLeaf ( ) ) {
return true ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
2019-08-07 17:36:33 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager ) ) ;
2019-02-21 18:46:30 +08:00
self - > pageCursor = child ;
2017-09-15 20:19:39 +08:00
}
else {
2019-02-21 18:46:30 +08:00
// No records <= query on this page, so move to immediate previous record at leaf level
bool success = wait ( self - > move ( false ) ) ;
return success ;
2017-09-15 20:19:39 +08:00
}
}
}
2019-02-21 18:46:30 +08:00
Future < bool > seekLTE ( RedwoodRecordRef query ) {
return seekLessThanOrEqual_impl ( this , query ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
ACTOR Future < bool > move_impl ( InternalCursor * self , bool forward ) {
// Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved
while ( 1 ) {
self - > ensureUnshared ( ) ;
bool success = self - > pageCursor - > cursor . valid ( ) & & ( forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ) ;
2017-09-15 20:19:39 +08:00
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move again
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
}
}
2019-02-21 18:46:30 +08:00
// Stop if successful or there's no parent to move to
if ( success | | ! self - > pageCursor - > parent ) {
break ;
2018-06-12 16:43:19 +08:00
}
2019-02-21 18:46:30 +08:00
// Move to parent
self - > pageCursor = self - > pageCursor - > parent ;
2018-06-08 18:32:34 +08:00
}
2019-02-21 18:46:30 +08:00
// If pageCursor not valid we've reached an end of the tree
if ( ! self - > pageCursor - > cursor . valid ( ) ) {
return false ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// While not on a leaf page, move down to get to one.
while ( ! self - > pageCursor - > isLeaf ( ) ) {
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
while ( ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
bool success = forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
if ( ! success ) {
return false ;
}
}
2019-08-07 17:36:33 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager ) ) ;
2019-06-18 09:55:49 +08:00
forward ? child - > cursor . moveFirst ( ) : child - > cursor . moveLast ( ) ;
2019-02-21 18:46:30 +08:00
self - > pageCursor = child ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
return true ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
Future < bool > move ( bool forward ) {
return move_impl ( this , forward ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
Future < bool > moveNext ( ) {
return move_impl ( this , true ) ;
}
Future < bool > movePrev ( ) {
return move_impl ( this , false ) ;
}
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
// Move to the first or last record of the database.
ACTOR Future < bool > move_end ( InternalCursor * self , bool begin ) {
Future < Void > f = self - > moveToRoot ( ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// f will almost always be ready
if ( ! f . isReady ( ) ) {
wait ( f ) ;
2018-06-12 16:43:19 +08:00
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
self - > ensureUnshared ( ) ;
2018-06-08 18:32:34 +08:00
2019-02-21 18:46:30 +08:00
loop {
// Move to first or last record in the page
bool success = begin ? self - > pageCursor - > cursor . moveFirst ( ) : self - > pageCursor - > cursor . moveLast ( ) ;
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move past it
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = begin ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
}
}
2019-02-21 18:46:30 +08:00
// If it worked, return true if we've reached a leaf page otherwise go to the next child
if ( success ) {
if ( self - > pageCursor - > isLeaf ( ) ) {
return true ;
}
2019-05-22 10:16:32 +08:00
2019-08-07 17:36:33 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager ) ) ;
2019-02-21 18:46:30 +08:00
self - > pageCursor = child ;
}
else {
return false ;
}
}
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
Future < bool > moveFirst ( ) {
return move_end ( this , true ) ;
}
Future < bool > moveLast ( ) {
return move_end ( this , false ) ;
}
2017-09-15 20:19:39 +08:00
} ;
// Cursor is for reading and interating over user visible KV pairs at a specific version
2019-02-21 18:46:30 +08:00
// KeyValueRefs returned become invalid once the cursor is moved
class Cursor : public IStoreCursor , public ReferenceCounted < Cursor > , public FastAllocated < Cursor > , NonCopyable {
2017-06-10 05:56:41 +08:00
public :
2019-09-28 06:08:05 +08:00
Cursor ( Reference < IPagerSnapshot > pageSource , BTreePageID root , Version recordVersion )
2019-04-30 08:00:29 +08:00
: m_version ( recordVersion ) ,
2019-08-07 17:36:33 +08:00
m_cur1 ( pageSource , root ) ,
2019-02-21 18:46:30 +08:00
m_cur2 ( m_cur1 )
{
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
void addref ( ) { ReferenceCounted < Cursor > : : addref ( ) ; }
void delref ( ) { ReferenceCounted < Cursor > : : delref ( ) ; }
2017-09-09 16:29:25 +08:00
2019-02-21 18:46:30 +08:00
private :
Version m_version ;
// If kv is valid
// - kv.key references memory held by cur1
// - If cur1 points to a non split KV pair
// - kv.value references memory held by cur1
// - cur2 points to the next internal record after cur1
// Else
// - kv.value references memory in arena
// - cur2 points to the first internal record of the split KV pair
InternalCursor m_cur1 ;
InternalCursor m_cur2 ;
Arena m_arena ;
Optional < KeyValueRef > m_kv ;
public :
virtual Future < Void > findEqual ( KeyRef key ) { return find_impl ( this , key , true , 0 ) ; }
virtual Future < Void > findFirstEqualOrGreater ( KeyRef key , bool needValue , int prefetchNextBytes ) { return find_impl ( this , key , needValue , 1 ) ; }
virtual Future < Void > findLastLessOrEqual ( KeyRef key , bool needValue , int prefetchPriorBytes ) { return find_impl ( this , key , needValue , - 1 ) ; }
virtual Future < Void > next ( bool needValue ) { return move ( this , true , needValue ) ; }
virtual Future < Void > prev ( bool needValue ) { return move ( this , false , needValue ) ; }
2017-06-10 05:56:41 +08:00
virtual bool isValid ( ) {
return m_kv . present ( ) ;
}
virtual KeyRef getKey ( ) {
return m_kv . get ( ) . key ;
}
2019-02-21 18:46:30 +08:00
2017-06-10 05:56:41 +08:00
virtual ValueRef getValue ( ) {
return m_kv . get ( ) . value ;
}
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2018-06-12 16:43:19 +08:00
std : : string r ;
2019-05-29 21:23:32 +08:00
r + = format ( " Cursor(%p) ver: % " PRId64 " " , this , m_version ) ;
2019-02-24 19:47:32 +08:00
if ( m_kv . present ( ) ) {
r + = format ( " KV: '%s' -> '%s' \n " , m_kv . get ( ) . key . printable ( ) . c_str ( ) , m_kv . get ( ) . value . printable ( ) . c_str ( ) ) ;
}
else {
2019-04-30 08:00:29 +08:00
r + = " KV: <np> \n " ;
2019-02-24 19:47:32 +08:00
}
2019-02-21 18:46:30 +08:00
r + = format ( " Cur1: %s \n " , m_cur1 . toString ( ) . c_str ( ) ) ;
r + = format ( " Cur2: %s \n " , m_cur2 . toString ( ) . c_str ( ) ) ;
2019-02-24 19:47:32 +08:00
2018-06-12 16:43:19 +08:00
return r ;
}
2017-09-15 20:19:39 +08:00
private :
2018-07-23 18:09:13 +08:00
// find key in tree closest to or equal to key (at this cursor's version)
2017-09-15 20:19:39 +08:00
// for less than or equal use cmp < 0
// for greater than or equal use cmp > 0
// for equal use cmp == 0
2019-02-21 18:46:30 +08:00
ACTOR static Future < Void > find_impl ( Cursor * self , KeyRef key , bool needValue , int cmp ) {
2018-07-23 18:09:13 +08:00
// Search for the last key at or before (key, version, \xff)
2019-05-22 10:16:32 +08:00
state RedwoodRecordRef query ( key , self - > m_version , { } , 0 , std : : numeric_limits < int32_t > : : max ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_kv . reset ( ) ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
wait ( success ( self - > m_cur1 . seekLTE ( query ) ) ) ;
debug_printf ( " find%sE(%s): %s \n " , cmp > 0 ? " GT " : ( cmp = = 0 ? " " : " LT " ) , query . toString ( ) . c_str ( ) , self - > toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
// If we found the target key with a present value then return it as it is valid for any cmp type
if ( self - > m_cur1 . present ( ) & & self - > m_cur1 . get ( ) . key = = key ) {
debug_printf ( " Target key found, reading full KV pair. Cursor: %s \n " , self - > toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > readFullKVPair ( self ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-09-09 16:29:25 +08:00
}
2019-02-21 18:46:30 +08:00
// Mode is ==, so if we're still here we didn't find it.
2017-09-15 20:19:39 +08:00
if ( cmp = = 0 ) {
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2019-02-21 18:46:30 +08:00
// Mode is >=, so if we're here we have to go to the next present record at the target version
// because the seek done above was <= query
2017-09-15 20:19:39 +08:00
if ( cmp > 0 ) {
2019-02-21 18:46:30 +08:00
// icur is at a record < query or invalid.
// If cursor is invalid, try to go to start of tree
if ( ! self - > m_cur1 . valid ( ) ) {
bool valid = wait ( self - > m_cur1 . moveFirst ( ) ) ;
if ( ! valid ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
}
else {
loop {
bool valid = wait ( self - > m_cur1 . move ( true ) ) ;
if ( ! valid ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
if ( self - > m_cur1 . get ( ) . key > key ) {
break ;
}
}
2017-09-16 16:45:39 +08:00
}
2019-02-21 18:46:30 +08:00
2017-09-16 16:45:39 +08:00
// Get the next present key at the target version. Handles invalid cursor too.
2018-09-20 18:39:55 +08:00
wait ( self - > next ( needValue ) ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-16 16:45:39 +08:00
else if ( cmp < 0 ) {
2019-02-21 18:46:30 +08:00
// Mode is <=, which is the same as the seekLTE(query)
if ( ! self - > m_cur1 . valid ( ) ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
2017-09-17 19:38:01 +08:00
// Move to previous present kv pair at the target version
2018-09-20 18:39:55 +08:00
wait ( self - > prev ( needValue ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2019-02-21 18:46:30 +08:00
// TODO: use needValue
ACTOR static Future < Void > move ( Cursor * self , bool fwd , bool needValue ) {
debug_printf ( " Cursor::move(%d): Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
ASSERT ( self - > m_cur1 . valid ( ) ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// If kv is present then the key/version at cur1 was already returned so move to a new key
// Move cur1 until failure or a new key is found, keeping prior record visited in cur2
2017-09-16 16:45:39 +08:00
if ( self - > m_kv . present ( ) ) {
2019-02-21 18:46:30 +08:00
ASSERT ( self - > m_cur1 . valid ( ) ) ;
loop {
self - > m_cur2 = self - > m_cur1 ;
bool valid = wait ( self - > m_cur1 . move ( fwd ) ) ;
if ( ! valid | | self - > m_cur1 . get ( ) . key ! = self - > m_cur2 . get ( ) . key ) {
break ;
}
2017-09-16 16:45:39 +08:00
}
}
2019-02-21 18:46:30 +08:00
// Given two consecutive cursors c1 and c2, c1 represents a returnable record if
// c1.presentAtVersion(v) || (!c2.validAtVersion() || c2.get().key != c1.get().key())
// Note the distinction between 'present' and 'valid'. Present means the value for the key
// exists at the version (but could be the empty string) while valid just means the internal
// record is in effect at that version but it could indicate that the key was cleared and
// no longer exists from the user's perspective at that version
//
2019-02-24 19:47:32 +08:00
// cur2 must be the record immediately after cur1
// TODO: This may already be the case, store state to track this condition and avoid the reset here
if ( self - > m_cur1 . valid ( ) ) {
2019-02-21 18:46:30 +08:00
self - > m_cur2 = self - > m_cur1 ;
wait ( success ( self - > m_cur2 . move ( true ) ) ) ;
}
while ( self - > m_cur1 . valid ( ) ) {
if ( self - > m_cur1 . presentAtVersion ( self - > m_version ) & &
( ! self - > m_cur2 . validAtVersion ( self - > m_version ) | |
self - > m_cur2 . get ( ) . key ! = self - > m_cur1 . get ( ) . key )
2017-09-16 08:27:13 +08:00
) {
2018-09-20 18:39:55 +08:00
wait ( readFullKVPair ( self ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
if ( fwd ) {
// Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Moving forward, Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_cur1 = self - > m_cur2 ;
wait ( success ( self - > m_cur2 . move ( true ) ) ) ;
}
else {
// Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Moving backward, Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_cur2 = self - > m_cur1 ;
wait ( success ( self - > m_cur1 . move ( false ) ) ) ;
}
2017-06-10 05:56:41 +08:00
}
2017-09-16 08:27:13 +08:00
2019-02-21 18:46:30 +08:00
self - > m_kv . reset ( ) ;
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Exit, end of db reached. Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
// Read all of the current key-value record starting at cur1 into kv
ACTOR static Future < Void > readFullKVPair ( Cursor * self ) {
self - > m_arena = Arena ( ) ;
const RedwoodRecordRef & rec = self - > m_cur1 . get ( ) ;
2019-10-02 21:43:11 +08:00
self - > m_kv . reset ( ) ;
2019-02-21 18:46:30 +08:00
debug_printf ( " readFullKVPair: Starting at %s \n " , self - > toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
2019-02-21 18:46:30 +08:00
// Unsplit value, cur1 will hold the key and value memory
if ( ! rec . isMultiPart ( ) ) {
2019-02-24 19:47:32 +08:00
self - > m_kv = KeyValueRef ( rec . key , rec . value . get ( ) ) ;
2019-02-21 18:46:30 +08:00
debug_printf ( " readFullKVPair: Unsplit, exit. %s \n " , self - > toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
2019-02-21 18:46:30 +08:00
return Void ( ) ;
2017-09-17 19:38:01 +08:00
}
2019-10-28 19:00:37 +08:00
debug_printf ( " readFullKVPair: Split, first record %s \n " , rec . toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
// Split value, need to coalesce split value parts into a buffer in arena,
// after which cur1 will point to the first part and kv.key will reference its key
2019-04-30 08:00:29 +08:00
ASSERT ( rec . chunk . start + rec . value . get ( ) . size ( ) = = rec . chunk . total ) ;
2017-09-17 19:38:01 +08:00
2019-02-21 18:46:30 +08:00
// Allocate space for the entire value in the same arena as the key
2019-04-30 08:00:29 +08:00
state int bytesLeft = rec . chunk . total ;
2019-02-21 18:46:30 +08:00
state StringRef dst = makeString ( bytesLeft , self - > m_arena ) ;
2018-07-23 18:09:13 +08:00
2019-02-21 18:46:30 +08:00
loop {
const RedwoodRecordRef & rec = self - > m_cur1 . get ( ) ;
2017-09-09 16:29:25 +08:00
2019-02-21 18:46:30 +08:00
debug_printf ( " readFullKVPair: Adding chunk %s \n " , rec . toString ( ) . c_str ( ) ) ;
int partSize = rec . value . get ( ) . size ( ) ;
2019-04-30 08:00:29 +08:00
memcpy ( mutateString ( dst ) + rec . chunk . start , rec . value . get ( ) . begin ( ) , partSize ) ;
2019-02-21 18:46:30 +08:00
bytesLeft - = partSize ;
if ( bytesLeft = = 0 ) {
self - > m_kv = KeyValueRef ( rec . key , dst ) ;
return Void ( ) ;
2017-09-09 16:29:25 +08:00
}
2019-02-21 18:46:30 +08:00
ASSERT ( bytesLeft > 0 ) ;
// Move backward
bool success = wait ( self - > m_cur1 . move ( false ) ) ;
ASSERT ( success ) ;
2017-09-09 16:29:25 +08:00
}
2017-06-10 05:56:41 +08:00
}
} ;
2019-02-21 18:46:30 +08:00
2017-06-10 05:56:41 +08:00
} ;
2019-02-21 18:46:30 +08:00
// Out-of-class definitions of VersionedBTree's static data members.
// dbBegin: record built from an empty key and 0 (presumably the version - confirm against the
// RedwoodRecordRef constructor). Passed as the lower boundary when the root page is read or committed.
RedwoodRecordRef VersionedBTree : : dbBegin ( StringRef ( ) , 0 ) ;
2019-05-30 17:10:07 +08:00
// dbEnd: record built from a literal key of \xff bytes. Passed as the upper boundary when the root
// page is read or committed.
RedwoodRecordRef VersionedBTree : : dbEnd ( LiteralStringRef ( " \xff \xff \xff \xff \xff " ) ) ;
2019-03-15 15:46:09 +08:00
// counts: the single shared Counts instance; commit_impl increments counts.commits after each commit.
VersionedBTree : : Counts VersionedBTree : : counts ;
2017-08-23 02:30:44 +08:00
2017-09-22 14:51:55 +08:00
class KeyValueStoreRedwoodUnversioned : public IKeyValueStore {
2017-09-21 19:43:49 +08:00
public :
2017-09-22 14:51:55 +08:00
KeyValueStoreRedwoodUnversioned ( std : : string filePrefix , UID logID ) : m_filePrefix ( filePrefix ) {
2018-10-25 06:57:06 +08:00
// TODO: This constructor should really just take an IVersionedStore
2019-11-04 19:04:03 +08:00
IPager2 * pager = new DWALPager ( 4096 , filePrefix , 0 ) ;
2019-08-07 17:36:33 +08:00
m_tree = new VersionedBTree ( pager , filePrefix , true ) ;
2018-10-25 06:57:06 +08:00
m_init = catchError ( init_impl ( this ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual Future < Void > init ( ) {
return m_init ;
}
ACTOR Future < Void > init_impl ( KeyValueStoreRedwoodUnversioned * self ) {
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInit " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > m_tree - > init ( ) ) ;
2019-10-23 08:17:29 +08:00
Version v = self - > m_tree - > getLatestVersion ( ) ;
2017-09-21 19:43:49 +08:00
self - > m_tree - > setWriteVersion ( v + 1 ) ;
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInitComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2017-09-21 19:43:49 +08:00
return Void ( ) ;
}
2017-10-02 18:32:22 +08:00
ACTOR void shutdown ( KeyValueStoreRedwoodUnversioned * self , bool dispose ) {
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdown " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2018-10-25 06:57:06 +08:00
if ( self - > m_error . canBeSet ( ) ) {
self - > m_error . sendError ( actor_cancelled ( ) ) ; // Ideally this should be shutdown_in_progress
}
2017-09-23 08:18:28 +08:00
self - > m_init . cancel ( ) ;
2018-10-25 06:57:06 +08:00
Future < Void > closedFuture = self - > m_tree - > onClosed ( ) ;
2017-10-02 18:32:22 +08:00
if ( dispose )
2018-10-25 06:57:06 +08:00
self - > m_tree - > dispose ( ) ;
2017-10-02 18:32:22 +08:00
else
2018-10-25 06:57:06 +08:00
self - > m_tree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-09-21 19:43:49 +08:00
self - > m_closed . send ( Void ( ) ) ;
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdownComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2017-10-02 18:32:22 +08:00
delete self ;
2017-09-21 19:43:49 +08:00
}
virtual void close ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , false ) ;
2017-09-21 19:43:49 +08:00
}
virtual void dispose ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , true ) ;
2017-09-21 19:43:49 +08:00
}
virtual Future < Void > onClosed ( ) {
return m_closed . getFuture ( ) ;
}
Future < Void > commit ( bool sequential = false ) {
2017-10-10 04:24:16 +08:00
Future < Void > c = m_tree - > commit ( ) ;
2019-10-23 08:17:29 +08:00
m_tree - > setOldestVersion ( m_tree - > getLatestVersion ( ) ) ;
2017-10-10 04:24:16 +08:00
m_tree - > setWriteVersion ( m_tree - > getWriteVersion ( ) + 1 ) ;
2018-10-25 06:57:06 +08:00
return catchError ( c ) ;
2017-09-21 19:43:49 +08:00
}
virtual KeyValueStoreType getType ( ) {
2017-09-22 14:51:55 +08:00
return KeyValueStoreType : : SSD_REDWOOD_V1 ;
2017-09-21 19:43:49 +08:00
}
virtual StorageBytes getStorageBytes ( ) {
2018-10-25 06:57:06 +08:00
return m_tree - > getStorageBytes ( ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-25 06:57:06 +08:00
virtual Future < Void > getError ( ) {
return delayed ( m_error . getFuture ( ) ) ;
} ;
2017-09-21 19:43:49 +08:00
// Clears `range` in the tree. The `arena` argument is not used here.
void clear(KeyRangeRef range, const Arena* arena = 0) {
	debug_printf(" CLEAR %s \n ", printable(range).c_str());

	m_tree->clear(range);
}
// Writes `keyValue` into the tree. The `arena` argument is not used here.
virtual void set(KeyValueRef keyValue, const Arena* arena = NULL) {
	debug_printf(" SET %s \n ", keyValue.key.printable().c_str());

	m_tree->set(keyValue);
}
2019-03-15 15:46:09 +08:00
// Public range-read entry point: runs readRange_impl() and routes its result
// through catchError() so failures are also forwarded to m_error.
// A negative rowLimit makes readRange_impl() read the range in reverse.
virtual Future<Standalone<VectorRef<KeyValueRef>>> readRange(KeyRangeRef keys, int rowLimit = 1 << 30, int byteLimit = 1 << 30) {
	debug_printf(" READRANGE %s \n ", printable(keys).c_str());

	Future<Standalone<VectorRef<KeyValueRef>>> rows = readRange_impl(this, keys, rowLimit, byteLimit);
	return catchError(rows);
}
2018-10-26 10:48:31 +08:00
// Reads [keys.begin, keys.end) at the tree's last committed version.
//   rowLimit >= 0 : forward scan from keys.begin.
//   rowLimit <  0 : reverse scan starting from the last key below keys.end.
// In both directions the scan stops when the row counter reaches zero or the
// accumulated expectedSize() of the returned pairs reaches byteLimit.
// Keys and values are copied into the result's own arena.
ACTOR static Future<Standalone<VectorRef<KeyValueRef>>> readRange_impl(KeyValueStoreRedwoodUnversioned* self, KeyRange keys, int rowLimit, int byteLimit) {
	self->m_tree->counts.getRanges++;

	state Standalone<VectorRef<KeyValueRef>> rows;
	state int bytesSoFar = 0;
	ASSERT(byteLimit > 0);

	state Reference<IStoreCursor> cursor = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());

	if(rowLimit < 0) {
		// Reverse scan: position at or before keys.end, then step off keys.end
		// itself because the range end is exclusive.
		wait(cursor->findLastLessOrEqual(keys.end, true, 0));
		if(cursor->isValid() && cursor->getKey() == keys.end)
			wait(cursor->prev(true));

		while(cursor->isValid() && cursor->getKey() >= keys.begin) {
			KeyValueRef pair(KeyRef(rows.arena(), cursor->getKey()), ValueRef(rows.arena(), cursor->getValue()));
			bytesSoFar += pair.expectedSize();
			rows.push_back(rows.arena(), pair);

			// rowLimit is negative here, so it counts up towards zero.
			if(++rowLimit == 0 || bytesSoFar >= byteLimit) {
				break;
			}
			wait(cursor->prev(true));
		}
	} else {
		// Forward scan.
		wait(cursor->findFirstEqualOrGreater(keys.begin, true, 0));

		while(cursor->isValid() && cursor->getKey() < keys.end) {
			KeyValueRef pair(KeyRef(rows.arena(), cursor->getKey()), ValueRef(rows.arena(), cursor->getValue()));
			bytesSoFar += pair.expectedSize();
			rows.push_back(rows.arena(), pair);

			if(--rowLimit == 0 || bytesSoFar >= byteLimit) {
				break;
			}
			wait(cursor->next(true));
		}
	}

	return rows;
}
2018-10-26 10:48:31 +08:00
// Looks up `key` at the tree's last committed version. Returns the value when
// the cursor is valid after findEqual(), otherwise an empty Optional.
// `debugID` is not used.
ACTOR static Future<Optional<Value>> readValue_impl(KeyValueStoreRedwoodUnversioned* self, Key key, Optional<UID> debugID) {
	self->m_tree->counts.gets++;

	state Reference<IStoreCursor> cursor = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
	wait(cursor->findEqual(key));

	if(!cursor->isValid()) {
		return Optional<Value>();
	}
	return cursor->getValue();
}
// Public point-read entry point: runs readValue_impl() and routes the result
// through catchError() so failures are also forwarded to m_error.
virtual Future<Optional<Value>> readValue(KeyRef key, Optional<UID> debugID = Optional<UID>()) {
	Future<Optional<Value>> lookup = readValue_impl(this, key, debugID);
	return catchError(lookup);
}
2018-10-26 10:48:31 +08:00
// Like readValue_impl(), but returns at most the first `maxLength` bytes of
// the value. Returns an empty Optional when the cursor is not valid after
// findEqual(). `debugID` is not used.
ACTOR static Future<Optional<Value>> readValuePrefix_impl(KeyValueStoreRedwoodUnversioned* self, Key key, int maxLength, Optional<UID> debugID) {
	self->m_tree->counts.gets++;

	state Reference<IStoreCursor> cursor = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion());
	wait(cursor->findEqual(key));

	if(!cursor->isValid()) {
		return Optional<Value>();
	}

	// Take the prefix from the copy already fetched to measure the length.
	Value whole = cursor->getValue();
	int prefixLen = std::min(whole.size(), maxLength);
	return Value(whole.substr(0, prefixLen));
}
// Public prefix-read entry point: runs readValuePrefix_impl() and routes the
// result through catchError() so failures are also forwarded to m_error.
virtual Future<Optional<Value>> readValuePrefix(KeyRef key, int maxLength, Optional<UID> debugID = Optional<UID>()) {
	Future<Optional<Value>> lookup = readValuePrefix_impl(this, key, maxLength, debugID);
	return catchError(lookup);
}
2017-09-22 14:51:55 +08:00
// Intentionally empty. The object is deleted at the end of shutdown().
virtual ~KeyValueStoreRedwoodUnversioned() {};
private :
// Reported as the FilePrefix detail in the shutdown trace events.
std : : string m_filePrefix ;
// The tree that every read, write, commit and shutdown call is delegated to.
VersionedBTree * m_tree ;
// Cancelled by shutdown() before the tree is closed or disposed.
Future < Void > m_init ;
// Fulfilled by shutdown() after the tree's onClosed() future is ready; exposed through onClosed().
Promise < Void > m_closed ;
2017-10-02 18:32:22 +08:00
// Error channel exposed through getError(); fed by catchError() and set to actor_cancelled by shutdown() if still unset.
Promise < Void > m_error ;
2018-10-25 06:57:06 +08:00
// Wraps `f` with forwardError() so that an error from `f` is also delivered
// to m_error (and therefore to getError()).
template <typename T>
inline Future<T> catchError(Future<T> f) {
	Future<T> guarded = forwardError(f, m_error);
	return guarded;
}
2017-09-21 19:43:49 +08:00
} ;
2017-09-22 14:51:55 +08:00
// Factory for the Redwood key-value store. The caller receives a raw pointer
// to a heap-allocated KeyValueStoreRedwoodUnversioned.
IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename, UID logID) {
	IKeyValueStore* store = new KeyValueStoreRedwoodUnversioned(filename, logID);
	return store;
}
2018-09-28 07:07:29 +08:00
int randomSize ( int max ) {
2019-06-25 11:17:49 +08:00
int n = pow ( deterministicRandom ( ) - > random01 ( ) , 3 ) * max ;
2018-09-28 07:07:29 +08:00
return n ;
}
2017-09-21 19:43:49 +08:00
2019-06-24 16:05:16 +08:00
// Allocates a string of `len` bytes in `arena` and fills every byte with a
// random character from firstChar through lastChar inclusive (lastChar is
// incremented first because randomInt() is given it as the upper bound).
StringRef randomString(Arena& arena, int len, char firstChar = ' a ', char lastChar = ' z ') {
	++lastChar;

	StringRef s = makeString(len, arena);
	uint8_t* out = (uint8_t*)s.begin();
	for(int pos = 0; pos < len; ++pos) {
		out[pos] = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
	}

	return s;
}
// Convenience overload: builds the random string inside the arena of the
// Standalone that is returned.
Standalone<StringRef> randomString(int len, char firstChar = ' a ', char lastChar = ' z ') {
	Standalone<StringRef> owned;
	StringRef& view = owned;
	view = randomString(owned.arena(), len, firstChar, lastChar);
	return owned;
}
// Builds a random KeyValue for tests.
//   maxKeySize   : the key length is randomSize(1 + maxKeySize).
//   maxValueSize : the value length is randomSize(maxValueSize); when this is
//                  <= 0 or the drawn length is 0, the value is left empty.
// Key and value bytes are drawn from two separate character ranges and both
// are allocated in the returned KeyValue's arena.
//
// The strings used to be filled by randomString() and then overwritten byte
// by byte with a second set of random draws over slightly narrower ranges.
// That second pass was redundant, so randomString() is now the only source of
// the contents.
KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
	int kLen = randomSize(1 + maxKeySize);
	int vLen = maxValueSize > 0 ? randomSize(maxValueSize) : 0;

	KeyValue kv;
	kv.key = randomString(kv.arena(), kLen, ' a ', ' m ');

	if(vLen > 0) {
		kv.value = randomString(kv.arena(), vLen, ' n ', ' z ');
	}

	return kv;
}
2019-04-30 08:00:29 +08:00
// Checks the contents of btree over [start, end) at version v against the expected history in *written.
//
// *written maps (key, version) to the value recorded for that key at that version; an entry whose
// Optional is not present is never accepted as a visible record by the checks below.
//
// The check has two passes:
//   1. Forward: seek to start and walk the tree with next(), matching each tree record against the
//      next entry of *written that is visible at v. Matching records are saved in `results`.
//   2. Reverse: seek to end and walk back with prev(), matching each tree record against `results`
//      in reverse order.
//
// Every mismatch is printed, and increments both the returned error count and *pErrorCount.
// If end <= start, end is replaced with keyAfter(start).
ACTOR Future < int > verifyRange ( VersionedBTree * btree , Key start , Key end , Version v , std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > * written , int * pErrorCount ) {
2017-09-15 20:19:39 +08:00
state int errors = 0 ;
if ( end < = start )
end = keyAfter ( start ) ;
2017-09-16 16:45:39 +08:00
// i walks *written starting at the first entry not ordered before (start, 0).
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator i = written - > lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
2017-09-15 20:19:39 +08:00
// iEnd stops the walk at the first entry ordered after (end, 0).
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iEnd = written - > upper_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
2017-09-16 08:27:13 +08:00
// iLast is the candidate entry examined by the selection loops below; it equals iEnd when no candidate remains.
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iLast ;
2017-09-15 20:19:39 +08:00
2017-09-17 19:38:01 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( v ) ;
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Start cur=%p \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur . getPtr ( ) ) ;
2017-09-17 19:38:01 +08:00
// Randomly use the cursor for something else first.
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > coinflip ( ) ) {
2017-09-21 15:58:56 +08:00
state Key randomKey = randomKV ( ) . key ;
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Dummy seek to '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , randomKey . toString ( ) . c_str ( ) ) ;
2019-05-11 05:01:52 +08:00
wait ( deterministicRandom ( ) - > coinflip ( ) ? cur - > findFirstEqualOrGreater ( randomKey , true , 0 ) : cur - > findLastLessOrEqual ( randomKey , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
}
2018-06-14 19:15:14 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Actual seek \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findFirstEqualOrGreater ( start , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
// Records confirmed by the forward pass, in forward order; replayed backwards by the reverse pass.
state std : : vector < KeyValue > results ;
2017-09-15 20:19:39 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) < end ) {
// Find the next written kv pair that would be present at this version
// An entry is accepted when its version is <= v, it holds a value, and the entry after it
// is the end of the walk, belongs to a different key, or has a version > v.
while ( 1 ) {
iLast = i ;
2017-09-16 08:27:13 +08:00
if ( i = = iEnd )
break ;
+ + i ;
2019-05-22 10:16:32 +08:00
2017-09-16 08:27:13 +08:00
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
2019-05-22 10:16:32 +08:00
) {
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s) Found key in written map: %s \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
break ;
2019-05-22 10:16:32 +08:00
}
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
// The tree has a record but the written map has no more visible entries.
2017-09-15 20:19:39 +08:00
if ( iLast = = iEnd ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( cur - > getKey ( ) ! = iLast - > first . first ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
if ( cur - > getValue ( ) ! = iLast - > second . get ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , iLast - > second . get ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-17 19:38:01 +08:00
2019-05-22 10:16:32 +08:00
// Every error path above leaves the loop with break, so no error can have been counted here.
ASSERT ( errors = = 0 ) ;
2017-09-17 19:38:01 +08:00
results . push_back ( KeyValue ( KeyValueRef ( cur - > getKey ( ) , cur - > getValue ( ) ) ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > next ( true ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
// Make sure there are no further written kv pairs that would be present at this version.
while ( 1 ) {
iLast = i ;
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
}
if ( iLast ! = iEnd ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has @% " PRId64 " '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . second , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
}
2017-09-16 16:45:39 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s): start \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
// Randomly use a new cursor for the reverse range read but only if version history is available
2019-06-25 11:17:49 +08:00
if ( ! btree - > isSingleVersion ( ) & & deterministicRandom ( ) - > coinflip ( ) ) {
2017-09-17 19:38:01 +08:00
cur = btree - > readAtVersion ( v ) ;
}
// Now read the range from the tree in reverse order and compare to the saved results
2018-09-20 18:39:55 +08:00
wait ( cur - > findLastLessOrEqual ( end , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
// end is exclusive, so step back if the seek landed exactly on it.
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = end )
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
state std : : vector < KeyValue > : : const_reverse_iterator r = results . rbegin ( ) ;
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = start ) {
if ( r = = results . rend ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getKey ( ) ! = r - > key ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getValue ( ) ! = r - > value ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , r - > value . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
+ + r ;
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
}
// The reverse walk ended before every forward result was seen again.
if ( r ! = results . rend ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-29 21:23:32 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
}
2017-09-15 20:19:39 +08:00
return errors ;
}
2019-02-24 19:47:32 +08:00
// Point-checks every entry of *written whose version is <= maxCommittedVersion:
// the tree is read at that entry's version with findEqual(key).
//   - Entry holds a value: the cursor must be valid and match key and value.
//   - Entry holds no value: the cursor must not be positioned on that key.
// Each failure is printed and increments both the returned count and
// *pErrorCount.
ACTOR Future<int> verifyAll(VersionedBTree* btree, Version maxCommittedVersion, std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount) {
	state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator entry = written->cbegin();
	state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator entriesEnd = written->cend();
	state int errors = 0;

	for(; entry != entriesEnd; ++entry) {
		state std::string key = entry->first.first;
		state Version ver = entry->first.second;

		if(ver <= maxCommittedVersion) {
			state Optional<std::string> val = entry->second;
			state Reference<IStoreCursor> cur = btree->readAtVersion(ver);

			debug_printf(" Verifying @% " PRId64 " '%s' \n ", ver, key.c_str());
			state Arena arena;
			wait(cur->findEqual(KeyRef(arena, key)));

			if(!val.present()) {
				// The key was cleared at this version, so it must not be found.
				if(cur->isValid() && cur->getKey() == key) {
					++errors;
					++*pErrorCount;
					printf(" Verify ERROR: cleared_key_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), cur->getValue().toString().c_str(), ver);
				}
			} else {
				const bool matches = cur->isValid() && cur->getKey() == key && cur->getValue() == val.get();
				if(!matches) {
					++errors;
					++*pErrorCount;
					// Report the first condition that failed.
					if(!cur->isValid()) {
						printf(" Verify ERROR: key_not_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), val.get().c_str(), ver);
					} else if(cur->getKey() != key) {
						printf(" Verify ERROR: key_incorrect: found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), key.c_str(), ver);
					} else if(cur->getValue() != val.get()) {
						printf(" Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), ver);
					}
				}
			}
		}
	}

	return errors;
}
2019-05-22 10:16:32 +08:00
// Verification driver. For each version received on vStream it starts two
// checks and waits for both:
//   - single-version tree: a verifyRange() over the whole key space and one
//     over a random range, both at the tree's last committed version;
//   - versioned tree: a verifyAll() through the received version and a
//     verifyRange() over a random range at a random version in [1, v].
// With `serial` set, each check is awaited before the next one is started.
// The loop ends as soon as *pErrorCount is non-zero. end_of_stream and
// transaction_too_old end the actor quietly; any other error is rethrown.
ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream, std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount, bool serial) {
	state Future<int> fullCheck;
	state Future<int> rangeCheck;

	try {
		loop {
			state Version v = waitNext(vStream);

			if(!btree->isSingleVersion()) {
				debug_printf(" Verifying through version % " PRId64 " \n ", v);

				fullCheck = verifyAll(btree, v, written, pErrorCount);
				if(serial) {
					wait(success(fullCheck));
				}

				rangeCheck = verifyRange(btree, randomKV().key, randomKV().key, deterministicRandom()->randomInt(1, v + 1), written, pErrorCount);
				if(serial) {
					wait(success(rangeCheck));
				}
			}
			else {
				// No history is available, so only the latest committed state can be checked.
				v = btree->getLastCommittedVersion();
				debug_printf(" Verifying at latest committed version % " PRId64 " \n ", v);

				fullCheck = verifyRange(btree, LiteralStringRef(" "), LiteralStringRef(" \xff \xff "), v, written, pErrorCount);
				if(serial) {
					wait(success(fullCheck));
				}

				rangeCheck = verifyRange(btree, randomKV().key, randomKV().key, v, written, pErrorCount);
				if(serial) {
					wait(success(rangeCheck));
				}
			}

			wait(success(fullCheck) && success(rangeCheck));

			debug_printf(" Verified through version % " PRId64 " , %d errors \n ", v, *pErrorCount);

			if(*pErrorCount != 0)
				break;
		}
	} catch(Error& e) {
		const bool expected = e.code() == error_code_end_of_stream || e.code() == error_code_transaction_too_old;
		if(!expected) {
			throw;
		}
	}

	return Void();
}
2018-10-05 14:46:37 +08:00
// Repeatedly performs random range reads. A transaction_too_old error ends the reader quietly; any other error is rethrown.
ACTOR Future<Void> randomReader(VersionedBTree* btree) {
	try {
		state Reference<IStoreCursor> cursor;

		loop {
			wait(yield());

			// Open a new cursor when there is none yet, and otherwise whenever
			// the random01() sample exceeds .1.
			if(!cursor || deterministicRandom()->random01() > .1) {
				Version readVersion = btree->getLastCommittedVersion();
				// With version history available, read at a random version
				// from 1 up to the last committed one.
				if(!btree->isSingleVersion()) {
					readVersion = deterministicRandom()->randomInt(1, readVersion + 1);
				}
				cursor = btree->readAtVersion(readVersion);
			}

			// Seek to a random key (randomKV(10, 0) produces no value) ...
			state KeyValue probe = randomKV(10, 0);
			wait(cursor->findFirstEqualOrGreater(probe.key, true, 0));

			// ... then step forward up to a random number of records.
			state int stepsLeft = deterministicRandom()->randomInt(0, 100);
			while(cursor->isValid() && stepsLeft-- > 0) {
				wait(success(cursor->next(true)));
				wait(yield());
			}
		}
	}
	catch(Error& e) {
		// transaction_too_old is swallowed; everything else propagates.
		if(e.code() != error_code_transaction_too_old) {
			throw e;
		}
	}

	return Void();
}
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
struct IntIntPair {
IntIntPair ( ) { }
IntIntPair ( int k , int v ) : k ( k ) , v ( v ) { }
IntIntPair ( Arena & arena , const IntIntPair & toCopy ) {
* this = toCopy ;
}
struct Delta {
2019-05-29 21:23:32 +08:00
bool prefixSource ;
2019-02-21 18:46:30 +08:00
int dk ;
int dv ;
2019-05-29 21:23:32 +08:00
IntIntPair apply ( const IntIntPair & base , Arena & arena ) {
return { base . k + dk , base . v + dv } ;
}
void setPrefixSource ( bool val ) {
prefixSource = val ;
}
bool getPrefixSource ( ) const {
return prefixSource ;
2019-02-21 18:46:30 +08:00
}
int size ( ) const {
return sizeof ( Delta ) ;
}
std : : string toString ( ) const {
2019-05-29 21:23:32 +08:00
return format ( " DELTA{prefixSource=%d dk=%d(0x%x) dv = % d ( 0 x % x ) } " , prefixSource, dk, dk, dv, dv) ;
2019-02-21 18:46:30 +08:00
}
} ;
int compare ( const IntIntPair & rhs ) const {
//printf("compare %s to %s\n", toString().c_str(), rhs.toString().c_str());
return k - rhs . k ;
}
bool operator = = ( const IntIntPair & rhs ) const {
return k = = rhs . k ;
}
2019-05-29 21:23:32 +08:00
int getCommonPrefixLen ( const IntIntPair & other , int skip ) const {
return 0 ;
}
2019-02-21 18:46:30 +08:00
int deltaSize ( const IntIntPair & base ) const {
return sizeof ( Delta ) ;
}
2019-05-29 21:23:32 +08:00
int writeDelta ( Delta & d , const IntIntPair & base , int commonPrefix = - 1 ) const {
d . dk = k - base . k ;
d . dv = v - base . v ;
return sizeof ( Delta ) ;
2019-02-21 18:46:30 +08:00
}
int k ;
int v ;
std : : string toString ( ) const {
return format ( " {k=%d(0x%x) v = % d ( 0 x % x ) } " , k, k, v, v) ;
}
} ;
2019-05-29 21:23:32 +08:00
int getCommonIntFieldPrefix2 ( const RedwoodRecordRef & a , const RedwoodRecordRef & b ) {
RedwoodRecordRef : : byte aFields [ RedwoodRecordRef : : intFieldArraySize ] ;
RedwoodRecordRef : : byte bFields [ RedwoodRecordRef : : intFieldArraySize ] ;
a . serializeIntFields ( aFields ) ;
b . serializeIntFields ( bFields ) ;
//printf("a: %s\n", StringRef(aFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str());
//printf("b: %s\n", StringRef(bFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str());
int i = 0 ;
while ( i < RedwoodRecordRef : : intFieldArraySize & & aFields [ i ] = = bFields [ i ] ) {
+ + i ;
}
//printf("%d\n", i);
return i ;
}
// Round-trip check for RedwoodRecordRef::Delta: writes the delta of `rec`
// against `base` into a local buffer, applies it back to `base`, and requires
// that the decoded record equals `rec` and that writeDelta() used exactly the
// size predicted by deltaSize(base, false). On failure the details are
// printed and the test asserts.
void deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) {
	char storage[500];
	RedwoodRecordRef::Delta& delta = *(RedwoodRecordRef::Delta*)storage;
	Arena mem;

	int expectedSize = rec.deltaSize(base, false);
	int deltaSize = rec.writeDelta(delta, base);
	RedwoodRecordRef decoded = delta.apply(base, mem);

	const bool failed = decoded != rec || expectedSize != deltaSize;
	if(!failed) {
		return;
	}

	printf(" \n ");
	printf(" Base: %s \n ", base.toString().c_str());
	printf(" ExpectedSize: %d \n ", expectedSize);
	printf(" DeltaSize: %d \n ", deltaSize);
	printf(" Delta: %s \n ", delta.toString().c_str());
	printf(" Record: %s \n ", rec.toString().c_str());
	printf(" Decoded: %s \n ", decoded.toString().c_str());
	printf(" RedwoodRecordRef::Delta test failure! \n ");
	ASSERT(false);
}
2019-05-30 07:26:58 +08:00
// Builds a random RedwoodRecordRef that shares the arena of the random
// KeyValue it was made from:
//   - the key always comes from the KeyValue, the value only when the
//     random01() sample is below .9;
//   - the version is 0 on one side of a coin flip, otherwise a random 64-bit
//     value;
//   - chunk.total/chunk.start are set on one side of a second coin flip.
// NOTE(review): maxKeySize and maxValueSize are currently ignored; the sizes
// passed to randomKV() are fixed at (3, 10).
Standalone<RedwoodRecordRef> randomRedwoodRecordRef(int maxKeySize = 3, int maxValueSize = 255) {
	RedwoodRecordRef rec;
	KeyValue source = randomKV(3, 10);

	rec.key = source.key;
	if(deterministicRandom()->random01() < .9) {
		rec.value = source.value;
	}

	if(deterministicRandom()->coinflip()) {
		rec.version = 0;
	} else {
		rec.version = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
	}

	if(deterministicRandom()->coinflip()) {
		rec.chunk.total = deterministicRandom()->randomInt(1, 100000);
		rec.chunk.start = deterministicRandom()->randomInt(0, rec.chunk.total);
	}

	return Standalone<RedwoodRecordRef>(rec, source.arena());
}
2019-05-29 21:23:32 +08:00
// Unit test for RedwoodRecordRef. In order, it checks:
//   1. storing and reading back a child page ID, including through a Standalone copy;
//   2. Delta encode/decode round trips for a few hand-picked record pairs (deltaTest);
//   3. var-int writing and reading;
//   4. getCommonIntFieldPrefix() against fixed expected values and against getCommonIntFieldPrefix2();
//   5. timing loops for several RedwoodRecordRef operations, each printing a running total and a
//      rate in millions of calls per second;
//   6. deltaTest() on pairs of random records.
// rec1 and rec2 are reused throughout: fields set by an earlier step stay set for later steps,
// so the statements below depend on their order.
TEST_CASE ( " !/redwood/correctness/unit/RedwoodRecordRef " ) {
2019-06-04 19:03:52 +08:00
// Test pageID stuff.
{
2019-09-28 06:08:05 +08:00
LogicalPageID ids [ ] = { 1 , 5 } ;
BTreePageID id ( ids , 2 ) ;
2019-06-04 19:03:52 +08:00
RedwoodRecordRef r ;
2019-09-28 06:08:05 +08:00
r . setChildPage ( id ) ;
// r must report the same page ID and the same begin() pointer as id.
ASSERT ( r . getChildPage ( ) = = id ) ;
ASSERT ( r . getChildPage ( ) . begin ( ) = = id . begin ( ) ) ;
// The Standalone copy must report an equal page ID but a different begin() pointer.
Standalone < RedwoodRecordRef > r2 = r ;
ASSERT ( r2 . getChildPage ( ) = = id ) ;
ASSERT ( r2 . getChildPage ( ) . begin ( ) ! = id . begin ( ) ) ;
2019-06-04 19:03:52 +08:00
}
2019-05-29 21:23:32 +08:00
// Testing common prefix calculation for integer fields using the member function that calculates this directly
// and by serializing the integer fields to arrays and finding the common prefix length of the two arrays
// The deltaTest() calls that follow first check Delta round trips: each call encodes the first record against the second and decodes it again.
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 )
) ;
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 )
) ;
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " abcd " ) , 0 , LiteralStringRef ( " " ) , 0 , 0 )
) ;
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 2 , LiteralStringRef ( " " ) , 0 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 2 , LiteralStringRef ( " " ) , 0 , 0 )
) ;
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 2 , LiteralStringRef ( " " ) , 0 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " ab " ) , 2 , LiteralStringRef ( " " ) , 1 , 3 )
) ;
deltaTest ( RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 2 , LiteralStringRef ( " " ) , 5 , 0 ) ,
RedwoodRecordRef ( LiteralStringRef ( " abc " ) , 2 , LiteralStringRef ( " " ) , 5 , 1 )
) ;
2019-05-30 07:47:53 +08:00
// Var-int round trip: write three values into a buffer and read them back in the same order.
RedwoodRecordRef : : byte varInts [ 100 ] ;
RedwoodRecordRef : : Writer w ( varInts ) ;
RedwoodRecordRef : : Reader r ( varInts ) ;
w . writeVarInt ( 1 ) ;
w . writeVarInt ( 128 ) ;
w . writeVarInt ( 32000 ) ;
ASSERT ( r . readVarInt ( ) = = 1 ) ;
ASSERT ( r . readVarInt ( ) = = 128 ) ;
ASSERT ( r . readVarInt ( ) = = 32000 ) ;
2019-05-29 21:23:32 +08:00
// Common integer-field prefix checks. Each step changes some fields, asserts the expected
// prefix length, then asserts agreement with the serialize-and-compare helper.
RedwoodRecordRef rec1 ;
RedwoodRecordRef rec2 ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12995678 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 5 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12345678 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 14 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = invalidVersion ;
rec2 . version = 0 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 0 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12345678 ;
rec1 . chunk . total = 4 ;
rec2 . chunk . total = 4 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 14 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12345678 ;
rec1 . chunk . start = 4 ;
rec2 . chunk . start = 4 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 14 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12345678 ;
rec1 . chunk . start = 4 ;
rec2 . chunk . start = 5 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 13 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
rec1 . version = 0x12345678 ;
rec2 . version = 0x12345678 ;
rec1 . chunk . total = 256 ;
rec2 . chunk . total = 512 ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = 9 ) ;
ASSERT ( rec1 . getCommonIntFieldPrefix ( rec2 ) = = getCommonIntFieldPrefix2 ( rec1 , rec2 ) ) ;
// Timing section. Each loop runs `count` iterations and accumulates the call results into
// `total`; the printf after it reports total and count / elapsed seconds / 1e6.
// NOTE(review): `total` is uint64_t but is printed with PRId64 below.
Arena mem ;
double start ;
uint64_t total ;
uint64_t count ;
uint64_t i ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 1e9 ;
2019-05-29 21:23:32 +08:00
for ( i = 0 ; i < count ; + + i ) {
rec1 . chunk . total = i & 0xffffff ;
rec2 . chunk . total = i & 0xffffff ;
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
total + = rec1 . getCommonIntFieldPrefix ( rec2 ) ;
}
printf ( " % " PRId64 " getCommonIntFieldPrefix() %g M/s \n " , total , count / ( timer ( ) - start ) / 1e6 ) ;
// Both records get the same key for the remaining timing loops.
rec1 . key = LiteralStringRef ( " alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf " ) ;
rec2 . key = LiteralStringRef ( " alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf " ) ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 1e9 ;
2019-05-29 21:23:32 +08:00
for ( i = 0 ; i < count ; + + i ) {
RedwoodRecordRef : : byte fields [ RedwoodRecordRef : : intFieldArraySize ] ;
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
rec1 . serializeIntFields ( fields ) ;
// Fold the last serialized byte into total.
total + = fields [ RedwoodRecordRef : : intFieldArraySize - 1 ] ;
}
printf ( " % " PRId64 " serializeIntFields() %g M/s \n " , total , count / ( timer ( ) - start ) / 1e6 ) ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 100e6 ;
2019-05-29 21:23:32 +08:00
for ( i = 0 ; i < count ; + + i ) {
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
total + = rec1 . getCommonPrefixLen ( rec2 , 50 ) ;
}
printf ( " % " PRId64 " getCommonPrefixLen(skip=50) %g M/s \n " , total , count / ( timer ( ) - start ) / 1e6 ) ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 100e6 ;
2019-05-29 21:23:32 +08:00
for ( i = 0 ; i < count ; + + i ) {
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
total + = rec1 . getCommonPrefixLen ( rec2 , 0 ) ;
}
printf ( " % " PRId64 " getCommonPrefixLen(skip=0) %g M/s \n " , total , count / ( timer ( ) - start ) / 1e6 ) ;
// Scratch buffer that the Delta reference d is overlaid on for the writeDelta() timings.
char buf [ 1000 ] ;
RedwoodRecordRef : : Delta & d = * ( RedwoodRecordRef : : Delta * ) buf ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 100e6 ;
2019-05-29 21:23:32 +08:00
// commonPrefix is computed once, before the loop changes chunk.start, and passed to every call.
int commonPrefix = rec1 . getCommonPrefixLen ( rec2 , 0 ) ;
for ( i = 0 ; i < count ; + + i ) {
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
total + = rec1 . writeDelta ( d , rec2 , commonPrefix ) ;
}
printf ( " % " PRId64 " writeDelta(commonPrefix=%d) %g M/s \n " , total , commonPrefix , count / ( timer ( ) - start ) / 1e6 ) ;
start = timer ( ) ;
total = 0 ;
2019-05-30 07:26:58 +08:00
count = 10e6 ;
2019-05-29 21:23:32 +08:00
for ( i = 0 ; i < count ; + + i ) {
rec1 . chunk . start = i & 0xffffff ;
rec2 . chunk . start = ( i + 1 ) & 0xffffff ;
total + = rec1 . writeDelta ( d , rec2 ) ;
}
printf ( " % " PRId64 " writeDelta() %g M/s \n " , total , count / ( timer ( ) - start ) / 1e6 ) ;
2019-05-30 07:26:58 +08:00
// Randomized Delta round trips: deltaTest() asserts on the first failing pair.
start = timer ( ) ;
total = 0 ;
2019-05-30 07:47:53 +08:00
count = 1e6 ;
2019-05-30 07:26:58 +08:00
for ( i = 0 ; i < count ; + + i ) {
Standalone < RedwoodRecordRef > a = randomRedwoodRecordRef ( ) ;
Standalone < RedwoodRecordRef > b = randomRedwoodRecordRef ( ) ;
deltaTest ( a , b ) ;
}
printf ( " Random deltaTest() %g M/s \n " , count / ( timer ( ) - start ) / 1e6 ) ;
2019-05-29 21:23:32 +08:00
return Void ( ) ;
}
// Correctness test for DeltaTree<RedwoodRecordRef>.
//
// Builds a tree from N randomly generated, sorted records and verifies that
//  - a forward cursor, a reverse cursor and a forward value-only cursor visit the items in the expected order
//  - seekLessThanOrEqual() called with an existing item as the query lands on exactly that item
// The elapsed time of the seek loop is printed at the end.
TEST_CASE(" !/redwood/correctness/unit/deltaTree/RedwoodRecordRef ") {
	const int N = 200;

	// Boundary records passed to build() and to the Readers
	RedwoodRecordRef prev;
	RedwoodRecordRef next(LiteralStringRef(" \xff \xff \xff \xff "));

	Arena arena;
	std::vector<RedwoodRecordRef> items;
	for (int i = 0; i < N; ++i) {
		std::string k = deterministicRandom()->randomAlphaNumeric(30);
		std::string v = deterministicRandom()->randomAlphaNumeric(30);

		RedwoodRecordRef rec;
		rec.key = StringRef(arena, k);
		rec.version = deterministicRandom()->coinflip() ? deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max()) : invalidVersion;

		// Randomly give the record a value, and randomly make that value look like a chunk of a larger value
		if (deterministicRandom()->coinflip()) {
			rec.value = StringRef(arena, v);
			if (deterministicRandom()->coinflip()) {
				rec.chunk.start = deterministicRandom()->randomInt(0, 100000);
				rec.chunk.total = rec.chunk.start + v.size() + deterministicRandom()->randomInt(0, 100000);
			}
		}

		items.push_back(rec);
		//printf("i=%d %s\n", i, items.back().toString().c_str());
	}
	std::sort(items.begin(), items.end());

	// Backing storage for the tree. It is owned by a vector so that it is released when the test
	// returns or unwinds instead of being leaked.
	std::vector<uint8_t> treeStorage(N * 100);
	DeltaTree<RedwoodRecordRef> *tree = (DeltaTree<RedwoodRecordRef> *)treeStorage.data();

	// The end of the input range is formed with pointer arithmetic; items[items.size()] would index out of bounds.
	tree->build(items.data(), items.data() + items.size(), &prev, &next);

	printf(" Count=%d Size=%d InitialDepth=%d \n ", (int)items.size(), (int)tree->size(), (int)tree->initialDepth);
	debug_printf(" Data(%p): %s \n ", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str());

	DeltaTree<RedwoodRecordRef>::Reader r(tree, &prev, &next);
	DeltaTree<RedwoodRecordRef>::Cursor fwd = r.getCursor();
	DeltaTree<RedwoodRecordRef>::Cursor rev = r.getCursor();

	DeltaTree<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::Reader rValuesOnly(tree, &prev, &next);
	DeltaTree<RedwoodRecordRef, RedwoodRecordRef::DeltaValueOnly>::Cursor fwdValueOnly = rValuesOnly.getCursor();

	ASSERT(fwd.moveFirst());
	ASSERT(fwdValueOnly.moveFirst());
	ASSERT(rev.moveLast());

	// Walk all three cursors in lock step and compare each against the sorted input
	int i = 0;
	while (1) {
		if (fwd.get() != items[i]) {
			printf(" forward iterator i=%d \n %s found \n %s expected \n ", i, fwd.get().toString().c_str(), items[i].toString().c_str());
			printf(" Delta: %s \n ", fwd.node->raw->delta().toString().c_str());
			ASSERT(false);
		}
		if (rev.get() != items[items.size() - 1 - i]) {
			printf(" reverse iterator i=%d \n %s found \n %s expected \n ", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str());
			printf(" Delta: %s \n ", rev.node->raw->delta().toString().c_str());
			ASSERT(false);
		}
		if (fwdValueOnly.get().value != items[i].value) {
			printf(" forward values-only iterator i=%d \n %s found \n %s expected \n ", i, fwdValueOnly.get().toString().c_str(), items[i].toString().c_str());
			printf(" Delta: %s \n ", fwdValueOnly.node->raw->delta().toString().c_str());
			ASSERT(false);
		}
		++i;

		// All cursors must run out of items at the same step
		bool more = fwd.moveNext();
		ASSERT(fwdValueOnly.moveNext() == more);
		ASSERT(rev.movePrev() == more);

		ASSERT(fwd.valid() == more);
		ASSERT(fwdValueOnly.valid() == more);
		ASSERT(rev.valid() == more);

		if (!fwd.valid()) {
			break;
		}
	}
	ASSERT(i == items.size());

	// Timed seek loop: every query is an existing item, so the seek must land exactly on it
	double start = timer();
	DeltaTree<RedwoodRecordRef>::Cursor c = r.getCursor();
	for (int seek = 0; seek < 20000000; ++seek) {
		const RedwoodRecordRef &query = items[deterministicRandom()->randomInt(0, items.size())];
		if (!c.seekLessThanOrEqual(query)) {
			printf(" Not found! query=%s \n ", query.toString().c_str());
			ASSERT(false);
		}
		if (c.get() != query) {
			printf(" Found incorrect node! query=%s found=%s \n ", query.toString().c_str(), c.get().toString().c_str());
			ASSERT(false);
		}
	}
	double elapsed = timer() - start;
	printf(" Elapsed %f \n ", elapsed);

	return Void();
}
2019-05-29 21:23:32 +08:00
// Correctness test for DeltaTree<IntIntPair>.
//
// Builds a tree from N pairs {i * 10, i * 1000}, checks forward and reverse iteration against the
// input, then runs a timed loop of seekLessThanOrEqual() calls with random keys and checks that
// each seek lands on the item whose key is the query key rounded down to a multiple of 10.
TEST_CASE(" !/redwood/correctness/unit/deltaTree/IntIntPair ") {
	const int N = 200;

	// Boundary records passed to build() and to the Reader.
	// NOTE(review): the generated keys run up to (N - 1) * 10, which is beyond next.k == 1000 — confirm this is intended.
	IntIntPair prev = {0, 0};
	IntIntPair next = {1000, 0};

	std::vector<IntIntPair> items;
	for (int i = 0; i < N; ++i) {
		items.push_back({i * 10, i * 1000});
		//printf("i=%d %s\n", i, items.back().toString().c_str());
	}

	// Backing storage for the tree. It is owned by a vector so that it is released when the test
	// returns or unwinds instead of being leaked.
	std::vector<uint8_t> treeStorage(10000);
	DeltaTree<IntIntPair> *tree = (DeltaTree<IntIntPair> *)treeStorage.data();

	// The end of the input range is formed with pointer arithmetic; items[items.size()] would index out of bounds.
	tree->build(items.data(), items.data() + items.size(), &prev, &next);

	printf(" Count=%d Size=%d InitialDepth=%d \n ", (int)items.size(), (int)tree->size(), (int)tree->initialDepth);
	debug_printf(" Data(%p): %s \n ", tree, StringRef((uint8_t *)tree, tree->size()).toHexString().c_str());

	DeltaTree<IntIntPair>::Reader r(tree, &prev, &next);
	DeltaTree<IntIntPair>::Cursor fwd = r.getCursor();
	DeltaTree<IntIntPair>::Cursor rev = r.getCursor();

	ASSERT(fwd.moveFirst());
	ASSERT(rev.moveLast());

	// Walk both cursors in lock step and compare each against the input
	int i = 0;
	while (1) {
		if (fwd.get() != items[i]) {
			printf(" forward iterator i=%d \n %s found \n %s expected \n ", i, fwd.get().toString().c_str(), items[i].toString().c_str());
			ASSERT(false);
		}
		if (rev.get() != items[items.size() - 1 - i]) {
			printf(" reverse iterator i=%d \n %s found \n %s expected \n ", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str());
			ASSERT(false);
		}
		++i;

		// Both cursors must run out of items at the same step
		ASSERT(fwd.moveNext() == rev.movePrev());
		ASSERT(fwd.valid() == rev.valid());
		if (!fwd.valid()) {
			break;
		}
	}
	ASSERT(i == items.size());

	// Timed seek loop with random keys
	DeltaTree<IntIntPair>::Cursor c = r.getCursor();
	double start = timer();
	for (int seek = 0; seek < 20000000; ++seek) {
		IntIntPair p({deterministicRandom()->randomInt(0, items.size() * 10), 0});
		if (!c.seekLessThanOrEqual(p)) {
			printf(" Not found! query=%s \n ", p.toString().c_str());
			ASSERT(false);
		}
		// Item keys are multiples of 10, so the expected hit is the query key rounded down to a multiple of 10
		if (c.get().k != (p.k - (p.k % 10))) {
			printf(" Found incorrect node! query=%s found=%s \n ", p.toString().c_str(), c.get().toString().c_str());
			ASSERT(false);
		}
	}
	double elapsed = timer() - start;
	printf(" Elapsed %f \n ", elapsed);

	return Void();
}
struct SimpleCounter {
SimpleCounter ( ) : x ( 0 ) , xt ( 0 ) , t ( timer ( ) ) , start ( t ) { }
void operator + = ( int n ) { x + = n ; }
void operator + + ( ) { x + + ; }
int64_t get ( ) { return x ; }
double rate ( ) {
double t2 = timer ( ) ;
int r = ( x - xt ) / ( t2 - t ) ;
xt = x ;
t = t2 ;
return r ;
}
double avgRate ( ) { return x / ( timer ( ) - start ) ; }
int64_t x ;
double t ;
double start ;
int64_t xt ;
2019-05-29 21:23:32 +08:00
std : : string toString ( ) { return format ( " % " PRId64 " /%.2f/%.2f " , x , rate ( ) / 1e6 , avgRate ( ) / 1e6 ) ; }
2019-02-21 18:46:30 +08:00
} ;
2019-05-29 21:23:32 +08:00
TEST_CASE ( " !/redwood/correctness/btree " ) {
2019-08-07 17:36:33 +08:00
state std : : string pagerFile = " unittest_pageFile.redwood " ;
IPager2 * pager ;
2018-08-29 04:46:14 +08:00
2019-06-25 11:17:49 +08:00
state bool serialTest = deterministicRandom ( ) - > coinflip ( ) ;
state bool shortTest = deterministicRandom ( ) - > coinflip ( ) ;
2019-05-22 10:16:32 +08:00
state bool singleVersion = true ; // Multi-version mode is broken / not finished
2019-08-07 17:36:33 +08:00
state int pageSize = shortTest ? 200 : ( deterministicRandom ( ) - > coinflip ( ) ? 4096 : deterministicRandom ( ) - > randomInt ( 200 , 400 ) ) ;
2017-09-06 07:59:31 +08:00
// We must be able to fit at least two any two keys plus overhead in a page to prevent
// a situation where the tree cannot be grown upward with decreasing level size.
2019-06-25 11:17:49 +08:00
state int maxKeySize = deterministicRandom ( ) - > randomInt ( 4 , pageSize * 2 ) ;
state int maxValueSize = deterministicRandom ( ) - > randomInt ( 0 , pageSize * 4 ) ;
2019-08-09 14:08:08 +08:00
state int maxCommitSize = shortTest ? 1000 : randomSize ( std : : min < int > ( ( maxKeySize + maxValueSize ) * 20000 , 10e6 ) ) ;
state int mutationBytesTarget = shortTest ? 5000 : randomSize ( std : : min < int > ( maxCommitSize * 100 , 100e6 ) ) ;
state double clearProbability = deterministicRandom ( ) - > random01 ( ) * .1 ;
2019-10-28 19:00:37 +08:00
state double clearPostSetProbability = deterministicRandom ( ) - > random01 ( ) * .1 ;
2019-08-09 14:08:08 +08:00
state double coldStartProbability = deterministicRandom ( ) - > random01 ( ) ;
2019-10-18 16:27:00 +08:00
state double advanceOldVersionProbability = deterministicRandom ( ) - > random01 ( ) ;
2019-08-09 14:08:08 +08:00
state double maxWallClockDuration = 60 ;
printf ( " \n " ) ;
printf ( " serialTest: %d \n " , serialTest ) ;
printf ( " shortTest: %d \n " , shortTest ) ;
printf ( " singleVersion: %d \n " , serialTest ) ;
printf ( " pageSize: %d \n " , pageSize ) ;
printf ( " maxKeySize: %d \n " , maxKeySize ) ;
printf ( " maxValueSize: %d \n " , maxValueSize ) ;
printf ( " maxCommitSize: %d \n " , maxCommitSize ) ;
printf ( " mutationBytesTarget: %d \n " , mutationBytesTarget ) ;
printf ( " clearProbability: %f \n " , clearProbability ) ;
2019-10-28 19:00:37 +08:00
printf ( " clearPostSetProbability: %f \n " , clearPostSetProbability ) ;
2019-08-09 14:08:08 +08:00
printf ( " coldStartProbability: %f \n " , coldStartProbability ) ;
2019-10-18 16:27:00 +08:00
printf ( " advanceOldVersionProbability: %f \n " , advanceOldVersionProbability ) ;
2019-08-09 14:08:08 +08:00
printf ( " \n " ) ;
printf ( " Deleting existing test data... \n " ) ;
deleteFile ( pagerFile ) ;
2019-02-21 18:46:30 +08:00
2019-08-09 14:08:08 +08:00
printf ( " Initializing... \n " ) ;
state double startTime = timer ( ) ;
2019-11-04 19:04:03 +08:00
pager = new DWALPager ( pageSize , pagerFile , 0 ) ;
2019-08-09 14:08:08 +08:00
state VersionedBTree * btree = new VersionedBTree ( pager , pagerFile , singleVersion ) ;
wait ( btree - > init ( ) ) ;
2017-09-06 07:59:31 +08:00
2017-08-22 13:29:57 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > written ;
state std : : set < Key > keys ;
2017-06-10 05:56:41 +08:00
2019-10-23 08:17:29 +08:00
state Version lastVer = btree - > getLatestVersion ( ) ;
2019-05-05 01:52:02 +08:00
printf ( " Starting from version: % " PRId64 " \n " , lastVer ) ;
2017-07-14 13:11:48 +08:00
2017-06-10 05:56:41 +08:00
state Version version = lastVer + 1 ;
2018-09-28 07:07:29 +08:00
btree - > setWriteVersion ( version ) ;
2018-09-28 15:35:03 +08:00
2019-02-21 18:46:30 +08:00
state SimpleCounter mutationBytes ;
state SimpleCounter keyBytesInserted ;
state SimpleCounter valueBytesInserted ;
state SimpleCounter sets ;
state SimpleCounter rangeClears ;
state SimpleCounter keyBytesCleared ;
2018-09-28 15:35:03 +08:00
state int errorCount ;
2019-02-24 19:47:32 +08:00
state int mutationBytesThisCommit = 0 ;
state int mutationBytesTargetThisCommit = randomSize ( maxCommitSize ) ;
2017-07-26 07:10:19 +08:00
2018-09-28 15:35:03 +08:00
state PromiseStream < Version > committedVersions ;
2019-05-22 10:16:32 +08:00
state Future < Void > verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
state Future < Void > randomTask = serialTest ? Void ( ) : ( randomReader ( btree ) | | btree - > getError ( ) ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
state Future < Void > commit = Void ( ) ;
2019-08-09 14:08:08 +08:00
while ( mutationBytes . get ( ) < mutationBytesTarget & & ( timer ( ) - startTime ) < maxWallClockDuration ) {
2019-05-22 13:19:14 +08:00
if ( now ( ) - startTime > 600 ) {
mutationBytesTarget = mutationBytes . get ( ) ;
}
2018-09-28 07:07:29 +08:00
// Sometimes advance the version
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < 0.10 ) {
2017-07-15 02:37:08 +08:00
+ + version ;
2017-06-10 05:56:41 +08:00
btree - > setWriteVersion ( version ) ;
}
2018-09-28 07:07:29 +08:00
// Sometimes do a clear range
2019-08-09 14:08:08 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < clearProbability ) {
2018-09-28 07:07:29 +08:00
Key start = randomKV ( maxKeySize , 1 ) . key ;
2019-05-11 05:01:52 +08:00
Key end = ( deterministicRandom ( ) - > random01 ( ) < .01 ) ? keyAfter ( start ) : randomKV ( maxKeySize , 1 ) . key ;
2017-06-10 05:56:41 +08:00
2018-09-28 07:07:29 +08:00
// Sometimes replace start and/or end with a close actual (previously used) value
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .10 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( start ) ;
if ( i ! = keys . end ( ) )
start = * i ;
}
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .10 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( end ) ;
if ( i ! = keys . end ( ) )
end = * i ;
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
if ( end = = start )
end = keyAfter ( start ) ;
else if ( end < start ) {
std : : swap ( end , start ) ;
}
2017-07-14 13:11:48 +08:00
2019-02-21 18:46:30 +08:00
+ + rangeClears ;
2018-09-28 07:07:29 +08:00
KeyRangeRef range ( start , end ) ;
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Clear '%s' to '%s' @% " PRId64 " \n " , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , version ) ;
2018-09-28 07:07:29 +08:00
auto e = written . lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
if ( e ! = written . end ( ) ) {
auto last = e ;
auto eEnd = written . lower_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
while ( e ! = eEnd ) {
auto w = * e ;
+ + e ;
// If e key is different from last and last was present then insert clear for last's key at version
if ( last ! = eEnd & & ( ( e = = eEnd | | e - > first . first ! = last - > first . first ) & & last - > second . present ( ) ) ) {
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Clearing key '%s' @% " PRId64 " \n " , last - > first . first . c_str ( ) , version ) ;
2019-02-21 18:46:30 +08:00
keyBytesCleared + = last - > first . first . size ( ) ;
2019-02-24 19:47:32 +08:00
mutationBytes + = last - > first . first . size ( ) ;
mutationBytesThisCommit + = last - > first . first . size ( ) ;
2019-02-21 18:46:30 +08:00
2018-09-28 07:07:29 +08:00
// If the last set was at version then just make it not present
if ( last - > first . second = = version ) {
2019-02-21 18:46:30 +08:00
last - > second . reset ( ) ;
2018-09-28 07:07:29 +08:00
}
else {
2019-02-21 18:46:30 +08:00
written [ std : : make_pair ( last - > first . first , version ) ] . reset ( ) ;
2018-09-28 07:07:29 +08:00
}
}
last = e ;
}
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
btree - > clear ( range ) ;
2019-10-28 19:00:37 +08:00
// Sometimes set the range start after the clear
if ( deterministicRandom ( ) - > random01 ( ) < clearPostSetProbability ) {
KeyValue kv = randomKV ( 0 , maxValueSize ) ;
kv . key = range . begin ;
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
}
// Sometimes set the range end after the clear
if ( deterministicRandom ( ) - > random01 ( ) < clearPostSetProbability ) {
KeyValue kv = randomKV ( 0 , maxValueSize ) ;
kv . key = range . end ;
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
}
2017-07-14 13:11:48 +08:00
}
2018-09-28 07:07:29 +08:00
else {
// Set a key
KeyValue kv = randomKV ( maxKeySize , maxValueSize ) ;
// Sometimes change key to a close previously used key
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .01 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( kv . key ) ;
if ( i ! = keys . end ( ) )
kv . key = StringRef ( kv . arena ( ) , * i ) ;
}
2019-02-21 18:46:30 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Set '%s' -> '%s' @% " PRId64 " \n " , kv . key . toString ( ) . c_str ( ) , kv . value . toString ( ) . c_str ( ) , version ) ;
2019-02-21 18:46:30 +08:00
+ + sets ;
2018-09-28 07:07:29 +08:00
keyBytesInserted + = kv . key . size ( ) ;
2019-02-21 18:46:30 +08:00
valueBytesInserted + = kv . value . size ( ) ;
2018-09-28 07:07:29 +08:00
mutationBytes + = ( kv . key . size ( ) + kv . value . size ( ) ) ;
2019-02-24 19:47:32 +08:00
mutationBytesThisCommit + = ( kv . key . size ( ) + kv . value . size ( ) ) ;
2019-02-21 18:46:30 +08:00
2018-09-28 07:07:29 +08:00
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
keys . insert ( kv . key ) ;
}
2019-02-24 19:47:32 +08:00
// Commit at end or after this commit's mutation bytes are reached
if ( mutationBytes . get ( ) > = mutationBytesTarget | | mutationBytesThisCommit > = mutationBytesTargetThisCommit ) {
// Wait for previous commit to finish
wait ( commit ) ;
2019-05-29 21:23:32 +08:00
printf ( " Committed. Next commit %d bytes, % " PRId64 " /%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f \n " ,
2019-02-24 19:47:32 +08:00
mutationBytesThisCommit ,
mutationBytes . get ( ) ,
mutationBytesTarget ,
( double ) mutationBytes . get ( ) / mutationBytesTarget * 100 ,
( keyBytesInserted . rate ( ) + valueBytesInserted . rate ( ) ) / 1e6 ,
keyBytesCleared . rate ( ) / 1e6 ,
mutationBytes . rate ( ) / 1e6
) ;
Version v = version ; // Avoid capture of version as a member of *this
2019-10-18 16:27:00 +08:00
// Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random amount.
if ( deterministicRandom ( ) - > random01 ( ) < advanceOldVersionProbability ) {
btree - > setOldestVersion ( btree - > getLastCommittedVersion ( ) - deterministicRandom ( ) - > randomInt ( 0 , btree - > getLastCommittedVersion ( ) - btree - > getOldestVersion ( ) + 1 ) ) ;
}
2019-02-24 19:47:32 +08:00
commit = map ( btree - > commit ( ) , [ = ] ( Void ) {
2019-08-07 17:36:33 +08:00
printf ( " Committed: %s \n " , VersionedBTree : : counts . toString ( true ) . c_str ( ) ) ;
2018-10-02 07:51:57 +08:00
// Notify the background verifier that version is committed and therefore readable
committedVersions . send ( v ) ;
return Void ( ) ;
} ) ;
2019-05-22 10:16:32 +08:00
if ( serialTest ) {
// Wait for commit, wait for verification, then start new verification
wait ( commit ) ;
committedVersions . sendError ( end_of_stream ( ) ) ;
debug_printf ( " Waiting for verification to complete. \n " ) ;
wait ( verifyTask ) ;
committedVersions = PromiseStream < Version > ( ) ;
verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
}
2019-02-24 19:47:32 +08:00
mutationBytesThisCommit = 0 ;
mutationBytesTargetThisCommit = randomSize ( maxCommitSize ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
// Recover from disk at random
2019-08-09 14:08:08 +08:00
if ( ! serialTest & & deterministicRandom ( ) - > random01 ( ) < coldStartProbability ) {
2019-10-15 18:10:50 +08:00
printf ( " Recovering from disk after next commit. \n " ) ;
2018-10-02 07:51:57 +08:00
// Wait for outstanding commit
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
// Stop and wait for the verifier task
committedVersions . sendError ( end_of_stream ( ) ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-28 07:07:29 +08:00
wait ( closedFuture ) ;
2019-10-15 18:10:50 +08:00
printf ( " Reopening btree from disk. \n " ) ;
2019-11-04 19:04:03 +08:00
IPager2 * pager = new DWALPager ( pageSize , pagerFile , 0 ) ;
2019-08-07 17:36:33 +08:00
btree = new VersionedBTree ( pager , pagerFile , singleVersion ) ;
2018-09-28 07:07:29 +08:00
wait ( btree - > init ( ) ) ;
2019-10-23 08:17:29 +08:00
Version v = btree - > getLatestVersion ( ) ;
2018-09-28 07:07:29 +08:00
ASSERT ( v = = version ) ;
2019-05-05 01:52:02 +08:00
printf ( " Recovered from disk. Latest version % " PRId64 " \n " , v ) ;
2017-06-10 05:56:41 +08:00
2018-09-28 15:35:03 +08:00
// Create new promise stream and start the verifier again
committedVersions = PromiseStream < Version > ( ) ;
2019-05-22 10:16:32 +08:00
verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
2018-10-15 18:43:43 +08:00
randomTask = randomReader ( btree ) | | btree - > getError ( ) ;
2017-06-10 05:56:41 +08:00
}
2018-09-28 15:35:03 +08:00
+ + version ;
btree - > setWriteVersion ( version ) ;
2018-09-28 07:07:29 +08:00
}
2019-02-24 19:47:32 +08:00
// Check for errors
if ( errorCount ! = 0 )
throw internal_error ( ) ;
2017-06-10 05:56:41 +08:00
}
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
committedVersions . sendError ( end_of_stream ( ) ) ;
2019-11-04 19:04:03 +08:00
randomTask . cancel ( ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2019-02-24 19:47:32 +08:00
// Check for errors
if ( errorCount ! = 0 )
throw internal_error ( ) ;
2019-10-29 16:31:59 +08:00
wait ( btree - > destroyAndCheckSanity ( ) ) ;
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2019-11-04 19:04:03 +08:00
debug_printf ( " Closing. \n " ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-07-14 13:11:48 +08:00
return Void ( ) ;
}
2019-08-07 17:36:33 +08:00
// Performs `count` seeks against the latest version of `btree`, each to a key produced by
// randomString(20, firstChar, lastChar), then prints the achieved seek rate.
// Yields before every seek.
ACTOR Future<Void> randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) {
	state Version snapshotVersion = btree->getLatestVersion();
	state int seeksDone = 0;
	state double beginTime = timer();

	printf(" Executing %d random seeks \n ", count);

	// A single cursor at the snapshot version is reused for every seek
	state Reference<IStoreCursor> cursor = btree->readAtVersion(snapshotVersion);

	while (seeksDone < count) {
		wait(yield());
		state Key target = randomString(20, firstChar, lastChar);
		wait(success(cursor->findFirstEqualOrGreater(target, false, 0)));
		++seeksDone;
	}

	double secondsTaken = timer() - beginTime;
	printf(" Random seek speed %d/s \n ", int(count / secondsTaken));

	return Void();
}
// Minimal DWALPager smoke test: allocates one page, fills it, sets a meta key, commits, reads the
// page back, prints its contents as hex, and closes the pager.
TEST_CASE(" !/redwood/correctness/pager/cow ") {
	state std::string pagerFile = " unittest_pageFile.redwood ";

	printf(" Deleting old test data \n ");
	deleteFile(pagerFile);

	const int pageSize = 4096;
	state IPager2 *pager = new DWALPager(pageSize, pagerFile, 0);
	wait(success(pager->init()));

	state LogicalPageID id = wait(pager->newPageID());

	// Fill a new page buffer with the page ID cast to char, then write it and a meta key
	Reference<IPage> page = pager->newPageBuffer();
	memset(page->mutate(), (char)id, page->size());
	pager->updatePage(id, page);
	pager->setMetaKey(LiteralStringRef(" asdfasdf "));
	wait(pager->commit());

	// Read the page back and dump it
	Reference<IPage> readBack = wait(pager->readPage(id, true));
	printf(" %s \n ", StringRef(readBack->begin(), readBack->size()).toHexString().c_str());

	// TODO: Verify reads, do more writes and reads to make this a real pager validator

	Future<Void> closed = pager->onClosed();
	pager->close();
	wait(closed);

	return Void();
}
2018-10-06 13:13:22 +08:00
// Write-throughput test for VersionedBTree on a DWALPager.
//
// Starts from a freshly deleted pager file, then repeatedly sets batches of key/value pairs until
// kvBytesTotal reaches kvBytesTarget. Keys are a random string whose last sizeof(uint32_t) bytes are
// overwritten with a big-endian index; consecutive indexes are written in short runs. A commit is
// started each time the bytes written since the previous commit reach commitTarget, and throughput
// figures are printed per commit and cumulatively. Finally three randomSeeks() actors are run
// together and the btree is closed.
TEST_CASE ( " !/redwood/performance/set " ) {
2019-08-07 17:36:33 +08:00
state std : : string pagerFile = " unittest_pageFile.redwood " ;
2019-02-24 19:47:32 +08:00
printf ( " Deleting old test data \n " ) ;
2018-10-15 18:43:43 +08:00
deleteFile ( pagerFile ) ;
2019-04-30 08:00:29 +08:00
2019-08-07 17:36:33 +08:00
int pageSize = 4096 ;
2019-11-04 19:04:03 +08:00
// The third DWALPager argument is derived from the PAGE_CACHE_4K knob divided by the page size
// (presumably a page cache size expressed in pages — TODO confirm against DWALPager's constructor).
IPager2 * pager = new DWALPager ( pageSize , pagerFile , FLOW_KNOBS - > PAGE_CACHE_4K / pageSize ) ;
2019-04-30 08:00:29 +08:00
state bool singleVersion = true ;
2019-08-07 17:36:33 +08:00
state VersionedBTree * btree = new VersionedBTree ( pager , pagerFile , singleVersion ) ;
2018-09-20 18:39:55 +08:00
wait ( btree - > init ( ) ) ;
2017-07-14 13:11:48 +08:00
2019-03-15 15:46:09 +08:00
// Workload parameters
// nodeCount:            exclusive upper limit passed to randomInt() when choosing the index written into each key's suffix
// maxChangesPerVersion: limit passed to randomInt() when choosing how many sets to do per version
// kvBytesTarget:        total key+value bytes to write before the test stops
// commitTarget:         key+value bytes accumulated before a commit is started
state int nodeCount = 1e9 ;
2019-08-07 17:36:33 +08:00
state int maxChangesPerVersion = 5000 ;
state int64_t kvBytesTarget = 4000e6 ;
state int commitTarget = 20e6 ;
state int maxKeyPrefixSize = 25 ;
state int maxValueSize = 500 ;
state int maxConsecutiveRun = 10 ;
state int minValueSize = 0 ;
state char firstKeyChar = ' a ' ;
state char lastKeyChar = ' b ' ;
2018-06-15 08:52:25 +08:00
// kvBytes / records count what has been set since the last commit was started;
// kvBytesTotal accumulates kvBytes each time a commit is started.
state int64_t kvBytes = 0 ;
2019-03-15 15:46:09 +08:00
state int64_t kvBytesTotal = 0 ;
2018-06-15 08:52:25 +08:00
state int records = 0 ;
2018-10-02 07:51:57 +08:00
state Future < Void > commit = Void ( ) ;
2019-06-24 16:05:16 +08:00
// Source buffer for values: every value written is a prefix of this string
state std : : string value ( maxValueSize , ' v ' ) ;
2017-07-14 13:11:48 +08:00
2019-02-24 19:47:32 +08:00
printf ( " Starting. \n " ) ;
state double intervalStart = timer ( ) ;
2019-06-24 16:05:16 +08:00
state double start = intervalStart ;
2019-02-24 19:47:32 +08:00
2019-03-15 15:46:09 +08:00
while ( kvBytesTotal < kvBytesTarget ) {
2019-08-07 17:36:33 +08:00
wait ( yield ( ) ) ;
2019-10-23 08:17:29 +08:00
// Each pass of the outer loop writes at one new version, one past the btree's latest version
Version lastVer = btree - > getLatestVersion ( ) ;
2017-07-14 13:11:48 +08:00
state Version version = lastVer + 1 ;
btree - > setWriteVersion ( version ) ;
2019-05-11 05:01:52 +08:00
int changes = deterministicRandom ( ) - > randomInt ( 0 , maxChangesPerVersion ) ;
2019-06-24 16:05:16 +08:00
2019-08-07 17:36:33 +08:00
while ( changes > 0 & & kvBytes < commitTarget ) {
2017-07-14 13:11:48 +08:00
KeyValue kv ;
2019-08-07 17:36:33 +08:00
// Random key allocated in kv's arena; its length always leaves room for the uint32_t suffix written below
kv . key = randomString ( kv . arena ( ) , deterministicRandom ( ) - > randomInt ( sizeof ( uint32_t ) , maxKeyPrefixSize + sizeof ( uint32_t ) + 1 ) , firstKeyChar , lastKeyChar ) ;
2019-06-25 11:17:49 +08:00
int32_t index = deterministicRandom ( ) - > randomInt ( 0 , nodeCount ) ;
int runLength = deterministicRandom ( ) - > randomInt ( 1 , maxConsecutiveRun + 1 ) ;
2019-06-24 16:05:16 +08:00
// Write a run of keys that share the same prefix and have consecutive index suffixes
while ( runLength > 0 & & changes > 0 ) {
// Overwrite the last sizeof(uint32_t) bytes of the key in place with the big-endian index
* ( uint32_t * ) ( kv . key . end ( ) - sizeof ( uint32_t ) ) = bigEndian32 ( index + + ) ;
2019-08-07 17:36:33 +08:00
kv . value = StringRef ( ( uint8_t * ) value . data ( ) , deterministicRandom ( ) - > randomInt ( minValueSize , maxValueSize + 1 ) ) ;
2019-06-24 16:05:16 +08:00
btree - > set ( kv ) ;
- - runLength ;
- - changes ;
kvBytes + = kv . key . size ( ) + kv . value . size ( ) ;
+ + records ;
2019-04-30 08:00:29 +08:00
}
2017-07-14 13:11:48 +08:00
}
2019-08-07 17:36:33 +08:00
// Enough bytes accumulated: wait for the previous commit to finish, then start the next one
if ( kvBytes > = commitTarget ) {
2019-11-04 19:04:03 +08:00
btree - > setOldestVersion ( btree - > getLastCommittedVersion ( ) ) ;
2018-12-06 14:41:04 +08:00
wait ( commit ) ;
2019-06-24 16:05:16 +08:00
printf ( " Cumulative %.2f MB keyValue bytes written at %.2f MB/s \n " , kvBytesTotal / 1e6 , kvBytesTotal / ( timer ( ) - start ) / 1e6 ) ;
2019-05-22 14:49:27 +08:00
2019-06-24 16:05:16 +08:00
// Copy the counters into locals so the lambda below captures their current values by copy
// instead of reading them through this, where they are reset before the commit completes
2019-02-24 19:47:32 +08:00
int recs = records ;
int kvb = kvBytes ;
2019-06-18 09:55:49 +08:00
// Capturing intervalStart via this->intervalStart makes IDEs unhappy as they do not know about the actor state object
double * pIntervalStart = & intervalStart ;
2019-02-24 19:47:32 +08:00
// When the commit completes, print its statistics and restart the interval timer
commit = map ( btree - > commit ( ) , [ = ] ( Void result ) {
2019-08-07 17:36:33 +08:00
printf ( " Committed: %s \n " , VersionedBTree : : counts . toString ( true ) . c_str ( ) ) ;
2019-06-18 09:55:49 +08:00
double elapsed = timer ( ) - * pIntervalStart ;
2019-02-24 19:47:32 +08:00
printf ( " Committed %d kvBytes in %d records in %f seconds, %.2f MB/s \n " , kvb , recs , elapsed , kvb / elapsed / 1e6 ) ;
2019-06-18 09:55:49 +08:00
* pIntervalStart = timer ( ) ;
2019-02-24 19:47:32 +08:00
return Void ( ) ;
} ) ;
2019-06-24 16:05:16 +08:00
kvBytesTotal + = kvBytes ;
2019-02-24 19:47:32 +08:00
kvBytes = 0 ;
2019-06-24 16:05:16 +08:00
records = 0 ;
2017-07-14 13:11:48 +08:00
}
}
2019-02-24 19:47:32 +08:00
// Wait for the last commit that was started
wait ( commit ) ;
2019-06-25 11:17:49 +08:00
printf ( " Cumulative %.2f MB keyValue bytes written at %.2f MB/s \n " , kvBytesTotal / 1e6 , kvBytesTotal / ( timer ( ) - start ) / 1e6 ) ;
2017-07-14 13:11:48 +08:00
2019-11-04 19:04:03 +08:00
printf ( " Starting random seeks \n " ) ;
2019-06-24 16:05:16 +08:00
state int reads = 30000 ;
2019-08-07 17:36:33 +08:00
// Run three seek actors together and wait for all of them
wait ( randomSeeks ( btree , reads , firstKeyChar , lastKeyChar ) & & randomSeeks ( btree , reads , firstKeyChar , lastKeyChar ) & & randomSeeks ( btree , reads , firstKeyChar , lastKeyChar ) ) ;
2019-03-15 15:46:09 +08:00
2018-10-15 18:43:43 +08:00
// Obtain the closed future before calling close(), then wait for the close to complete
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-07-14 13:11:48 +08:00
2017-06-10 05:56:41 +08:00
return Void ( ) ;
}