2017-06-10 05:56:41 +08:00
/*
* VersionedBTree . actor . cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013 - 2018 Apple Inc . and the FoundationDB project authors
*
* Licensed under the Apache License , Version 2.0 ( the " License " ) ;
* you may not use this file except in compliance with the License .
* You may obtain a copy of the License at
*
* http : //www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS ,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
* See the License for the specific language governing permissions and
* limitations under the License .
*/
# include "flow/flow.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IVersionedStore.h"
# include "fdbserver/IPager.h"
2017-06-10 05:56:41 +08:00
# include "fdbclient/Tuple.h"
# include "flow/serialize.h"
# include "flow/genericactors.actor.h"
# include "flow/UnitTest.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/MemoryPager.h"
# include "fdbserver/IndirectShadowPager.h"
2017-06-10 05:56:41 +08:00
# include <map>
# include <vector>
2017-08-04 15:01:25 +08:00
# include "fdbclient/CommitTransaction.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IKeyValueStore.h"
2019-02-21 18:46:30 +08:00
# include "fdbserver/DeltaTree.h"
2018-07-23 18:09:13 +08:00
# include <string.h>
2018-10-19 11:26:45 +08:00
# include "flow/actorcompiler.h"
2019-05-05 01:52:02 +08:00
# include <cinttypes>
2018-07-23 18:09:13 +08:00
2019-05-29 21:23:32 +08:00
// TODO: Move this to a flow header once it is mature.
struct SplitStringRef {
StringRef a ;
StringRef b ;
SplitStringRef ( StringRef a = StringRef ( ) , StringRef b = StringRef ( ) ) : a ( a ) , b ( b ) {
}
SplitStringRef ( Arena & arena , const SplitStringRef & toCopy )
: a ( toStringRef ( arena ) ) , b ( ) {
}
SplitStringRef prefix ( int len ) const {
if ( len < = a . size ( ) ) {
return SplitStringRef ( a . substr ( 0 , len ) ) ;
}
len - = a . size ( ) ;
return SplitStringRef ( a , b . substr ( 0 , len ) ) ;
}
StringRef toStringRef ( Arena & arena ) const {
StringRef c = makeString ( size ( ) , arena ) ;
memcpy ( mutateString ( c ) , a . begin ( ) , a . size ( ) ) ;
memcpy ( mutateString ( c ) + a . size ( ) , b . begin ( ) , b . size ( ) ) ;
return c ;
}
Standalone < StringRef > toStringRef ( ) const {
Arena a ;
return Standalone < StringRef > ( toStringRef ( a ) , a ) ;
}
int size ( ) const {
return a . size ( ) + b . size ( ) ;
}
int expectedSize ( ) const {
return size ( ) ;
}
std : : string toString ( ) const {
return format ( " %s%s " , a . toString ( ) . c_str ( ) , b . toString ( ) . c_str ( ) ) ;
}
std : : string toHexString ( ) const {
return format ( " %s%s " , a . toHexString ( ) . c_str ( ) , b . toHexString ( ) . c_str ( ) ) ;
}
struct const_iterator {
const uint8_t * ptr ;
const uint8_t * end ;
const uint8_t * next ;
inline bool operator = = ( const const_iterator & rhs ) const {
return ptr = = rhs . ptr ;
}
inline const_iterator & operator + + ( ) {
+ + ptr ;
if ( ptr = = end ) {
ptr = next ;
}
return * this ;
}
inline const_iterator & operator + ( int n ) {
ptr + = n ;
if ( ptr > = end ) {
ptr = next + ( ptr - end ) ;
}
return * this ;
}
inline uint8_t operator * ( ) const {
return * ptr ;
}
} ;
inline const_iterator begin ( ) const {
return { a . begin ( ) , a . end ( ) , b . begin ( ) } ;
}
inline const_iterator end ( ) const {
return { b . end ( ) } ;
}
template < typename StringT >
int compare ( const StringT & rhs ) const {
auto j = begin ( ) ;
auto k = rhs . begin ( ) ;
auto jEnd = end ( ) ;
auto kEnd = rhs . end ( ) ;
while ( j ! = jEnd & & k ! = kEnd ) {
int cmp = * j - * k ;
if ( cmp ! = 0 ) {
return cmp ;
}
}
// If we've reached the end of *this, then values are equal if rhs is also exhausted, otherwise *this is less than rhs
if ( j = = jEnd ) {
return k = = kEnd ? 0 : - 1 ;
}
return 1 ;
}
} ;
2019-02-21 18:46:30 +08:00
# define STR(x) LiteralStringRef(x)
struct RedwoodRecordRef {
2019-05-29 21:23:32 +08:00
typedef uint8_t byte ;
2019-02-21 18:46:30 +08:00
2019-04-30 08:00:29 +08:00
RedwoodRecordRef ( KeyRef key = KeyRef ( ) , Version ver = 0 , Optional < ValueRef > value = { } , uint32_t chunkTotal = 0 , uint32_t chunkStart = 0 )
: key ( key ) , version ( ver ) , value ( value ) , chunk ( { chunkTotal , chunkStart } )
2019-02-21 18:46:30 +08:00
{
}
2019-05-29 21:23:32 +08:00
RedwoodRecordRef ( Arena & arena , const RedwoodRecordRef & toCopy )
2019-06-04 19:03:52 +08:00
: key ( arena , toCopy . key ) , version ( toCopy . version ) , chunk ( toCopy . chunk )
{
if ( toCopy . value . present ( ) ) {
if ( toCopy . localValue ( ) ) {
setPageID ( toCopy . getPageID ( ) ) ;
}
else {
value = ValueRef ( arena , toCopy . value . get ( ) ) ;
}
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
RedwoodRecordRef ( KeyRef key , Optional < ValueRef > value , const byte intFields [ 14 ] )
: key ( key ) , value ( value )
{
deserializeIntFields ( intFields ) ;
}
2019-06-04 19:03:52 +08:00
RedwoodRecordRef ( const RedwoodRecordRef & toCopy ) : key ( toCopy . key ) , version ( toCopy . version ) , chunk ( toCopy . chunk ) {
if ( toCopy . value . present ( ) ) {
if ( toCopy . localValue ( ) ) {
setPageID ( toCopy . getPageID ( ) ) ;
}
else {
value = toCopy . value ;
}
}
}
RedwoodRecordRef & operator = ( const RedwoodRecordRef & toCopy ) {
key = toCopy . key ;
version = toCopy . version ;
chunk = toCopy . chunk ;
if ( toCopy . value . present ( ) ) {
if ( toCopy . localValue ( ) ) {
setPageID ( toCopy . getPageID ( ) ) ;
}
else {
value = toCopy . value ;
}
}
return * this ;
}
bool localValue ( ) const {
return value . get ( ) . begin ( ) = = bigEndianPageIDSpace ;
}
// RedwoodRecordRefs are used for both internal and leaf pages of the BTree.
// Boundary records in internal pages are made from leaf records.
// These functions make creating and working with internal page records more convenient.
inline LogicalPageID getPageID ( ) const {
ASSERT ( value . present ( ) ) ;
return bigEndian32 ( * ( LogicalPageID * ) value . get ( ) . begin ( ) ) ;
}
inline void setPageID ( LogicalPageID id ) {
* ( LogicalPageID * ) bigEndianPageIDSpace = bigEndian32 ( id ) ;
value = ValueRef ( bigEndianPageIDSpace , sizeof ( bigEndianPageIDSpace ) ) ;
}
inline RedwoodRecordRef withPageID ( LogicalPageID id ) const {
RedwoodRecordRef rec ( key , version , { } , chunk . total , chunk . start ) ;
rec . setPageID ( id ) ;
return rec ;
}
inline RedwoodRecordRef withoutValue ( ) const {
2019-05-22 10:16:32 +08:00
return RedwoodRecordRef ( key , version , { } , chunk . total , chunk . start ) ;
}
2019-05-29 21:23:32 +08:00
// Returns how many bytes are in common between the integer fields of *this and other, assuming that
// all values are BigEndian, version is 64 bits, chunk total is 24 bits, and chunk start is 24 bits
int getCommonIntFieldPrefix ( const RedwoodRecordRef & other ) const {
if ( version ! = other . version ) {
return clzll ( version ^ other . version ) > > 3 ;
}
if ( chunk . total ! = other . chunk . total ) {
// the -1 is because we are only considering the lower 3 bytes
return 8 + ( clz ( chunk . total ^ other . chunk . total ) > > 3 ) - 1 ;
}
if ( chunk . start ! = other . chunk . start ) {
// the -1 is because we are only considering the lower 3 bytes
return 11 + ( clz ( chunk . start ^ other . chunk . start ) > > 3 ) - 1 ;
}
return 14 ;
}
2019-05-30 17:10:07 +08:00
// Truncate (key, version, chunk.total, chunk.start) tuple to len bytes.
void truncate ( int len ) {
if ( len < = key . size ( ) ) {
key = key . substr ( 0 , len ) ;
version = 0 ;
chunk . total = 0 ;
chunk . start = 0 ;
}
else {
byte fields [ intFieldArraySize ] ;
serializeIntFields ( fields ) ;
int end = len - key . size ( ) ;
for ( int i = intFieldArraySize - 1 ; i > = end ; - - i ) {
fields [ i ] = 0 ;
}
}
}
2019-05-29 21:23:32 +08:00
// Find the common prefix between two records, assuming that the first
// skip bytes are the same.
inline int getCommonPrefixLen ( const RedwoodRecordRef & other , int skip ) const {
int skipStart = std : : min ( skip , key . size ( ) ) ;
int common = skipStart + commonPrefixLength ( key . begin ( ) + skipStart , other . key . begin ( ) + skipStart , std : : min ( other . key . size ( ) , key . size ( ) ) - skipStart ) ;
if ( common = = key . size ( ) & & key . size ( ) = = other . key . size ( ) ) {
common + = getCommonIntFieldPrefix ( other ) ;
}
return common ;
}
static const int intFieldArraySize = 14 ;
// Write big endian values of version (64 bits), total (24 bits), and start (24 bits) fields
// to an array of 14 bytes
void serializeIntFields ( byte * dst ) const {
* ( uint32_t * ) ( dst + 10 ) = bigEndian32 ( chunk . start ) ;
* ( uint32_t * ) ( dst + 7 ) = bigEndian32 ( chunk . total ) ;
* ( uint64_t * ) dst = bigEndian64 ( version ) ;
}
// Initialize int fields from the array format that serializeIntFields produces
void deserializeIntFields ( const byte * src ) {
version = bigEndian64 ( * ( uint64_t * ) src ) ;
chunk . total = bigEndian32 ( * ( uint32_t * ) ( src + 7 ) ) & 0xffffff ;
chunk . start = bigEndian32 ( * ( uint32_t * ) ( src + 10 ) ) & 0xffffff ;
}
// TODO: Use SplitStringRef (unless it ends up being slower)
2019-02-21 18:46:30 +08:00
KeyRef key ;
Optional < ValueRef > value ;
2019-05-29 21:23:32 +08:00
Version version ;
2019-04-30 08:00:29 +08:00
struct {
uint32_t total ;
2019-05-29 21:23:32 +08:00
// TODO: Change start to chunk number.
2019-04-30 08:00:29 +08:00
uint32_t start ;
} chunk ;
2019-02-21 18:46:30 +08:00
2019-06-04 19:03:52 +08:00
// If the value is a page ID it will be stored here
uint8_t bigEndianPageIDSpace [ sizeof ( LogicalPageID ) ] ;
2019-02-21 18:46:30 +08:00
int expectedSize ( ) const {
2019-04-30 08:00:29 +08:00
return key . expectedSize ( ) + value . expectedSize ( ) ;
2019-02-21 18:46:30 +08:00
}
bool isMultiPart ( ) const {
2019-05-29 21:23:32 +08:00
return chunk . total ! = 0 ;
2019-02-21 18:46:30 +08:00
}
// Generate a kv shard from a complete kv
RedwoodRecordRef split ( int start , int len ) {
2019-04-30 08:00:29 +08:00
ASSERT ( ! isMultiPart ( ) ) ;
return RedwoodRecordRef ( key , version , value . get ( ) . substr ( start , len ) , value . get ( ) . size ( ) , start ) ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
class Writer {
public :
Writer ( byte * ptr ) : wptr ( ptr ) { }
byte * wptr ;
template < typename T > void write ( const T & in ) {
* ( T * ) wptr = in ;
wptr + = sizeof ( T ) ;
}
// Write a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit.
// Values > 15 bits in length are not valid input but this is not checked for.
void writeVarInt ( int x ) {
if ( x > = 128 ) {
2019-05-30 07:47:53 +08:00
* wptr + + = ( uint8_t ) ( ( x > > 8 ) | 0x80 ) ;
2019-05-29 21:23:32 +08:00
}
* wptr + + = ( uint8_t ) x ;
}
void writeString ( StringRef s ) {
memcpy ( wptr , s . begin ( ) , s . size ( ) ) ;
wptr + = s . size ( ) ;
}
} ;
class Reader {
public :
Reader ( const void * ptr ) : rptr ( ( const byte * ) ptr ) { }
const byte * rptr ;
template < typename T > T read ( ) {
T r = * ( const T * ) rptr ;
rptr + = sizeof ( T ) ;
return r ;
}
// Read a big endian 1 or 2 byte integer using the high bit of the first byte as an "extension" bit.
int readVarInt ( ) {
int x = * rptr + + ;
// If the high bit is set
if ( x & 0x80 ) {
// Clear the high bit
x & = 0x7f ;
// Shift low byte left
x < < = 8 ;
// Read the new low byte and OR it in
x | = * rptr + + ;
}
return x ;
}
StringRef readString ( int len ) {
StringRef s ( rptr , len ) ;
rptr + = len ;
return s ;
}
const byte * readBytes ( int len ) {
const byte * b = rptr ;
rptr + = len ;
return b ;
}
} ;
2019-02-21 18:46:30 +08:00
# pragma pack(push,1)
struct Delta {
2019-05-29 21:23:32 +08:00
// Serialized Format
//
// 1 byte for Flags + a 4 bit length
// borrow source is prev ancestor - 0 or 1
// has_key_suffix
// has_value
// has_version
// other_fields suffix len - 4 bits
//
// If has value and value is not 4 bytes
// 1 byte value length
//
// 1 or 2 bytes for Prefix Borrow Length (hi bit indicates presence of second byte)
//
// IF has_key_suffix is set
// 1 or 2 bytes for Key Suffix Length
//
// Key suffix bytes
// Meta suffix bytes
// Value bytes
//
// For a series of RedwoodRecordRef's containing shards of the same KV pair where the key size is < 104 bytes,
// the overhead per middle chunk is 7 bytes:
// 4 bytes of child pointers in the DeltaTree Node
// 1 flag byte
// 1 prefix borrow length byte
// 1 meta suffix byte describing chunk start position
enum EFlags {
PREFIX_SOURCE = 0x80 ,
HAS_KEY_SUFFIX = 0x40 ,
HAS_VALUE = 0x20 ,
HAS_VERSION = 0x10 ,
INT_FIELD_SUFFIX_BITS = 0x0f
} ;
2019-02-21 18:46:30 +08:00
uint8_t flags ;
2019-07-02 15:58:43 +08:00
inline byte * data ( ) {
return ( byte * ) ( this + 1 ) ;
}
inline const byte * data ( ) const {
return ( const byte * ) ( this + 1 ) ;
}
2019-05-29 21:23:32 +08:00
void setPrefixSource ( bool val ) {
if ( val ) {
flags | = PREFIX_SOURCE ;
2019-04-30 08:00:29 +08:00
}
else {
2019-05-29 21:23:32 +08:00
flags & = ~ PREFIX_SOURCE ;
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
bool getPrefixSource ( ) const {
return flags & PREFIX_SOURCE ;
}
RedwoodRecordRef apply ( const RedwoodRecordRef & base , Arena & arena ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
int prefixLen = r . readVarInt ( ) ;
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
StringRef k ;
int keyPrefixLen = std : : min ( prefixLen , base . key . size ( ) ) ;
int intFieldPrefixLen = prefixLen - keyPrefixLen ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
if ( keySuffixLen > 0 ) {
k = makeString ( keyPrefixLen + keySuffixLen , arena ) ;
memcpy ( mutateString ( k ) , base . key . begin ( ) , keyPrefixLen ) ;
memcpy ( mutateString ( k ) + keyPrefixLen , r . readString ( keySuffixLen ) . begin ( ) , keySuffixLen ) ;
}
else {
k = base . key . substr ( 0 , keyPrefixLen ) ;
}
// Now decode the integer fields
const byte * intFieldSuffix = r . readBytes ( intFieldSuffixLen ) ;
// Create big endian array in which to reassemble the integer fields from prefix and suffix bytes
byte intFields [ intFieldArraySize ] ;
// If borrowing any bytes, get the source's integer field array
if ( intFieldPrefixLen > 0 ) {
base . serializeIntFields ( intFields ) ;
}
else {
memset ( intFields , 0 , intFieldArraySize ) ;
}
// Version offset is used to skip the version bytes in the int field array when version is missing (aka 0)
2019-05-30 07:26:58 +08:00
int versionOffset = ( ( intFieldPrefixLen = = 0 ) & & ( ~ flags & HAS_VERSION ) ) ? 8 : 0 ;
2019-05-29 21:23:32 +08:00
// If there are suffix bytes, copy those into place after the prefix
if ( intFieldSuffixLen > 0 ) {
memcpy ( intFields + versionOffset + intFieldPrefixLen , intFieldSuffix , intFieldSuffixLen ) ;
}
// Zero out any remaining bytes if the array was initialized from base
if ( intFieldPrefixLen > 0 ) {
for ( int i = versionOffset + intFieldPrefixLen + intFieldSuffixLen ; i < intFieldArraySize ; + + i ) {
intFields [ i ] = 0 ;
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
return RedwoodRecordRef ( k , flags & HAS_VALUE ? r . readString ( valueLen ) : Optional < ValueRef > ( ) , intFields ) ;
}
int size ( ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
2019-06-18 09:55:49 +08:00
r . readVarInt ( ) ; // prefixlen
2019-05-29 21:23:32 +08:00
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
2019-07-02 15:58:43 +08:00
return sizeof ( Delta ) + r . rptr - data ( ) + intFieldSuffixLen + valueLen + keySuffixLen ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
// Delta can't be determined without the RedwoodRecordRef upon which the Delta is based.
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2019-07-02 15:58:43 +08:00
Reader r ( data ( ) ) ;
2019-05-29 21:23:32 +08:00
2019-05-30 17:10:07 +08:00
std : : string flagString = " " ;
2019-05-30 07:26:58 +08:00
if ( flags & PREFIX_SOURCE ) flagString + = " prefixSource " ;
if ( flags & HAS_KEY_SUFFIX ) flagString + = " keySuffix " ;
if ( flags & HAS_VERSION ) flagString + = " Version " ;
if ( flags & HAS_VALUE ) flagString + = " Value " ;
2019-05-29 21:23:32 +08:00
int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS ;
int prefixLen = r . readVarInt ( ) ;
int valueLen = ( flags & HAS_VALUE ) ? r . read < uint8_t > ( ) : 0 ;
int keySuffixLen = ( flags & HAS_KEY_SUFFIX ) ? r . readVarInt ( ) : 0 ;
2019-05-30 17:10:07 +08:00
return format ( " len: %d flags: %s prefixLen: %d keySuffixLen: %d intFieldSuffix: %d valueLen %d raw: %s " ,
size ( ) , flagString . c_str ( ) , prefixLen , keySuffixLen , intFieldSuffixLen , valueLen , StringRef ( ( const uint8_t * ) this , size ( ) ) . toHexString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
}
} ;
# pragma pack(pop)
2019-06-04 19:03:52 +08:00
// Compares and orders by key, version, chunk.start, chunk.total.
// Value is not considered, as it is does not make sense for a container
// to have two records which differ only in value.
2019-02-21 18:46:30 +08:00
int compare ( const RedwoodRecordRef & rhs ) const {
int cmp = key . compare ( rhs . key ) ;
if ( cmp = = 0 ) {
cmp = version - rhs . version ;
if ( cmp = = 0 ) {
2019-05-22 10:16:32 +08:00
// It is assumed that in any data set there will never be more than one
// unique chunk total size for the same key and version, so sort by start, total
// Chunked (represented by chunk.total > 0) sorts higher than whole
cmp = chunk . start - rhs . chunk . start ;
if ( cmp = = 0 ) {
2019-04-30 08:00:29 +08:00
cmp = chunk . total - rhs . chunk . total ;
2019-02-21 18:46:30 +08:00
}
}
}
return cmp ;
}
2019-06-04 19:03:52 +08:00
// Compares key fields and value for equality
bool identical ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) = = 0 & & value = = rhs . value ;
}
2019-02-21 18:46:30 +08:00
bool operator = = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) = = 0 ;
}
2019-06-04 19:03:52 +08:00
bool operator ! = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) ! = 0 ;
}
bool operator < ( const RedwoodRecordRef & rhs ) const {
2019-02-21 18:46:30 +08:00
return compare ( rhs ) < 0 ;
}
bool operator > ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) > 0 ;
}
bool operator < = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) < = 0 ;
}
bool operator > = ( const RedwoodRecordRef & rhs ) const {
return compare ( rhs ) > = 0 ;
}
2019-06-06 11:58:47 +08:00
int deltaSize ( const RedwoodRecordRef & base , bool worstCase = true ) const {
2019-05-29 21:23:32 +08:00
int size = sizeof ( Delta ) ;
2019-02-21 18:46:30 +08:00
if ( value . present ( ) ) {
2019-05-29 21:23:32 +08:00
size + = value . get ( ) . size ( ) ;
+ + size ;
}
int prefixLen = getCommonPrefixLen ( base , 0 ) ;
2019-06-06 11:58:47 +08:00
size + = ( worstCase | | prefixLen > = 128 ) ? 2 : 1 ;
2019-05-29 21:23:32 +08:00
int intFieldPrefixLen ;
// Currently using a worst-guess guess where int fields in suffix are stored in their entirety if nonzero.
if ( prefixLen < key . size ( ) ) {
int keySuffixLen = key . size ( ) - prefixLen ;
2019-06-06 11:58:47 +08:00
size + = ( worstCase | | keySuffixLen > = 128 ) ? 2 : 1 ;
2019-05-29 21:23:32 +08:00
size + = keySuffixLen ;
intFieldPrefixLen = 0 ;
}
else {
intFieldPrefixLen = prefixLen - key . size ( ) ;
2019-06-06 11:58:47 +08:00
if ( worstCase ) {
size + = 2 ;
}
2019-05-29 21:23:32 +08:00
}
if ( version = = 0 & & chunk . total = = 0 & & chunk . start = = 0 ) {
// No int field suffix needed
}
else {
byte fields [ intFieldArraySize ] ;
serializeIntFields ( fields ) ;
const byte * end = fields + intFieldArraySize - 1 ;
int trailingNulls = 0 ;
while ( * end - - = = 0 ) {
+ + trailingNulls ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
size + = std : : max ( 0 , intFieldArraySize - intFieldPrefixLen - trailingNulls ) ;
2019-05-30 09:06:11 +08:00
if ( intFieldPrefixLen = = 0 & & version = = 0 ) {
size - = 8 ;
}
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
return size ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
// commonPrefix between *this and base can be passed if known
int writeDelta ( Delta & d , const RedwoodRecordRef & base , int commonPrefix = - 1 ) const {
d . flags = version = = 0 ? 0 : Delta : : HAS_VERSION ;
if ( commonPrefix < 0 ) {
commonPrefix = getCommonPrefixLen ( base , 0 ) ;
2019-04-30 08:00:29 +08:00
}
2019-05-29 21:23:32 +08:00
2019-07-02 15:58:43 +08:00
Writer w ( d . data ( ) ) ;
2019-05-29 21:23:32 +08:00
// prefixLen
w . writeVarInt ( commonPrefix ) ;
// valueLen
2019-02-21 18:46:30 +08:00
if ( value . present ( ) ) {
2019-05-29 21:23:32 +08:00
d . flags | = Delta : : HAS_VALUE ;
w . write < uint8_t > ( value . get ( ) . size ( ) ) ;
}
// keySuffixLen
if ( key . size ( ) > commonPrefix ) {
d . flags | = Delta : : HAS_KEY_SUFFIX ;
StringRef keySuffix = key . substr ( commonPrefix ) ;
w . writeVarInt ( keySuffix . size ( ) ) ;
// keySuffix
w . writeString ( keySuffix ) ;
}
// This is a common case, where no int suffix is needed
if ( version = = 0 & & chunk . total = = 0 & & chunk . start = = 0 ) {
// The suffixLen bits in flags are already zero, so nothing to do here.
}
else {
byte fields [ intFieldArraySize ] ;
serializeIntFields ( fields ) ;
// Find the position of the first null byte from the right
// This for loop has no endPos > 0 check because it is known that the array contains non-null bytes
int endPos ;
for ( endPos = intFieldArraySize ; fields [ endPos - 1 ] = = 0 ; - - endPos ) ;
// Start copying after any prefix bytes that matched the int fields of the base
int intFieldPrefixLen = std : : max ( 0 , commonPrefix - key . size ( ) ) ;
2019-05-30 07:26:58 +08:00
int startPos = intFieldPrefixLen + ( intFieldPrefixLen = = 0 & & version = = 0 ? 8 : 0 ) ;
2019-05-29 21:23:32 +08:00
int suffixLen = std : : max ( 0 , endPos - startPos ) ;
if ( suffixLen > 0 ) {
w . writeString ( StringRef ( fields + startPos , suffixLen ) ) ;
d . flags | = suffixLen ;
2019-02-21 18:46:30 +08:00
}
}
2019-05-29 21:23:32 +08:00
if ( value . present ( ) ) {
w . writeString ( value . get ( ) ) ;
}
2019-07-02 15:58:43 +08:00
return w . wptr - d . data ( ) + sizeof ( Delta ) ;
2019-02-21 18:46:30 +08:00
}
2019-05-29 21:23:32 +08:00
template < typename StringRefT >
static std : : string kvformat ( StringRefT s , int hexLimit = - 1 ) {
2019-02-21 18:46:30 +08:00
bool hex = false ;
for ( auto c : s ) {
if ( ! isprint ( c ) ) {
hex = true ;
break ;
}
}
return hex ? s . toHexString ( hexLimit ) : s . toString ( ) ;
}
std : : string toString ( int hexLimit = 15 ) const {
std : : string r ;
2019-06-04 19:03:52 +08:00
r + = format ( " '%s'@% " PRId64 , kvformat ( key , hexLimit ) . c_str ( ) , version ) ;
2019-06-06 11:58:47 +08:00
r + = format ( " [%u/%u]-> " , chunk . start , chunk . total ) ;
2019-02-21 18:46:30 +08:00
if ( value . present ( ) ) {
2019-06-04 19:03:52 +08:00
// Assume that values the size of a page ID are page IDs. It's not perfect but it's just for debugging.
if ( value . get ( ) . size ( ) = = sizeof ( LogicalPageID ) ) {
2019-06-06 11:58:47 +08:00
r + = format ( " [PageID=%u] " , getPageID ( ) ) ;
2019-06-04 19:03:52 +08:00
}
else {
r + = format ( " '%s' " , kvformat ( value . get ( ) , hexLimit ) . c_str ( ) ) ;
}
2019-02-21 18:46:30 +08:00
}
else {
2019-06-04 19:03:52 +08:00
r + = " null " ;
2019-02-21 18:46:30 +08:00
}
return r ;
}
} ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
struct BTreePage {
2019-02-21 18:46:30 +08:00
2018-06-08 18:32:34 +08:00
enum EPageFlags { IS_LEAF = 1 } ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
typedef DeltaTree < RedwoodRecordRef > BinaryTree ;
# pragma pack(push,1)
struct {
uint8_t flags ;
uint16_t count ;
uint32_t kvBytes ;
uint8_t extensionPageCount ;
} ;
2018-10-19 11:26:45 +08:00
# pragma pack(pop)
2017-06-10 05:56:41 +08:00
2019-07-02 15:58:43 +08:00
inline LogicalPageID * extensionPages ( ) {
return ( LogicalPageID * ) ( this + 1 ) ;
}
inline const LogicalPageID * extensionPages ( ) const {
return ( const LogicalPageID * ) ( this + 1 ) ;
}
2019-02-21 18:46:30 +08:00
int size ( ) const {
const BinaryTree * t = & tree ( ) ;
return ( uint8_t * ) t - ( uint8_t * ) this + t - > size ( ) ;
2018-09-19 15:32:39 +08:00
}
2019-02-21 18:46:30 +08:00
bool isLeaf ( ) const {
return flags & IS_LEAF ;
}
BinaryTree & tree ( ) {
2019-07-02 15:58:43 +08:00
return * ( BinaryTree * ) ( extensionPages ( ) + extensionPageCount ) ;
2019-02-21 18:46:30 +08:00
}
const BinaryTree & tree ( ) const {
2019-07-02 15:58:43 +08:00
return * ( const BinaryTree * ) ( extensionPages ( ) + extensionPageCount ) ;
2018-09-19 15:32:39 +08:00
}
static inline int GetHeaderSize ( int extensionPages = 0 ) {
2019-07-02 15:58:43 +08:00
return sizeof ( BTreePage ) + ( extensionPages * sizeof ( LogicalPageID ) ) ;
2018-06-14 19:15:14 +08:00
}
2019-02-21 18:46:30 +08:00
std : : string toString ( bool write , LogicalPageID id , Version ver , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound ) const {
2018-06-08 18:32:34 +08:00
std : : string r ;
2019-05-29 21:23:32 +08:00
r + = format ( " BTreePage op=%s id=%d ver=% " PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d extPages=%d \n lowerBound: %s \n upperBound: %s \n " ,
2019-02-21 18:46:30 +08:00
write ? " write " : " read " , id , ver , this , ( int ) flags , ( int ) count , ( int ) kvBytes , ( int ) extensionPageCount ,
lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
try {
if ( count > 0 ) {
2019-02-21 18:46:30 +08:00
// This doesn't use the cached reader for the page but it is only for debugging purposes
BinaryTree : : Reader reader ( & tree ( ) , lowerBound , upperBound ) ;
BinaryTree : : Cursor c = reader . getCursor ( ) ;
2018-08-29 04:46:14 +08:00
c . moveFirst ( ) ;
ASSERT ( c . valid ( ) ) ;
2019-06-06 11:58:47 +08:00
bool anyOutOfRange = false ;
2018-08-29 04:46:14 +08:00
do {
2019-02-21 18:46:30 +08:00
r + = " " ;
2019-06-04 19:03:52 +08:00
r + = c . get ( ) . toString ( ) ;
2017-06-10 05:56:41 +08:00
2019-06-06 11:58:47 +08:00
bool tooLow = c . get ( ) . key < lowerBound - > key ;
bool tooHigh = c . get ( ) . key > upperBound - > key ;
if ( tooLow | | tooHigh ) {
anyOutOfRange = true ;
if ( tooLow ) {
r + = " (too low) " ;
}
if ( tooHigh ) {
r + = " (too high) " ;
2018-08-29 04:46:14 +08:00
}
2018-07-18 18:19:35 +08:00
}
2019-06-06 11:58:47 +08:00
r + = " \n " ;
2017-06-10 05:56:41 +08:00
2018-08-29 04:46:14 +08:00
} while ( c . moveNext ( ) ) ;
2019-06-06 11:58:47 +08:00
ASSERT ( ! anyOutOfRange ) ;
2018-08-29 04:46:14 +08:00
}
2019-04-18 03:57:23 +08:00
} catch ( Error & e ) {
2018-08-29 04:46:14 +08:00
debug_printf ( " BTreePage::toString ERROR: %s \n " , e . what ( ) ) ;
debug_printf ( " BTreePage::toString partial result: %s \n " , r . c_str ( ) ) ;
throw ;
2018-06-08 18:32:34 +08:00
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
return r ;
2017-06-10 05:56:41 +08:00
}
2018-10-19 11:26:45 +08:00
} ;
2018-06-08 18:32:34 +08:00
2019-05-22 10:16:32 +08:00
// Resets the BTreePage at the start of page to an empty page carrying newFlags.
// pageSize is not used by this function.
static void makeEmptyPage(Reference<IPage> page, uint8_t newFlags, int pageSize) {
	VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size());

	BTreePage &header = *(BTreePage *)page->begin();
	header.flags = newFlags;
	header.kvBytes = 0;
	header.count = 0;
	// The location of tree() depends on extensionPageCount, so it is cleared before the tree is built.
	header.extensionPageCount = 0;

	header.tree().build(nullptr, nullptr, nullptr, nullptr);
}
// Returns page->userData interpreted as a pointer to the page's BinaryTree::Reader.
BTreePage::BinaryTree::Reader * getReader(Reference<const IPage> page) {
	typedef BTreePage::BinaryTree::Reader PageReader;
	PageReader *cachedReader = (PageReader *)page->userData;
	return cachedReader;
}
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// One entry of the result of buildPages(): a lower boundary record together with the page(s) built for it.
struct BoundaryAndPage {
2019-02-21 18:46:30 +08:00
// Lower boundary record; Standalone, so it carries its own memory.
Standalone < RedwoodRecordRef > lowerBound ;
2018-09-19 15:32:39 +08:00
// NOTE(review): this comment previously said "Only firstPage or multiPage will be in use at once",
// but no member named multiPage exists in this struct. Presumably extPages holds the extension
// pages that follow firstPage when the page spans several blocks - confirm against buildPages().
Reference < IPage > firstPage ;
std : : vector < Reference < IPage > > extPages ;
2018-08-29 04:46:14 +08:00
} ;
2018-07-10 17:24:01 +08:00
// Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages.
2019-02-21 18:46:30 +08:00
// TODO: Refactor this as an accumulator you add sorted keys to which makes pages.
2018-06-08 18:32:34 +08:00
template < typename Allocator >
2019-02-21 18:46:30 +08:00
static std : : vector < BoundaryAndPage > buildPages ( bool minimalBoundaries , const RedwoodRecordRef & lowerBound , const RedwoodRecordRef & upperBound , std : : vector < RedwoodRecordRef > entries , uint8_t newFlags , Allocator const & newBlockFn , int usableBlockSize ) {
2018-09-19 15:32:39 +08:00
// This is how much space for the binary tree exists in the page, after the header
2019-02-21 18:46:30 +08:00
int pageSize = usableBlockSize - BTreePage : : GetHeaderSize ( ) ;
2018-09-19 15:32:39 +08:00
// Each new block adds (usableBlockSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize.
int netTreeBlockSize = usableBlockSize - sizeof ( LogicalPageID ) ;
2018-07-15 04:37:52 +08:00
2018-09-19 15:32:39 +08:00
int blockCount = 1 ;
std : : vector < BoundaryAndPage > pages ;
2018-06-08 18:32:34 +08:00
2019-02-21 18:46:30 +08:00
int kvBytes = 0 ;
int compressedBytes = BTreePage : : BinaryTree : : GetTreeOverhead ( ) ;
2018-06-15 08:52:25 +08:00
2018-06-08 18:32:34 +08:00
int start = 0 ;
2018-06-15 08:52:25 +08:00
int i = 0 ;
2018-06-08 18:32:34 +08:00
const int iEnd = entries . size ( ) ;
2018-08-29 04:46:14 +08:00
// Lower bound of the page being added to
2019-05-22 10:16:32 +08:00
RedwoodRecordRef pageLowerBound = lowerBound . withoutValue ( ) ;
2019-02-21 18:46:30 +08:00
RedwoodRecordRef pageUpperBound ;
2018-08-29 04:46:14 +08:00
while ( i < = iEnd ) {
bool end = i = = iEnd ;
bool flush = end ;
// If not the end, add i to the page if necessary
if ( end ) {
2019-05-22 10:16:32 +08:00
pageUpperBound = upperBound . withoutValue ( ) ;
2018-08-29 04:46:14 +08:00
}
else {
2019-02-21 18:46:30 +08:00
// Get delta from previous record
const RedwoodRecordRef & entry = entries [ i ] ;
int deltaSize = entry . deltaSize ( ( i = = start ) ? pageLowerBound : entries [ i - 1 ] ) ;
2018-08-29 04:46:14 +08:00
int keySize = entry . key . size ( ) ;
2019-02-21 18:46:30 +08:00
int valueSize = entry . value . present ( ) ? entry . value . get ( ) . size ( ) : 0 ;
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
int spaceNeeded = sizeof ( BTreePage : : BinaryTree : : Node ) + deltaSize ;
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
debug_printf ( " Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s \n " ,
i + 1 , entries . size ( ) , i , keySize , valueSize , deltaSize ,
spaceNeeded , compressedBytes , pageSize , entry . toString ( ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
int spaceAvailable = pageSize - compressedBytes ;
2018-09-19 15:32:39 +08:00
// Does it fit?
bool fits = spaceAvailable > = spaceNeeded ;
2018-08-29 04:46:14 +08:00
2018-09-19 15:32:39 +08:00
// If it doesn't fit, either end the current page or increase the page size
2018-08-29 04:46:14 +08:00
if ( ! fits ) {
2018-09-19 15:32:39 +08:00
// For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor
int minimumEntries = minimalBoundaries ? 1 : 4 ;
int count = i - start ;
// If not enough entries or page less than half full, increase page size to make the entry fit
if ( count < minimumEntries | | spaceAvailable > pageSize / 2 ) {
// Figure out how many additional whole or partial blocks are needed
int newBlocks = 1 + ( spaceNeeded - spaceAvailable - 1 ) / netTreeBlockSize ;
2018-09-24 17:42:23 +08:00
int newPageSize = pageSize + ( newBlocks * netTreeBlockSize ) ;
2019-02-21 18:46:30 +08:00
if ( newPageSize < = BTreePage : : BinaryTree : : MaximumTreeSize ( ) ) {
2018-09-24 17:42:23 +08:00
blockCount + = newBlocks ;
pageSize = newPageSize ;
fits = true ;
}
2018-08-29 04:46:14 +08:00
}
2018-09-24 17:42:23 +08:00
if ( ! fits ) {
2019-05-30 17:10:07 +08:00
pageUpperBound = entry . withoutValue ( ) ;
2018-08-29 04:46:14 +08:00
}
}
2018-07-15 04:37:52 +08:00
2018-09-19 15:32:39 +08:00
// If the record fits then add it to the page set
if ( fits ) {
2018-08-29 04:46:14 +08:00
kvBytes + = keySize + valueSize ;
2018-09-19 15:32:39 +08:00
compressedBytes + = spaceNeeded ;
2018-08-29 04:46:14 +08:00
+ + i ;
}
2018-07-15 04:37:52 +08:00
2018-08-29 04:46:14 +08:00
flush = ! fits ;
}
2018-07-15 04:37:52 +08:00
2018-08-29 04:46:14 +08:00
// If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above.
if ( flush ) {
end = i = = iEnd ; // i could have been moved above
2019-03-15 15:46:09 +08:00
2018-08-29 04:46:14 +08:00
int count = i - start ;
2019-03-15 15:46:09 +08:00
// If not writing the final page, reduce entry count of page by a third
if ( ! end ) {
i - = count / 3 ;
2019-05-22 10:16:32 +08:00
pageUpperBound = entries [ i ] . withoutValue ( ) ;
2019-03-15 15:46:09 +08:00
}
2019-05-30 17:10:07 +08:00
// If this isn't the final page, shorten the upper boundary
if ( ! end & & minimalBoundaries ) {
int commonPrefix = pageUpperBound . getCommonPrefixLen ( entries [ i - 1 ] , 0 ) ;
pageUpperBound . truncate ( commonPrefix + 1 ) ;
}
2019-03-15 15:46:09 +08:00
debug_printf ( " Flushing page start=%d i=%d count=%d \n lower: %s \n upper: %s \n " , start , i , count , pageLowerBound . toString ( ) . c_str ( ) , pageUpperBound . toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
# if REDWOOD_DEBUG
2018-08-29 04:46:14 +08:00
for ( int j = start ; j < i ; + + j ) {
2019-02-21 18:46:30 +08:00
debug_printf ( " %3d: %s \n " , j , entries [ j ] . toString ( ) . c_str ( ) ) ;
if ( j > start ) {
2019-05-22 10:16:32 +08:00
//ASSERT(entries[j] > entries[j - 1]);
2019-02-21 18:46:30 +08:00
}
2018-08-29 04:46:14 +08:00
}
2019-05-22 10:16:32 +08:00
ASSERT ( pageLowerBound . key < = pageUpperBound . key ) ;
2019-02-21 18:46:30 +08:00
# endif
2018-09-19 15:32:39 +08:00
union {
BTreePage * btPage ;
uint8_t * btPageMem ;
} ;
2019-02-21 18:46:30 +08:00
int allocatedSize ;
2018-09-19 15:32:39 +08:00
if ( blockCount = = 1 ) {
Reference < IPage > page = newBlockFn ( ) ;
2019-02-21 18:46:30 +08:00
VALGRIND_MAKE_MEM_DEFINED ( page - > begin ( ) , page - > size ( ) ) ;
2018-09-19 15:32:39 +08:00
btPageMem = page - > mutate ( ) ;
2019-02-21 18:46:30 +08:00
allocatedSize = page - > size ( ) ;
pages . push_back ( { pageLowerBound , page } ) ;
2018-09-19 15:32:39 +08:00
}
else {
ASSERT ( blockCount > 1 ) ;
2019-02-21 18:46:30 +08:00
allocatedSize = usableBlockSize * blockCount ;
btPageMem = new uint8_t [ allocatedSize ] ;
VALGRIND_MAKE_MEM_DEFINED ( btPageMem , allocatedSize ) ;
2018-09-19 15:32:39 +08:00
}
btPage - > flags = newFlags ;
btPage - > kvBytes = kvBytes ;
btPage - > count = i - start ;
btPage - > extensionPageCount = blockCount - 1 ;
2019-02-21 18:46:30 +08:00
int written = btPage - > tree ( ) . build ( & entries [ start ] , & entries [ i ] , & pageLowerBound , & pageUpperBound ) ;
2018-06-12 16:43:19 +08:00
if ( written > pageSize ) {
2018-09-19 15:32:39 +08:00
fprintf ( stderr , " ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d \n " , written , pageSize , blockCount , i - start , kvBytes , compressedBytes ) ;
2018-06-12 16:43:19 +08:00
ASSERT ( false ) ;
}
2018-09-19 15:32:39 +08:00
if ( blockCount ! = 1 ) {
Reference < IPage > page = newBlockFn ( ) ;
2019-02-21 18:46:30 +08:00
VALGRIND_MAKE_MEM_DEFINED ( page - > begin ( ) , page - > size ( ) ) ;
2018-09-19 15:32:39 +08:00
const uint8_t * rptr = btPageMem ;
memcpy ( page - > mutate ( ) , rptr , usableBlockSize ) ;
rptr + = usableBlockSize ;
std : : vector < Reference < IPage > > extPages ;
for ( int b = 1 ; b < blockCount ; + + b ) {
Reference < IPage > extPage = newBlockFn ( ) ;
2019-02-21 18:46:30 +08:00
VALGRIND_MAKE_MEM_DEFINED ( page - > begin ( ) , page - > size ( ) ) ;
2018-09-19 15:32:39 +08:00
//debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usableBlockSize);
memcpy ( extPage - > mutate ( ) , rptr , usableBlockSize ) ;
rptr + = usableBlockSize ;
extPages . push_back ( std : : move ( extPage ) ) ;
}
pages . push_back ( { std : : move ( pageLowerBound ) , std : : move ( page ) , std : : move ( extPages ) } ) ;
delete btPageMem ;
2018-08-29 04:46:14 +08:00
}
2018-09-19 15:32:39 +08:00
if ( end )
break ;
2018-06-08 18:32:34 +08:00
start = i ;
kvBytes = 0 ;
2019-02-21 18:46:30 +08:00
compressedBytes = BTreePage : : BinaryTree : : GetTreeOverhead ( ) ;
2019-05-22 10:16:32 +08:00
pageLowerBound = pageUpperBound . withoutValue ( ) ;
2017-06-10 05:56:41 +08:00
}
}
2018-06-08 18:32:34 +08:00
//debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size());
return pages ;
}
2017-06-10 05:56:41 +08:00
// Function body used for interface methods that have no implementation here:
// calling such a method evaluates UNSTOPPABLE_ASSERT(false).
# define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); }
class VersionedBTree : public IVersionedStore {
public :
2018-07-15 04:37:52 +08:00
// The first possible internal record in the tree
2019-02-21 18:46:30 +08:00
static RedwoodRecordRef dbBegin ;
2018-07-15 04:37:52 +08:00
// A record which is greater than the last possible record in the tree
2019-02-21 18:46:30 +08:00
static RedwoodRecordRef dbEnd ;
2017-06-10 05:56:41 +08:00
2018-12-06 14:41:04 +08:00
struct Counts {
Counts ( ) {
memset ( this , 0 , sizeof ( Counts ) ) ;
}
void clear ( ) {
* this = Counts ( ) ;
}
2019-03-15 15:46:09 +08:00
int64_t pageReads ;
int64_t extPageReads ;
int64_t setBytes ;
2018-12-06 14:41:04 +08:00
int64_t pageWrites ;
2019-03-15 15:46:09 +08:00
int64_t extPageWrites ;
2018-12-06 14:41:04 +08:00
int64_t sets ;
int64_t clears ;
int64_t commits ;
2019-03-15 15:46:09 +08:00
int64_t gets ;
int64_t getRanges ;
int64_t commitToPage ;
int64_t commitToPageStart ;
2018-12-06 14:41:04 +08:00
2019-03-15 15:46:09 +08:00
std : : string toString ( bool clearAfter = false ) {
2019-05-29 21:23:32 +08:00
std : : string s = format ( " set=% " PRId64 " clear=% " PRId64 " get=% " PRId64 " getRange=% " PRId64 " commit=% " PRId64 " pageRead=% " PRId64 " extPageRead=% " PRId64 " pageWrite=% " PRId64 " extPageWrite=% " PRId64 " commitPage=% " PRId64 " commitPageStart=% " PRId64 " " ,
2019-03-15 15:46:09 +08:00
sets , clears , gets , getRanges , commits , pageReads , extPageReads , pageWrites , extPageWrites , commitToPage , commitToPageStart ) ;
if ( clearAfter ) {
clear ( ) ;
}
2018-12-06 14:41:04 +08:00
return s ;
}
} ;
2018-07-15 04:37:52 +08:00
2019-03-15 15:46:09 +08:00
// Using a static for metrics because a single process shouldn't normally have multiple storage engines
static Counts counts ;
2017-06-10 05:56:41 +08:00
2018-10-25 06:57:06 +08:00
// All async opts on the btree are based on pager reads, writes, and commits, so
// we can mostly forward these next few functions to the pager
2018-10-15 18:43:43 +08:00
// Forwards to the pager: returns m_pager->getError().
virtual Future < Void > getError ( ) {
return m_pager - > getError ( ) ;
}
// Forwards to the pager: returns m_pager->onClosed().
virtual Future < Void > onClosed ( ) {
return m_pager - > onClosed ( ) ;
}
2018-10-25 06:57:06 +08:00
void close_impl ( bool dispose ) {
IPager * pager = m_pager ;
delete this ;
if ( dispose )
pager - > dispose ( ) ;
else
pager - > close ( ) ;
2018-10-15 18:43:43 +08:00
}
2018-10-25 06:57:06 +08:00
// Deletes this object and dispose()s the pager, via close_impl(true).
virtual void dispose ( ) {
return close_impl ( true ) ;
2018-10-15 18:43:43 +08:00
}
// Deletes this object and close()s the pager, via close_impl(false).
virtual void close ( ) {
2018-10-25 06:57:06 +08:00
return close_impl ( false ) ;
2018-10-15 18:43:43 +08:00
}
2017-06-10 05:56:41 +08:00
// Not implemented: the body is the NOT_IMPLEMENTED macro, which asserts false when called.
virtual KeyValueStoreType getType ( ) NOT_IMPLEMENTED
// Not implemented: the body is the NOT_IMPLEMENTED macro, which asserts false when called.
virtual bool supportsMutation ( int op ) NOT_IMPLEMENTED
2018-10-25 06:57:06 +08:00
// Forwards to the pager: returns m_pager->getStorageBytes().
virtual StorageBytes getStorageBytes ( ) {
return m_pager - > getStorageBytes ( ) ;
}
2017-06-10 05:56:41 +08:00
// Writes are provided in an ordered stream.
// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
// Buffers a set of keyValue.key to keyValue.value in the current mutation buffer.
// A mutation boundary is found or created for the key and the set is recorded in
// that boundary's startKeyMutations:
//  - single-version mode: the key keeps one entry, which is created under
//    version 0 if the map is empty, otherwise the first entry is overwritten;
//  - otherwise: an entry is written under m_writeVersion unless the newest
//    existing entry is already a set to exactly the same value.
virtual void set(KeyValueRef keyValue) {
    ++counts.sets;

    SingleKeyMutationsByVersion& keyMutations = insertMutationBoundary(keyValue.key)->second.startKeyMutations;
    const bool noPriorMutations = keyMutations.empty();

    if (!singleVersion) {
        if (noPriorMutations || !keyMutations.rbegin()->second.equalToSet(keyValue.value)) {
            keyMutations[m_writeVersion] = SingleKeyMutation(keyValue.value);
        }
        return;
    }

    if (noPriorMutations) {
        keyMutations[0] = SingleKeyMutation(keyValue.value);
    }
    else {
        keyMutations.begin()->second = SingleKeyMutation(keyValue.value);
    }
}
// Buffers a clear of the key range [range.begin, range.end) in the current mutation buffer.
// Mutation boundaries are found or created for both ends of the range.
//  - single-version mode: the begin boundary is reset to a single clear at version 0
//    with rangeClearVersion 0, and every boundary strictly between begin and end is
//    erased from the buffer;
//  - otherwise: every boundary in [begin, end) gets rangeClearVersion set to
//    m_writeVersion if it has none, and a clear is recorded under m_writeVersion
//    unless its newest start-key mutation is already a clear.
virtual void clear(KeyRangeRef range) {
    ++counts.clears;
    MutationBufferT::iterator iBegin = insertMutationBoundary(range.begin);
    MutationBufferT::iterator iEnd = insertMutationBoundary(range.end);

    // An empty range maps both ends to the same boundary, so there is nothing to clear.
    // Without this check the single-version path would clear the begin key and the range
    // following it, then call erase() with its first iterator positioned after its last.
    if (iBegin == iEnd) {
        return;
    }

    if (singleVersion) {
        // In single version mode, clear all pending updates in the affected range
        RangeMutation& first = iBegin->second;
        first.startKeyMutations.clear();
        first.startKeyMutations[0] = SingleKeyMutation();
        first.rangeClearVersion = 0;
        ++iBegin;
        m_pBuffer->erase(iBegin, iEnd);
    }
    else {
        // For each boundary in the cleared range
        for (; iBegin != iEnd; ++iBegin) {
            RangeMutation& boundary = iBegin->second;

            // Set the rangeClearVersion if not set
            if (!boundary.rangeClearVersion.present()) {
                boundary.rangeClearVersion = m_writeVersion;
            }

            // Add a clear to the startKeyMutations map if it's empty or the last item is not a clear
            if (boundary.startKeyMutations.empty() || !boundary.startKeyMutations.rbegin()->second.isClear()) {
                boundary.startKeyMutations[m_writeVersion] = SingleKeyMutation();
            }
        }
    }
}
2017-08-22 13:29:57 +08:00
2017-06-10 05:56:41 +08:00
// Not implemented: the body is the NOT_IMPLEMENTED macro, which asserts false when called.
virtual void mutate ( int op , StringRef param1 , StringRef param2 ) NOT_IMPLEMENTED
// Versions [begin, end) no longer readable
// Not implemented: the body is the NOT_IMPLEMENTED macro, which asserts false when called.
virtual void forgetVersions ( Version begin , Version end ) NOT_IMPLEMENTED
// Returns the current write version if one has been set, otherwise asks the
// pager for its latest version.
virtual Future<Version> getLatestVersion() {
    if (m_writeVersion == invalidVersion) {
        return m_pager->getLatestVersion();
    }
    return m_writeVersion;
}
2017-09-23 08:18:28 +08:00
// Returns m_writeVersion, the version most recently passed to setWriteVersion()
// (invalidVersion if it has never been set since construction).
Version getWriteVersion ( ) {
return m_writeVersion ;
}
2017-09-21 19:43:49 +08:00
// Returns m_lastCommittedVersion.
Version getLastCommittedVersion ( ) {
return m_lastCommittedVersion ;
}
2019-04-30 08:00:29 +08:00
// Creates a btree on top of `pager` and starts its asynchronous initialization.
//   pager            - pager used for all page reads, writes and commits
//   name             - stored in m_name
//   singleVersion    - stored in the singleVersion member
//   target_page_size - if positive and smaller than the pager's usable page size,
//                      used as the usable page size instead
// Initializers are listed in member declaration order, which is the order in
// which members are initialized.
VersionedBTree(IPager* pager, std::string name, bool singleVersion = false, int target_page_size = -1)
  : m_pager(pager),
    m_pBuffer(nullptr),
    m_writeVersion(invalidVersion),
    m_lastCommittedVersion(invalidVersion),
    m_usablePageSizeOverride(pager->getUsablePageSize()),
    m_name(name),
    singleVersion(singleVersion)
{
    const bool useSmallerPage = target_page_size > 0 && target_page_size < m_usablePageSizeOverride;
    if (useSmallerPage) {
        m_usablePageSizeOverride = target_page_size;
    }

    // Until a commit replaces it, the latest commit future is the init future.
    m_init = init_impl(this);
    m_latestCommit = m_init;
}
2017-09-21 19:43:49 +08:00
// Asynchronous initialization, started by the constructor.
// Sets the root page ID to 0 and reads the pager's latest version. If that version is 0,
// an empty leaf page bounded by dbBegin/dbEnd is written as the root at version 1,
// the pager's latest version is set to 1, and the pager is committed.
// Finally m_lastCommittedVersion is set to the resulting version.
ACTOR static Future < Void > init_impl ( VersionedBTree * self ) {
2017-07-15 02:36:49 +08:00
self - > m_root = 0 ;
state Version latest = wait ( self - > m_pager - > getLatestVersion ( ) ) ;
// Version 0 is handled as "no root page written yet".
if ( latest = = 0 ) {
2017-08-04 06:07:29 +08:00
+ + latest ;
2017-09-21 08:50:02 +08:00
Reference < IPage > page = self - > m_pager - > newPageBuffer ( ) ;
2019-05-22 10:16:32 +08:00
makeEmptyPage ( page , BTreePage : : IS_LEAF , self - > m_usablePageSizeOverride ) ;
2019-02-21 18:46:30 +08:00
self - > writePage ( self - > m_root , page , latest , & dbBegin , & dbEnd ) ;
2017-08-04 06:07:29 +08:00
self - > m_pager - > setLatestVersion ( latest ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > m_pager - > commit ( ) ) ;
2017-07-15 02:36:49 +08:00
}
2017-08-04 06:07:29 +08:00
self - > m_lastCommittedVersion = latest ;
2017-06-10 05:56:41 +08:00
return Void ( ) ;
}
2017-09-23 08:18:28 +08:00
// Returns m_init, the future of the init_impl() call started by the constructor.
Future < Void > init ( ) { return m_init ; }
2017-06-10 05:56:41 +08:00
2017-08-22 13:29:57 +08:00
// Cancels the m_init and m_latestCommit futures. The pager is not touched here;
// close_impl() is the path that also closes or disposes the pager.
virtual ~ VersionedBTree ( ) {
2018-10-15 18:43:43 +08:00
// This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
// it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
// uncommitted writes so it should not be committed.
2017-10-02 18:32:22 +08:00
m_init . cancel ( ) ;
m_latestCommit . cancel ( ) ;
2017-08-22 13:29:57 +08:00
}
2017-06-10 05:56:41 +08:00
// readAtVersion() may only be called on a version which has previously been passed to
// setWriteVersion() and never previously passed to forgetVersion. Results are unspecified
// if this precondition is violated; the store is not required to detect violations.
// The returned cursor provides a consistent snapshot of the versioned store, corresponding
// to all writes done with write versions less than or equal to the given version.
// If called on the *current* write version, the cursor MAY reflect subsequent writes at
// that same write version, OR it may represent a snapshot as of the call to readAtVersion().
//
// This implementation can only read committed versions (v <= m_lastCommittedVersion), and
// in single-version mode only the last committed version; in that mode the cursor is
// created with record version 0.
virtual Reference<IStoreCursor> readAtVersion(Version v) {
    // TODO: Use the buffer to return uncommitted data
    ASSERT(v <= m_lastCommittedVersion);

    Version recordVersion = v;
    if (singleVersion) {
        ASSERT(v == m_lastCommittedVersion);
        recordVersion = 0;
    }

    return Reference<IStoreCursor>(
        new Cursor(m_pager->getReadSnapshot(v), m_root, recordVersion, m_usablePageSizeOverride));
}
// Sets the version that subsequent writes belong to. Must be nondecreasing, and
// must be greater than the last committed version.
// If no mutation buffer is currently open, a new one is created for version v,
// which must then be strictly greater than the previous write version.
virtual void setWriteVersion(Version v) {
    ASSERT(v > m_lastCommittedVersion);

    if (m_pBuffer != nullptr) {
        // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null
        ASSERT(v >= m_writeVersion);
    }
    else {
        // When starting a new mutation buffer its start version must be greater than the last write version
        ASSERT(v > m_writeVersion);
        MutationBufferT& freshBuffer = m_mutationBuffers[v];

        // Create a range representing the entire keyspace. This reduces edge cases when applying
        // mutations because all existing keys are then within some range in the mutation map.
        freshBuffer[dbBegin.key];
        freshBuffer[dbEnd.key];

        m_pBuffer = &freshBuffer;
    }

    m_writeVersion = v;
}
// Starts a commit through commit_impl() when a mutation buffer is open.
// When no mutation buffer is open, returns the latest commit future instead.
virtual Future<Void> commit() {
    if (m_pBuffer != nullptr) {
        return commit_impl(this);
    }
    return m_latestCommit;
}
2019-04-30 08:00:29 +08:00
// Returns the singleVersion flag given to the constructor.
bool isSingleVersion ( ) const {
return singleVersion ;
}
2017-06-10 05:56:41 +08:00
private :
2019-02-21 18:46:30 +08:00
// Writes `page` to the pager as logical page `id` at version `ver`.
// pageLowerBound and pageUpperBound are used only to print the page in the debug trace.
void writePage ( LogicalPageID id , Reference < IPage > page , Version ver , const RedwoodRecordRef * pageLowerBound , const RedwoodRecordRef * pageUpperBound ) {
2018-08-29 04:46:14 +08:00
debug_printf ( " writePage(): %s \n " , ( ( const BTreePage * ) page - > begin ( ) ) - > toString ( true , id , ver , pageLowerBound , pageUpperBound ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
m_pager - > writePage ( id , page , ver ) ;
}
LogicalPageID m_root ;
2019-06-04 19:03:52 +08:00
// A version together with a set of child records and the upper bound record that
// follows them. InternalPageBuilder::addEntries() consumes `children` and `upperBound`.
// TODO: Don't use Standalone
struct VersionedChildPageSet {
Version version ;
std : : vector < Standalone < RedwoodRecordRef > > children ;
Standalone < RedwoodRecordRef > upperBound ;
} ;
2017-08-04 15:01:25 +08:00
2019-06-04 19:03:52 +08:00
typedef std : : vector < VersionedChildPageSet > VersionedChildrenT ;
// Utility class for building a vector of internal page entries.
// Entries must be added in version order. Modified will be set to true
// if any entries differ from the original ones, which are read through `cursor`.
// Additional value-less entries will be added when necessary to reconcile
// differences between the upper and lower boundaries of consecutive entries.
struct InternalPageBuilder {
// Cursor must be at first entry in page
InternalPageBuilder ( const BTreePage : : BinaryTree : : Cursor & c )
: cursor ( c ) , modified ( false ) , childPageCount ( 0 )
{
}
// Appends rec to entries. A record with a value present is counted in childPageCount.
// While no modification has been detected, rec is also compared with the original
// page's record at the cursor.
inline void addEntry ( const RedwoodRecordRef & rec ) {
if ( rec . value . present ( ) ) {
+ + childPageCount ;
}
// If no modification detected yet then check that this record is identical to the next
// record from the original page which is at the current cursor position.
if ( ! modified ) {
if ( cursor . valid ( ) ) {
if ( ! rec . identical ( cursor . get ( ) ) ) {
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Found internal page difference. new: %s old: %s \n " , rec . toString ( ) . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
else {
cursor . moveNext ( ) ;
}
}
else {
// The original page has no more records, so rec is an addition.
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Found internal page difference. new: %s old: <end> \n " , rec . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
}
entries . push_back ( rec ) ;
}
// Adds every child record of newSet, preceded by a value-less placeholder when needed,
// and then records newSet.upperBound as lastUpperBound.
void addEntries ( const VersionedChildPageSet & newSet ) {
// If there are already entries, the last one links to a child page, and its upper bound is not the same
// as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater)
// then add the upper bound of the previous set as a value-less record so that on future reads
// the previous child page can be decoded correctly.
if ( ! entries . empty ( ) & & entries . back ( ) . value . present ( )
& & ( newSet . children . empty ( ) | | newSet . children . front ( ) ! = lastUpperBound ) )
{
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Added placeholder %s \n " , lastUpperBound . withoutValue ( ) . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
addEntry ( lastUpperBound . withoutValue ( ) ) ;
}
for ( auto & child : newSet . children ) {
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: Adding child entry %s \n " , child . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
addEntry ( child ) ;
}
2019-04-30 08:00:29 +08:00
2019-06-04 19:03:52 +08:00
lastUpperBound = newSet . upperBound ;
2019-06-04 19:55:09 +08:00
debug_printf ( " InternalPageBuilder: New upper bound: %s \n " , lastUpperBound . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
2019-06-04 19:03:52 +08:00
// Finish comparison to existing data if necessary.
// Handle possible page upper bound changes.
//   upperBound       - the new desired upper bound for this internal page
//   decodeUpperBound - compared with lastUpperBound when nothing else was modified;
//                      per the comments in the body, the bound the page was encoded with
// Any records left under the cursor mean the original page had more entries, so the
// page is marked modified.
// If the rightmost entry has a child page:
//  - an otherwise unmodified page becomes modified when lastUpperBound differs from
//    decodeUpperBound;
//  - if modified is set and lastUpperBound does not match upperBound, lastUpperBound is
//    written with no value to allow correct decoding of the rightmost entry, and
//    lastUpperBound is then set to upperBound.
// The placeholder is only added if modified is set, to avoid rewriting this page for
// this purpose only.
2019-06-06 11:58:47 +08:00
void finalize ( const RedwoodRecordRef & upperBound , const RedwoodRecordRef & decodeUpperBound ) {
debug_printf ( " InternalPageBuilder::end modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s \n " , modified , upperBound . toString ( ) . c_str ( ) , decodeUpperBound . toString ( ) . c_str ( ) , lastUpperBound . toString ( ) . c_str ( ) ) ;
modified = modified | | cursor . valid ( ) ;
debug_printf ( " InternalPageBuilder::end modified=%d after cursor check \n " , modified ) ;
// If there are boundary key entries and the last one has a child page then the
// upper bound for this internal page must match the required upper bound for
// the last child entry.
if ( ! entries . empty ( ) & & entries . back ( ) . value . present ( ) ) {
debug_printf ( " InternalPageBuilder::end last entry is not null \n " ) ;
// If the page contents were not modified so far and the upper bound required
// for the last child page (lastUpperBound) does not match what the page
// was encoded with then the page must be modified.
if ( ! modified & & lastUpperBound ! = decodeUpperBound ) {
debug_printf ( " InternalPageBuilder::end modified set true because lastUpperBound does not match decodeUpperBound \n " ) ;
2019-06-04 19:03:52 +08:00
modified = true ;
}
2019-06-06 11:58:47 +08:00
if ( modified & & lastUpperBound ! = upperBound ) {
debug_printf ( " InternalPageBuilder::end Modified is true but lastUpperBound does not match upperBound so adding placeholder \n " ) ;
2019-06-04 19:03:52 +08:00
addEntry ( lastUpperBound . withoutValue ( ) ) ;
2019-06-06 11:58:47 +08:00
lastUpperBound = upperBound ;
2019-06-04 19:03:52 +08:00
}
}
2019-06-06 11:58:47 +08:00
debug_printf ( " InternalPageBuilder::end exit. modified=%d upperBound=%s decodeUpperBound=%s lastUpperBound=%s \n " , modified , upperBound . toString ( ) . c_str ( ) , decodeUpperBound . toString ( ) . c_str ( ) , lastUpperBound . toString ( ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
}
// Position in the original page of the next record to compare against
BTreePage : : BinaryTree : : Cursor cursor ;
// Entries accumulated so far, in the order they were added
std : : vector < Standalone < RedwoodRecordRef > > entries ;
// Upper bound of the most recently added set (or upperBound after finalize() adds a placeholder)
Standalone < RedwoodRecordRef > lastUpperBound ;
// True once the accumulated entries are known to differ from the original page
bool modified ;
// Number of added entries that have a value present
int childPageCount ;
// NOTE(review): not referenced by any method of this struct visible here
Arena arena ;
} ;
// Generic overload: returns o.toString() for any type that provides it.
template < typename T >
static std : : string toString ( const T & o ) {
return o . toString ( ) ;
2019-04-30 08:00:29 +08:00
}
2019-06-06 11:58:47 +08:00
// Formats a VersionedChildPageSet: its version, its children (via the vector overload
// of toString) and its upper bound.
static std : : string toString ( const VersionedChildPageSet & c ) {
2019-06-04 19:03:52 +08:00
return format ( " Version=% " PRId64 " children=%s upperBound=%s " , c . version , toString ( c . children ) . c_str ( ) , c . upperBound . toString ( ) . c_str ( ) ) ;
}
// Formats a vector as a brace-delimited list: each element is rendered with
// toString() and followed by a comma separator.
template <typename T>
static std::string toString(const std::vector<T>& v) {
    std::string out = " { ";
    for (typename std::vector<T>::const_iterator it = v.begin(); it != v.end(); ++it) {
        out += toString(*it) + " , ";
    }
    out += " } ";
    return out;
}
2017-08-04 15:01:25 +08:00
2017-08-25 08:25:53 +08:00
// Represents a change to a single key - set, clear, or atomic op
struct SingleKeyMutation {
// Clear
SingleKeyMutation ( ) : op ( MutationRef : : ClearRange ) { }
// Set
SingleKeyMutation ( Value val ) : op ( MutationRef : : SetValue ) , value ( val ) { }
// Atomic Op
SingleKeyMutation ( MutationRef : : Type op , Value val ) : op ( op ) , value ( val ) { }
2017-08-22 13:29:57 +08:00
MutationRef : : Type op ;
2017-08-25 08:25:53 +08:00
Value value ;
2017-08-04 15:01:25 +08:00
2017-08-22 13:29:57 +08:00
inline bool isClear ( ) const { return op = = MutationRef : : ClearRange ; }
inline bool isSet ( ) const { return op = = MutationRef : : SetValue ; }
2017-08-25 08:25:53 +08:00
inline bool isAtomicOp ( ) const { return ! isSet ( ) & & ! isClear ( ) ; }
inline bool equalToSet ( ValueRef val ) { return isSet ( ) & & value = = val ; }
2019-02-21 18:46:30 +08:00
inline RedwoodRecordRef toRecord ( KeyRef userKey , Version version ) const {
2017-09-06 07:59:31 +08:00
// No point in serializing an atomic op, it needs to be coalesced to a real value.
2017-08-25 08:25:53 +08:00
ASSERT ( ! isAtomicOp ( ) ) ;
if ( isClear ( ) )
2019-02-21 18:46:30 +08:00
return RedwoodRecordRef ( userKey , version ) ;
2017-08-22 13:29:57 +08:00
2019-02-21 18:46:30 +08:00
return RedwoodRecordRef ( userKey , version , value ) ;
2017-08-25 08:25:53 +08:00
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
std : : string toString ( ) const {
2017-08-26 06:48:32 +08:00
return format ( " op=%d val='%s' " , op , printable ( value ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-04 15:01:25 +08:00
} ;
2017-08-25 08:25:53 +08:00
// Represents mutations on a single key and a possible clear to a range that begins
// immediately after that key
typedef std : : map < Version , SingleKeyMutation > SingleKeyMutationsByVersion ;
struct RangeMutation {
// Mutations for exactly the start key
SingleKeyMutationsByVersion startKeyMutations ;
// A clear range version, if cleared, for the range starting immediately AFTER the start key
Optional < Version > rangeClearVersion ;
// Returns true if this RangeMutation doesn't actually mutate anything
bool noChanges ( ) const {
return ! rangeClearVersion . present ( ) & & startKeyMutations . empty ( ) ;
}
std : : string toString ( ) const {
std : : string result ;
result . append ( " rangeClearVersion: " ) ;
if ( rangeClearVersion . present ( ) )
2019-05-29 21:23:32 +08:00
result . append ( format ( " % " PRId64 " " , rangeClearVersion . get ( ) ) ) ;
2017-08-25 08:25:53 +08:00
else
result . append ( " <not present> " ) ;
result . append ( " startKeyMutations: " ) ;
for ( SingleKeyMutationsByVersion : : value_type const & m : startKeyMutations )
2019-05-29 21:23:32 +08:00
result . append ( format ( " [% " PRId64 " => %s] " , m . first , m . second . toString ( ) . c_str ( ) ) ) ;
2017-08-25 08:25:53 +08:00
return result ;
}
} ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
typedef std : : map < Key , RangeMutation > MutationBufferT ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
/* Mutation Buffer Overview
2017-08-22 13:29:57 +08:00
*
2019-04-30 08:00:29 +08:00
* This structure ' s organization is meant to put pending updates for the btree in an order
* that makes it efficient to query all pending mutations across all pending versions which are
* relevant to a particular subtree of the btree .
*
* At the top level , it is a map of the start of a range being modified to a RangeMutation .
* The end of the range is the next key in the map (which is also the start of the next range).
*
2018-07-15 04:37:52 +08:00
* - The buffer starts out with keys dbBegin.key and dbEnd.key already populated (see setWriteVersion()).
2017-08-25 08:25:53 +08:00
*
* - When a new key is inserted into the buffer map , it is by definition
* splitting an existing range so it should take on the rangeClearVersion of
* the immediately preceding key which is the start of that range
2017-08-22 13:29:57 +08:00
*
* - Keys are inserted into the buffer map for every individual operation ( set / clear / atomic )
* key and for both the start and end of a range clear .
2017-08-25 08:25:53 +08:00
*
2017-08-22 13:29:57 +08:00
* - To apply a single clear , add it to the individual ops only if the last entry is not also a clear .
*
2017-08-25 08:25:53 +08:00
* - To apply a range clear , after inserting the new range boundaries do the following to the start
* boundary and all successive boundaries < end
* - set the range clear version if not already set
* - add a clear to the startKeyMutations if the final entry is not a clear .
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* - Note that there are actually TWO valid ways to represent
* set c = val1 at version 1
* clear c \ x00 to z at version 2
* with this model . Either
* c = { rangeClearVersion = 2 , startKeyMutations = { 1 = > val1 }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
* OR
* c = { rangeClearVersion = < not present > , startKeyMutations = { 1 = > val1 }
* c \ x00 = { rangeClearVersion = 2 , startKeyMutations = { 2 = > < not present > }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* This is because the rangeClearVersion applies to a range beginning with the first
* key AFTER the start key , so that the logic for reading the start key is more simple
* as it only involves consulting startKeyMutations . When adding a clear range , the
* boundary key insert / split described above is valid , and is what is currently done ,
* but it would also be valid to see if the last key before startKey is equal to
* keyBefore ( startKey ) , and if so that mutation buffer boundary key can be used instead
* without adding an additional key to the buffer .
2019-04-30 08:00:29 +08:00
* TODO : A possible optimization here could be to only use existing btree leaf page boundaries as keys ,
* with mutation point keys being stored in an unsorted structure under those boundary map keys,
* to be sorted later just before being merged into the existing leaf page .
*/
2017-08-22 13:29:57 +08:00
2018-10-15 18:43:43 +08:00
IPager * m_pager ;
MutationBufferT * m_pBuffer ;
std : : map < Version , MutationBufferT > m_mutationBuffers ;
Version m_writeVersion ;
Version m_lastCommittedVersion ;
Future < Void > m_latestCommit ;
int m_usablePageSizeOverride ;
Future < Void > m_init ;
std : : string m_name ;
2019-04-30 08:00:29 +08:00
bool singleVersion ;
2018-10-15 18:43:43 +08:00
2017-08-28 16:57:01 +08:00
void printMutationBuffer ( MutationBufferT : : const_iterator begin , MutationBufferT : : const_iterator end ) const {
2017-08-25 08:25:53 +08:00
# if REDWOOD_DEBUG
debug_printf ( " ------------------------------------- \n " ) ;
debug_printf ( " BUFFER \n " ) ;
while ( begin ! = end ) {
debug_printf ( " '%s': %s \n " , printable ( begin - > first ) . c_str ( ) , begin - > second . toString ( ) . c_str ( ) ) ;
+ + begin ;
}
debug_printf ( " ------------------------------------- \n " ) ;
# endif
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Convenience overload: prints the whole buffer by forwarding its begin() and end().
void printMutationBuffer ( MutationBufferT * buf ) const {
return printMutationBuffer ( buf - > begin ( ) , buf - > end ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Find or create a mutation buffer boundary for bound and return an iterator to it
2017-08-25 08:25:53 +08:00
MutationBufferT : : iterator insertMutationBoundary ( Key boundary ) {
2017-09-23 08:18:28 +08:00
ASSERT ( m_pBuffer ! = nullptr ) ;
2017-08-25 08:25:53 +08:00
// Find the first split point in buffer that is >= key
2017-09-23 08:18:28 +08:00
MutationBufferT : : iterator ib = m_pBuffer - > lower_bound ( boundary ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Since the initial state of the mutation buffer contains the range '' through
// the maximum possible key, our search had to have found something.
2017-09-23 08:18:28 +08:00
ASSERT ( ib ! = m_pBuffer - > end ( ) ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// If we found the boundary we are looking for, return its iterator
if ( ib - > first = = boundary )
return ib ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// ib is our insert hint. Insert the new boundary and set ib to its entry
2017-09-23 08:18:28 +08:00
ib = m_pBuffer - > insert ( ib , { boundary , RangeMutation ( ) } ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// ib is certainly > begin() because it is guaranteed that the empty string
// boundary exists and the only way to have found that is to look explicitly
// for it in which case we would have returned above.
MutationBufferT : : iterator iPrevious = ib ;
2017-08-26 06:48:32 +08:00
- - iPrevious ;
2017-08-28 16:57:01 +08:00
if ( iPrevious - > second . rangeClearVersion . present ( ) ) {
ib - > second . rangeClearVersion = iPrevious - > second . rangeClearVersion ;
ib - > second . startKeyMutations [ iPrevious - > second . rangeClearVersion . get ( ) ] = SingleKeyMutation ( ) ;
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
return ib ;
2017-08-22 13:29:57 +08:00
}
2017-07-26 07:10:19 +08:00
2018-09-19 15:32:39 +08:00
void buildNewRoot ( Version version , std : : vector < BoundaryAndPage > & pages , std : : vector < LogicalPageID > & logicalPageIDs , const BTreePage * pPage ) {
2018-08-29 04:46:14 +08:00
//debug_printf("buildNewRoot start %lu\n", pages.size());
2017-07-14 02:32:14 +08:00
// While there are multiple child pages for this version we must write new tree levels.
while ( pages . size ( ) > 1 ) {
2019-02-21 18:46:30 +08:00
std : : vector < RedwoodRecordRef > childEntries ;
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
2019-06-04 19:03:52 +08:00
RedwoodRecordRef entry = pages [ i ] . lowerBound . withPageID ( logicalPageIDs [ i ] ) ;
debug_printf ( " Added new root entry %s \n " , entry . toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
childEntries . push_back ( entry ) ;
}
2017-07-14 02:32:14 +08:00
2019-02-21 18:46:30 +08:00
pages = buildPages ( false , dbBegin , dbEnd , childEntries , 0 , [ = ] ( ) { return m_pager - > newPageBuffer ( ) ; } , m_usablePageSizeOverride ) ;
2017-07-14 02:32:14 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " Writing a new root level at version % " PRId64 " with %lu children across %lu pages \n " , version , childEntries . size ( ) , pages . size ( ) ) ;
2017-07-14 02:32:14 +08:00
2019-02-21 18:46:30 +08:00
logicalPageIDs = writePages ( pages , version , m_root , pPage , & dbEnd , nullptr ) ;
2018-09-19 15:32:39 +08:00
}
}
2019-02-21 18:46:30 +08:00
std : : vector < LogicalPageID > writePages ( std : : vector < BoundaryAndPage > pages , Version version , LogicalPageID originalID , const BTreePage * originalPage , const RedwoodRecordRef * upperBound , void * actor_debug ) {
2019-05-29 21:23:32 +08:00
debug_printf ( " %p: writePages(): %u @% " PRId64 " -> %lu replacement pages \n " , actor_debug , originalID , version , pages . size ( ) ) ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
ASSERT ( version ! = 0 | | pages . size ( ) = = 1 ) ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
std : : vector < LogicalPageID > primaryLogicalPageIDs ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
// Reuse original primary page ID if it's not the root or if only one page is being written.
if ( originalID ! = m_root | | pages . size ( ) = = 1 )
primaryLogicalPageIDs . push_back ( originalID ) ;
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
// Allocate a primary page ID for each page to be written
while ( primaryLogicalPageIDs . size ( ) < pages . size ( ) ) {
primaryLogicalPageIDs . push_back ( m_pager - > allocateLogicalPage ( ) ) ;
}
2018-09-19 15:32:39 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " %p: writePages(): Writing %lu replacement pages for %d at version % " PRId64 " \n " , actor_debug , pages . size ( ) , originalID , version ) ;
2018-09-19 15:32:39 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
2018-12-06 14:41:04 +08:00
+ + counts . pageWrites ;
2018-09-19 15:32:39 +08:00
// Allocate page number for main page first
2018-10-02 07:51:57 +08:00
LogicalPageID id = primaryLogicalPageIDs [ i ] ;
2018-09-19 15:32:39 +08:00
// Check for extension pages, if they exist assign IDs for them and write them at version
2018-10-02 07:51:57 +08:00
auto const & extPages = pages [ i ] . extPages ;
// If there are extension pages, write all pages using pager directly because this->writePage() is for whole primary pages
if ( extPages . size ( ) ! = 0 ) {
BTreePage * newPage = ( BTreePage * ) pages [ i ] . firstPage - > mutate ( ) ;
2018-09-19 15:32:39 +08:00
ASSERT ( newPage - > extensionPageCount = = extPages . size ( ) ) ;
for ( int e = 0 , eEnd = extPages . size ( ) ; e < eEnd ; + + e ) {
2018-10-02 07:51:57 +08:00
LogicalPageID eid = m_pager - > allocateLogicalPage ( ) ;
2019-06-06 11:58:47 +08:00
debug_printf ( " %p: writePages(): Writing extension page op=write id=%u @% " PRId64 " (%d of %lu) referencePageID=%u \n " , actor_debug , eid , version , e + 1 , extPages . size ( ) , id ) ;
2019-07-02 15:58:43 +08:00
newPage - > extensionPages ( ) [ e ] = bigEndian32 ( eid ) ;
2018-09-19 15:32:39 +08:00
// If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID
m_pager - > writePage ( eid , extPages [ e ] , version , ( version = = 0 ) ? id : invalidLogicalPageID ) ;
2019-03-15 15:46:09 +08:00
+ + counts . extPageWrites ;
2018-09-19 15:32:39 +08:00
}
2019-05-29 21:23:32 +08:00
debug_printf ( " %p: writePages(): Writing primary page op=write id=%u @% " PRId64 " (+%lu extension pages) \n " , actor_debug , id , version , extPages . size ( ) ) ;
2018-09-19 15:32:39 +08:00
m_pager - > writePage ( id , pages [ i ] . firstPage , version ) ;
}
else {
2019-05-29 21:23:32 +08:00
debug_printf ( " %p: writePages(): Writing normal page op=write id=%u @% " PRId64 " \n " , actor_debug , id , version ) ;
2019-02-21 18:46:30 +08:00
writePage ( id , pages [ i ] . firstPage , version , & pages [ i ] . lowerBound , ( i = = pages . size ( ) - 1 ) ? upperBound : & pages [ i + 1 ] . lowerBound ) ;
2018-09-19 15:32:39 +08:00
}
}
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
// Free the old extension pages now that all replacement pages have been written
for ( int i = 0 ; i < originalPage - > extensionPageCount ; + + i ) {
2019-07-02 15:58:43 +08:00
//debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i]));
//m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version);
2018-09-19 15:32:39 +08:00
}
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
return primaryLogicalPageIDs ;
2018-09-19 15:32:39 +08:00
}
// An IPage whose contents are the concatenation of several pages.
// The constructor copies the first usablePageSize bytes of each given page, in order, into a
// single heap buffer of pages.size() * usablePageSize bytes which this object owns and
// releases in its destructor.
class SuperPage : public IPage, ReferenceCounted<SuperPage> {
public:
	// pages          - source pages, copied in order
	// usablePageSize - number of bytes taken from the start of each source page
	SuperPage(std::vector<Reference<const IPage>> pages, int usablePageSize)
	  : m_size(pages.size() * usablePageSize) {
		m_data = new uint8_t[m_size];
		uint8_t *wptr = m_data;
		for(auto &p : pages) {
			memcpy(wptr, p->begin(), usablePageSize);
			wptr += usablePageSize;
		}
	}

	virtual ~SuperPage() {
		// m_data was allocated with new[], so it must be released with delete[].
		// Using scalar delete here is undefined behavior.
		delete[] m_data;
	}

	virtual void addref() const {
		ReferenceCounted<SuperPage>::addref();
	}

	virtual void delref() const {
		ReferenceCounted<SuperPage>::delref();
	}

	// Total size in bytes of the concatenated buffer
	virtual int size() const {
		return m_size;
	}

	virtual uint8_t const* begin() const {
		return m_data;
	}

	virtual uint8_t* mutate() {
		return m_data;
	}

private:
	uint8_t *m_data;   // owned buffer of m_size bytes
	const int m_size;
};
2019-02-21 18:46:30 +08:00
// Reads logical page 'id' from 'snapshot' and returns it ready for use as a BTreePage.
// If the page declares extension pages, they are all requested before being waited on and the
// primary page plus its extensions are combined into one SuperPage.  If the resulting page has
// no userData yet, a BTreePage::BinaryTree::Reader built with the given bounds is attached to it.
//   usablePageSize         - bytes taken from each page when building a SuperPage
//   lowerBound, upperBound - bounds given to the Reader; also used in debug output
ACTOR static Future<Reference<const IPage>> readPage(Reference<IPagerSnapshot> snapshot, LogicalPageID id, int usablePageSize, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) {
	debug_printf(" readPage() op=read id=%u @% " PRId64 " lower=%s upper=%s \n ", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
	wait(delay(0, TaskPriority::DiskRead));

	state Reference<const IPage> result = wait(snapshot->getPhysicalPage(id));
	++counts.pageReads;
	state const BTreePage *pTreePage = (const BTreePage *)result->begin();

	if(pTreePage->extensionPageCount != 0) {
		// The primary page goes first, followed by one read per extension page
		std::vector<Future<Reference<const IPage>>> pendingPages;
		pendingPages.push_back(std::move(result));

		for(int ext = 0; ext < pTreePage->extensionPageCount; ++ext) {
			debug_printf(" readPage() Reading extension page op=read id=%u @% " PRId64 " ext=%d/%d \n ", bigEndian32(pTreePage->extensionPages()[ext]), snapshot->getVersion(), ext + 1, (int)pTreePage->extensionPageCount);
			pendingPages.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages()[ext])));
		}

		std::vector<Reference<const IPage>> loadedPages = wait(getAll(pendingPages));
		counts.extPageReads += pTreePage->extensionPageCount;

		// Replace the result with the combined page and re-point the BTreePage view at it
		result = Reference<const IPage>(new SuperPage(loadedPages, usablePageSize));
		pTreePage = (const BTreePage *)result->begin();
	}
	else {
		debug_printf(" readPage() Found normal page for op=read id=%u @% " PRId64 " \n ", id, snapshot->getVersion());
	}

	// Attach a Reader to the page only if it does not already carry userData
	if(result->userData == nullptr) {
		debug_printf(" readPage() Creating Reader for PageID=%u @% " PRId64 " lower=%s upper=%s \n ", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str());
		result->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound);
		result->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; };
	}

	debug_printf(" readPage() %s \n ", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str());

	// Nothing should attempt to read bytes in the page outside the BTreePage structure
	VALGRIND_MAKE_MEM_UNDEFINED(result->begin() + pTreePage->size(), result->size() - pTreePage->size());

	return result;
}
2017-07-05 14:41:48 +08:00
// Returns list of (version, list of (lower_bound, list of children) )
2019-02-21 18:46:30 +08:00
// TODO: Probably should pass prev/next records by pointer in many places
2019-05-22 10:16:32 +08:00
ACTOR static Future < VersionedChildrenT > commitSubtree ( VersionedBTree * self , MutationBufferT * mutationBuffer , Reference < IPagerSnapshot > snapshot , LogicalPageID root , const RedwoodRecordRef * lowerBound , const RedwoodRecordRef * upperBound , const RedwoodRecordRef * decodeLowerBound , const RedwoodRecordRef * decodeUpperBound ) {
2019-06-04 19:03:52 +08:00
state std : : string context ;
if ( REDWOOD_DEBUG ) {
context = format ( " CommitSubtree(root=%u): " , root ) ;
}
2017-07-05 14:41:48 +08:00
2019-06-04 19:03:52 +08:00
debug_printf ( " %s root=%d lower=%s upper=%s \n " , context . c_str ( ) , root , lowerBound - > toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
debug_printf ( " %s root=%d decodeLower=%s decodeUpper=%s \n " , context . c_str ( ) , root , decodeLowerBound - > toString ( ) . c_str ( ) , decodeUpperBound - > toString ( ) . c_str ( ) ) ;
2019-03-15 15:46:09 +08:00
self - > counts . commitToPageStart + + ;
2017-08-25 08:25:53 +08:00
2019-05-22 10:16:32 +08:00
// If a boundary changed, the page must be rewritten regardless of KV mutations
state bool boundaryChanged = ( lowerBound ! = decodeLowerBound ) | | ( upperBound ! = decodeUpperBound ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s id=%u boundaryChanged=%d \n " , context . c_str ( ) , root , boundaryChanged ) ;
2017-08-25 08:25:53 +08:00
// Find the slice of the mutation buffer that is relevant to this subtree
2019-03-15 15:46:09 +08:00
// TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key while iterating
2019-05-22 10:16:32 +08:00
state MutationBufferT : : const_iterator iMutationBoundary = mutationBuffer - > upper_bound ( lowerBound - > key ) ;
- - iMutationBoundary ;
2019-03-15 15:46:09 +08:00
state MutationBufferT : : const_iterator iMutationBoundaryEnd = mutationBuffer - > lower_bound ( upperBound - > key ) ;
2017-08-25 08:25:53 +08:00
2019-05-22 10:16:32 +08:00
if ( REDWOOD_DEBUG ) {
self - > printMutationBuffer ( iMutationBoundary , iMutationBoundaryEnd ) ;
2017-06-10 05:56:41 +08:00
}
2019-05-22 10:16:32 +08:00
// If the boundary range iterators are the same then upperbound and lowerbound have the same key.
// If the key is being mutated, then remove this subtree.
if ( iMutationBoundary = = iMutationBoundaryEnd ) {
if ( ! iMutationBoundary - > second . startKeyMutations . empty ( ) ) {
VersionedChildrenT c ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s id=%u lower and upper bound key/version match and key is modified so deleting page, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
2019-05-22 10:16:32 +08:00
return c ;
2017-08-25 08:25:53 +08:00
}
2019-05-22 10:16:32 +08:00
// If there are no forced boundary changes then this subtree is unchanged.
if ( ! boundaryChanged ) {
2019-06-04 19:03:52 +08:00
VersionedChildrenT c ( { { 0 , { * decodeLowerBound } , * decodeUpperBound } } ) ;
debug_printf ( " %s id=%d page contains a single key '%s' which is not changing, returning %s \n " , context . c_str ( ) , root , lowerBound - > key . toString ( ) . c_str ( ) , toString ( c ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
return c ;
2017-08-25 08:25:53 +08:00
}
}
2017-08-29 08:26:53 +08:00
2017-08-25 08:25:53 +08:00
// Another way to have no mutations is to have a single mutation range cover this
// subtree but have no changes in it
MutationBufferT : : const_iterator iMutationBoundaryNext = iMutationBoundary ;
+ + iMutationBoundaryNext ;
2019-05-22 10:16:32 +08:00
if ( ! boundaryChanged & & iMutationBoundaryNext = = iMutationBoundaryEnd & &
2019-03-15 15:46:09 +08:00
( iMutationBoundary - > second . noChanges ( ) | |
( ! iMutationBoundary - > second . rangeClearVersion . present ( ) & &
iMutationBoundary - > first < lowerBound - > key )
)
) {
2019-06-04 19:03:52 +08:00
VersionedChildrenT c ( { { 0 , { * decodeLowerBound } , * decodeUpperBound } } ) ;
debug_printf ( " %s no changes because sole mutation range was not cleared, returning %s \n " , context . c_str ( ) , toString ( c ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
return c ;
2017-08-25 08:25:53 +08:00
}
2019-03-15 15:46:09 +08:00
self - > counts . commitToPage + + ;
2019-05-22 10:16:32 +08:00
state Reference < const IPage > rawPage = wait ( readPage ( snapshot , root , self - > m_usablePageSizeOverride , decodeLowerBound , decodeUpperBound ) ) ;
2018-09-19 15:32:39 +08:00
state BTreePage * page = ( BTreePage * ) rawPage - > begin ( ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s commitSubtree(): %s \n " , context . c_str ( ) , page - > toString ( false , root , snapshot - > getVersion ( ) , decodeLowerBound , decodeUpperBound ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
2019-06-04 19:03:52 +08:00
state BTreePage : : BinaryTree : : Cursor cursor = getReader ( rawPage ) - > getCursor ( ) ;
2019-02-21 18:46:30 +08:00
cursor . moveFirst ( ) ;
2017-06-10 05:56:41 +08:00
2018-08-29 04:46:14 +08:00
// Leaf Page
2018-06-08 18:32:34 +08:00
if ( page - > flags & BTreePage : : IS_LEAF ) {
2017-06-10 05:56:41 +08:00
VersionedChildrenT results ;
2019-02-21 18:46:30 +08:00
std : : vector < RedwoodRecordRef > merged ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
debug_printf ( " %s id=%u MERGING EXISTING DATA WITH MUTATIONS: \n " , context . c_str ( ) , root ) ;
2019-03-15 15:46:09 +08:00
if ( REDWOOD_DEBUG ) {
self - > printMutationBuffer ( iMutationBoundary , iMutationBoundaryEnd ) ;
}
2017-08-26 06:48:32 +08:00
2017-08-22 13:29:57 +08:00
// It's a given that the mutation map is not empty so it's safe to do this
2017-08-25 08:25:53 +08:00
Key mutationRangeStart = iMutationBoundary - > first ;
2017-08-22 13:29:57 +08:00
// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
2017-08-28 16:57:01 +08:00
Version minVersion = invalidVersion ;
2019-03-15 15:46:09 +08:00
int changes = 0 ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Now, process each mutation range and merge changes with existing data.
while ( iMutationBoundary ! = iMutationBoundaryEnd ) {
2019-06-04 19:03:52 +08:00
debug_printf ( " %s New mutation boundary: '%s': %s \n " , context . c_str ( ) , printable ( iMutationBoundary - > first ) . c_str ( ) , iMutationBoundary - > second . toString ( ) . c_str ( ) ) ;
2017-08-23 02:30:44 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutations ;
2017-08-22 13:29:57 +08:00
2017-08-28 21:28:49 +08:00
// If the mutation boundary key is less than the lower bound key then skip startKeyMutations for
// this boundary, we're only processing this mutation range here to apply any clears to existing data.
2019-04-30 08:00:29 +08:00
if ( iMutationBoundary - > first < lowerBound - > key ) {
2017-08-28 21:28:49 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2019-04-30 08:00:29 +08:00
}
2017-08-25 08:25:53 +08:00
// If the mutation boundary key is the same as the page lowerBound key then start reading single
2019-02-21 18:46:30 +08:00
// key mutations at the first version greater than the lowerBound key's version.
2019-04-30 08:00:29 +08:00
else if ( ! self - > singleVersion & & iMutationBoundary - > first = = lowerBound - > key ) {
2019-02-21 18:46:30 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . upper_bound ( lowerBound - > version ) ;
2019-04-30 08:00:29 +08:00
}
else {
2017-08-25 08:25:53 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . begin ( ) ;
2019-04-30 08:00:29 +08:00
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutationsEnd = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2019-04-30 08:00:29 +08:00
// Iterate over old versions of the mutation boundary key, outputting if necessary
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) & & cursor . get ( ) . key = = iMutationBoundary - > first ) {
2019-04-30 08:00:29 +08:00
// If not in single version mode or there were no changes to the key
if ( ! self - > singleVersion | | iMutationBoundary - > second . noChanges ( ) ) {
merged . push_back ( cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, boundary start] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
else {
ASSERT ( self - > singleVersion ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Skipped %s [existing, boundary start, singleVersion mode] \n " , context . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
minVersion = 0 ;
}
2019-02-21 18:46:30 +08:00
cursor . moveNext ( ) ;
2017-08-25 08:25:53 +08:00
}
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
// TODO: If a mutation set is equal to the previous existing value of the key, maybe don't write it.
2017-08-25 08:25:53 +08:00
// Output mutations for the mutation boundary start key
2017-08-22 13:29:57 +08:00
while ( iMutations ! = iMutationsEnd ) {
2017-09-06 07:59:31 +08:00
const SingleKeyMutation & m = iMutations - > second ;
2018-09-19 15:32:39 +08:00
int maxPartSize = std : : min ( 255 , self - > m_usablePageSizeOverride / 5 ) ;
2017-09-20 04:03:30 +08:00
if ( m . isClear ( ) | | m . value . size ( ) < = maxPartSize ) {
2017-09-16 08:27:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2019-03-15 15:46:09 +08:00
+ + changes ;
2019-05-22 10:16:32 +08:00
merged . push_back ( m . toRecord ( iMutationBoundary - > first , iMutations - > first ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added non-split %s [mutation, boundary start] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
else {
2018-07-23 18:09:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2019-03-15 15:46:09 +08:00
+ + changes ;
2017-09-06 07:59:31 +08:00
int bytesLeft = m . value . size ( ) ;
2018-07-23 18:09:13 +08:00
int start = 0 ;
2019-02-21 18:46:30 +08:00
RedwoodRecordRef whole ( iMutationBoundary - > first , iMutations - > first , m . value ) ;
2017-09-06 07:59:31 +08:00
while ( bytesLeft > 0 ) {
int partSize = std : : min ( bytesLeft , maxPartSize ) ;
2018-07-23 18:09:13 +08:00
// Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it
2019-02-21 18:46:30 +08:00
merged . push_back ( whole . split ( start , partSize ) ) ;
2018-07-25 17:29:17 +08:00
bytesLeft - = partSize ;
start + = partSize ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added split %s [mutation, boundary start] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
}
2017-08-22 13:29:57 +08:00
+ + iMutations ;
}
2017-06-10 05:56:41 +08:00
2017-08-25 08:25:53 +08:00
// Get the clear version for this range, which is the last thing that we need from it,
Optional < Version > clearRangeVersion = iMutationBoundary - > second . rangeClearVersion ;
// Advance to the next boundary because we need to know the end key for the current range.
+ + iMutationBoundary ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Mutation range end: '%s' \n " , context . c_str ( ) , printable ( iMutationBoundary - > first ) . c_str ( ) ) ;
2017-08-29 08:26:53 +08:00
2017-08-25 08:25:53 +08:00
// Write existing keys which are less than the next mutation boundary key, clearing if needed.
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) & & cursor . get ( ) . key < iMutationBoundary - > first ) {
2019-04-30 08:00:29 +08:00
// TODO: Remove old versions that are too old
2019-02-21 18:46:30 +08:00
2019-04-30 08:00:29 +08:00
bool remove = self - > singleVersion & & clearRangeVersion . present ( ) ;
if ( ! remove ) {
merged . push_back ( cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, middle] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
else {
ASSERT ( self - > singleVersion ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Skipped %s [existing, boundary start, singleVersion mode] \n " , context . c_str ( ) , cursor . get ( ) . toString ( ) . c_str ( ) ) ;
2017-08-28 16:57:01 +08:00
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
2017-08-22 13:29:57 +08:00
}
2017-08-25 08:25:53 +08:00
2019-04-30 08:00:29 +08:00
// If keeping version history, write clears for records that exist in this range if the range was cleared
if ( ! self - > singleVersion ) {
// Write a clear of this key if needed. A clear is required if clearRangeVersion is set and the next cursor
// key is different than the current one. If the last cursor key in the page is different from the
// first key in the right sibling page then the page's upper bound will reflect that.
auto nextCursor = cursor ;
nextCursor . moveNext ( ) ;
if ( clearRangeVersion . present ( ) & & cursor . get ( ) . key ! = nextCursor . getOrUpperBound ( ) . key ) {
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
+ + changes ;
merged . push_back ( RedwoodRecordRef ( cursor . get ( ) . key , clearVersion ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, middle clear] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
cursor = nextCursor ;
}
else {
cursor . moveNext ( ) ;
}
2017-08-22 13:29:57 +08:00
}
2017-08-26 06:48:32 +08:00
}
2017-06-10 05:56:41 +08:00
2017-08-26 06:48:32 +08:00
// Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range.
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) ) {
merged . push_back ( cursor . get ( ) ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Added %s [existing, tail] \n " , context . c_str ( ) , merged . back ( ) . toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
cursor . moveNext ( ) ;
2017-06-10 05:56:41 +08:00
}
2017-08-25 08:25:53 +08:00
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Done merging mutations into existing leaf contents, made %d changes \n " , context . c_str ( ) , changes ) ;
2017-08-28 18:53:29 +08:00
2019-03-15 15:46:09 +08:00
// No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records.
2019-06-04 19:03:52 +08:00
// But if a boundary was changed then we must rewrite the page anyway.
2019-05-22 10:16:32 +08:00
if ( ! boundaryChanged & & minVersion = = invalidVersion ) {
2019-06-04 19:03:52 +08:00
VersionedChildrenT c ( { { 0 , { * decodeLowerBound } , * decodeUpperBound } } ) ;
debug_printf ( " %s No changes were made during mutation merge, returning %s \n " , context . c_str ( ) , toString ( c ) . c_str ( ) ) ;
2019-03-15 15:46:09 +08:00
ASSERT ( changes = = 0 ) ;
2019-04-30 08:00:29 +08:00
return c ;
2017-08-28 18:53:29 +08:00
}
2017-08-28 16:57:01 +08:00
2018-09-19 15:32:39 +08:00
// TODO: Make version and key splits based on contents of merged list, if keeping history
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
// If everything in the page was deleted then this page should be deleted as of the new version
// Note that if a single range clear covered the entire page then we should not get this far
2019-05-22 10:16:32 +08:00
if ( merged . empty ( ) & & root ! = 0 ) {
2019-04-30 08:00:29 +08:00
// TODO: For multi version mode only delete this page as of the new version
VersionedChildrenT c ( { } ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s id=%u All leaf page contents were cleared, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
return c ;
}
2017-06-10 05:56:41 +08:00
IPager * pager = self - > m_pager ;
2019-02-21 18:46:30 +08:00
std : : vector < BoundaryAndPage > pages = buildPages ( true , * lowerBound , * upperBound , merged , BTreePage : : IS_LEAF , [ pager ] ( ) { return pager - > newPageBuffer ( ) ; } , self - > m_usablePageSizeOverride ) ;
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
if ( ! self - > singleVersion ) {
2019-06-04 19:03:52 +08:00
ASSERT ( false ) ;
// // If there isn't still just a single page of data then this page became too large and was split.
// // The new split pages will be valid as of minVersion, but the old page remains valid at the old version
// if(pages.size() != 1) {
// results.push_back( {0, {*decodeLowerBound}, ??} );
// debug_printf("%s Added versioned child set #1: %s\n", context.c_str(), toString(results.back()).c_str());
// }
// else {
// // The page was updated but not size-split or version-split so the last page version's data
// // can be replaced with the new page contents
// if(pages.size() == 1)
// minVersion = 0;
// }
2017-07-05 14:41:48 +08:00
}
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// Write page(s), get new page IDs
2019-04-30 08:00:29 +08:00
Version writeVersion = self - > singleVersion ? self - > getLastCommittedVersion ( ) + 1 : minVersion ;
std : : vector < LogicalPageID > newPageIDs = self - > writePages ( pages , writeVersion , root , page , upperBound , THIS ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until we're returning a single page
2018-09-19 15:32:39 +08:00
if ( root = = self - > m_root & & pages . size ( ) > 1 ) {
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Building new root \n " , context . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
self - > buildNewRoot ( writeVersion , pages , newPageIDs , page ) ;
2017-08-28 21:28:49 +08:00
}
2017-07-14 02:32:14 +08:00
2019-06-04 19:03:52 +08:00
results . push_back ( { writeVersion , { } , * upperBound } ) ;
2017-06-10 05:56:41 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
2017-08-28 16:57:01 +08:00
// The lower bound of the first page is the lower bound of the subtree, not the first entry in the page
2019-02-21 18:46:30 +08:00
const RedwoodRecordRef & lower = ( i = = 0 ) ? * lowerBound : pages [ i ] . lowerBound ;
2019-06-04 19:03:52 +08:00
RedwoodRecordRef entry = lower . withPageID ( newPageIDs [ i ] ) ;
debug_printf ( " %s Adding child page link: %s \n " , context . c_str ( ) , entry . toString ( ) . c_str ( ) ) ;
results . back ( ) . children . push_back ( entry ) ;
2017-06-10 05:56:41 +08:00
}
2019-06-04 19:03:52 +08:00
debug_printf ( " %s Merge complete, returning %s \n " , context . c_str ( ) , toString ( results ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
debug_printf ( " %s DONE. \n " , context . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
return results ;
}
else {
2018-08-29 04:46:14 +08:00
// Internal Page
2019-05-22 10:16:32 +08:00
// TODO: Combine these into one vector and/or do something more elegant
2018-06-08 18:32:34 +08:00
state std : : vector < Future < VersionedChildrenT > > futureChildren ;
2018-06-15 08:52:25 +08:00
bool first = true ;
2019-02-21 18:46:30 +08:00
while ( cursor . valid ( ) ) {
// The lower bound for the first child is the lowerBound arg
const RedwoodRecordRef & childLowerBound = first ? * lowerBound : cursor . get ( ) ;
2019-05-22 10:16:32 +08:00
first = false ;
// Skip over any children that do not link to a page. They exist to preserve the ancestors from
// which adjacent children can borrow prefix bytes.
// If there are any, then the first valid child page will incur a boundary change to move
// its lower bound to the left so we can delete the non-linking entry from this page to free up space.
while ( ! cursor . get ( ) . value . present ( ) ) {
2019-06-04 19:03:52 +08:00
// There should never be an internal page written that has no valid child pages. This loop will find
2019-05-22 10:16:32 +08:00
// the first valid child link, and if there are no more then execution will not return to this loop.
ASSERT ( cursor . moveNext ( ) ) ;
}
2018-06-08 18:32:34 +08:00
2019-05-22 10:16:32 +08:00
ASSERT ( cursor . valid ( ) ) ;
2017-08-28 16:57:01 +08:00
2019-05-22 10:16:32 +08:00
const RedwoodRecordRef & decodeChildLowerBound = cursor . get ( ) ;
2017-08-28 18:53:29 +08:00
2019-06-04 19:03:52 +08:00
LogicalPageID pageID = cursor . get ( ) . getPageID ( ) ;
2018-08-29 04:46:14 +08:00
ASSERT ( pageID ! = 0 ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:55:09 +08:00
const RedwoodRecordRef & decodeChildUpperBound = cursor . moveNext ( ) ? cursor . get ( ) : * decodeUpperBound ;
2017-06-10 05:56:41 +08:00
2019-05-22 10:16:32 +08:00
// Skip over any next-children which do not actually link to child pages
while ( cursor . valid ( ) & & ! cursor . get ( ) . value . present ( ) ) {
cursor . moveNext ( ) ;
2017-06-10 05:56:41 +08:00
}
2019-05-22 10:16:32 +08:00
const RedwoodRecordRef & childUpperBound = cursor . valid ( ) ? cursor . get ( ) : * upperBound ;
2017-06-10 05:56:41 +08:00
2019-06-06 11:58:47 +08:00
debug_printf ( " %s recursing to PageID=%u lower=%s upper=%s decodeLower=%s decodeUpper=%s \n " ,
2019-06-04 19:55:09 +08:00
context . c_str ( ) , pageID , childLowerBound . toString ( ) . c_str ( ) , childUpperBound . toString ( ) . c_str ( ) , decodeChildLowerBound . toString ( ) . c_str ( ) , decodeChildUpperBound . toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
/*
// TODO: If lower bound and upper bound have the same key, do something intelligent if possible
//
if ( childLowerBound . key = = childUpperBound . key ) {
if ( key is modified or cleared ) {
if ( self - > singleVersion ) {
// In single version mode, don't keep any records with the old key if the key is modified, so return
// an empty page set to replace the child page
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { } } } ) ) ;
}
else {
// In versioned mode, there is no need to recurse to this page because new versions of key
// will go in the right most page that has the same lowerBound key, but since the key is
// being changed the new version of this page should exclude the old subtree
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
}
else {
// Return the child page as-is, no need to visit it
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { { childLowerBound , pageID } } } } ) ) ;
2017-06-10 05:56:41 +08:00
}
}
2019-04-30 08:00:29 +08:00
else {
// No changes
futureChildren . push_back ( VersionedChildrenT ( { { 0 , { { childLowerBound , pageID } } } } ) ) ;
2017-06-10 05:56:41 +08:00
}
2019-04-30 08:00:29 +08:00
}
else {
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , & childLowerBound , & childUpperBound ) ) ;
}
*/
2019-05-22 10:16:32 +08:00
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , & childLowerBound , & childUpperBound , & decodeChildLowerBound , & decodeChildUpperBound ) ) ;
2017-06-10 05:56:41 +08:00
}
2019-03-15 15:46:09 +08:00
// Waiting one at a time makes debugging easier
// TODO: Is it better to use waitForAll()?
2019-02-21 18:46:30 +08:00
state int k ;
for ( k = 0 ; k < futureChildren . size ( ) ; + + k ) {
wait ( success ( futureChildren [ k ] ) ) ;
}
2017-06-10 05:56:41 +08:00
2019-04-30 08:00:29 +08:00
if ( REDWOOD_DEBUG ) {
2019-06-06 11:58:47 +08:00
debug_printf ( " %s Subtree update results for root PageID=%u \n " , context . c_str ( ) , root ) ;
2019-04-30 08:00:29 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
2019-06-18 09:55:49 +08:00
debug_printf ( " %s subtree result %s \n " , context . c_str ( ) , toString ( futureChildren [ i ] . get ( ) ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
}
}
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
// TODO: Handle multi-versioned results
ASSERT ( self - > singleVersion ) ;
cursor . moveFirst ( ) ;
InternalPageBuilder pageBuilder ( cursor ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
const VersionedChildrenT & versionedChildren = futureChildren [ i ] . get ( ) ;
ASSERT ( versionedChildren . size ( ) < = 1 ) ;
2017-07-05 14:41:48 +08:00
2019-06-04 19:03:52 +08:00
if ( ! versionedChildren . empty ( ) ) {
pageBuilder . addEntries ( versionedChildren . front ( ) ) ;
2017-06-10 05:56:41 +08:00
}
}
2019-06-06 11:58:47 +08:00
pageBuilder . finalize ( * upperBound , * decodeUpperBound ) ;
2019-06-04 19:03:52 +08:00
// If page contents have changed
if ( pageBuilder . modified ) {
// If the page now has no children
if ( pageBuilder . childPageCount = = 0 ) {
// If we are the root, write a new empty btree
if ( root = = 0 ) {
Reference < IPage > page = self - > m_pager - > newPageBuffer ( ) ;
makeEmptyPage ( page , BTreePage : : IS_LEAF , self - > m_usablePageSizeOverride ) ;
RedwoodRecordRef rootEntry = dbBegin . withPageID ( 0 ) ;
self - > writePage ( 0 , page , self - > getLastCommittedVersion ( ) + 1 , & dbBegin , & dbEnd ) ;
VersionedChildrenT c ( { { 0 , { dbBegin } , dbEnd } } ) ;
debug_printf ( " %s id=%u All root page children were deleted, rewrote root as leaf, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
return c ;
2017-06-10 05:56:41 +08:00
}
2019-04-30 08:00:29 +08:00
else {
2019-06-04 19:03:52 +08:00
VersionedChildrenT c ( { } ) ;
debug_printf ( " %s id=%u All internal page children were deleted #1 so deleting this page too, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
return c ;
2017-06-10 05:56:41 +08:00
}
}
2019-06-04 19:03:52 +08:00
else {
2019-06-06 11:58:47 +08:00
debug_printf ( " %s Internal PageID=%u modified, creating replacements. \n " , context . c_str ( ) , root ) ;
2019-06-04 19:03:52 +08:00
debug_printf ( " %s newChildren=%s lastUpperBound=%s upperBound=%s \n " , context . c_str ( ) , toString ( pageBuilder . entries ) . c_str ( ) , pageBuilder . lastUpperBound . toString ( ) . c_str ( ) , upperBound - > toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
ASSERT ( pageBuilder . lastUpperBound = = * upperBound ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
// TODO: Don't do this!
std : : vector < RedwoodRecordRef > entries ;
for ( auto & o : pageBuilder . entries ) {
entries . push_back ( o ) ;
2019-04-30 08:00:29 +08:00
}
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
std : : vector < BoundaryAndPage > pages = buildPages ( false , * lowerBound , * upperBound , entries , 0 , [ = ] ( ) { return self - > m_pager - > newPageBuffer ( ) ; } , self - > m_usablePageSizeOverride ) ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
Version writeVersion = self - > getLastCommittedVersion ( ) + 1 ;
std : : vector < LogicalPageID > newPageIDs = self - > writePages ( pages , writeVersion , root , page , upperBound , THIS ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until we're returning a single page
2019-06-04 19:03:52 +08:00
if ( root = = self - > m_root ) {
self - > buildNewRoot ( writeVersion , pages , newPageIDs , page ) ;
}
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
VersionedChildrenT vc ( 1 ) ;
vc . resize ( 1 ) ;
VersionedChildPageSet & c = vc . front ( ) ;
c . version = writeVersion ;
c . upperBound = * upperBound ;
2017-06-10 05:56:41 +08:00
2019-06-04 19:03:52 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
c . children . push_back ( pages [ i ] . lowerBound . withPageID ( newPageIDs [ i ] ) ) ;
2017-07-05 14:41:48 +08:00
}
2017-06-10 05:56:41 +08:00
2019-06-06 11:58:47 +08:00
debug_printf ( " %s Internal PageID=%u modified, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
return vc ;
}
}
else {
VersionedChildrenT c ( { { 0 , { * decodeLowerBound } , * decodeUpperBound } } ) ;
2019-06-06 11:58:47 +08:00
debug_printf ( " %s PageID=%u has no changes, returning %s \n " , context . c_str ( ) , root , toString ( c ) . c_str ( ) ) ;
2019-06-04 19:03:52 +08:00
return c ;
2017-06-10 05:56:41 +08:00
}
}
}
// Commits the mutation buffer that is current when this actor starts, at the write version
// that is current at that moment.  Commits are chained: each one installs its own completion
// future in m_latestCommit and waits for the one it replaced before doing any work.
ACTOR static Future<Void> commit_impl(VersionedBTree *self) {
    // Detach the buffer being committed so that no further mutations can be written to it.
    state MutationBufferT *pendingMutations = self->m_pBuffer;
    self->m_pBuffer = nullptr;

    // m_writeVersion can change while this commit is in flight, so keep a private copy.
    state Version commitVersion = self->m_writeVersion;

    // Start version of the newest mutation buffer, which is the one being committed
    // (now, or once the commits ahead of it are done).
    state Version bufferStartVersion = self->m_mutationBuffers.rbegin()->first;

    // Swap our completion future in as the latest commit, remembering the one it replaces.
    state Promise<Void> commitDone;
    Future<Void> priorCommit = self->m_latestCommit;
    self->m_latestCommit = commitDone.getFuture();

    // Do nothing further until the commit that was started before this one has finished.
    wait(priorCommit);
    debug_printf(" %s: Beginning commit of version % " PRId64 " \n ", self->m_name.c_str(), commitVersion);

    // The pager's latest version is the version the tree is read at during this commit.
    Version readVersion = wait(self->m_pager->getLatestVersion());
    debug_printf(" %s: pager latestVersion % " PRId64 " \n ", self->m_name.c_str(), readVersion);

    if(REDWOOD_DEBUG) {
        self->printMutationBuffer(pendingMutations);
    }

    // Apply the buffered mutations starting from the root page, bounded by the whole key space.
    // The returned child set is not used here.
    wait(success(commitSubtree(self, pendingMutations, self->m_pager->getReadSnapshot(readVersion), self->m_root, &dbBegin, &dbEnd, &dbBegin, &dbEnd)));

    self->m_pager->setLatestVersion(commitVersion);
    debug_printf(" %s: Committing pager % " PRId64 " \n ", self->m_name.c_str(), commitVersion);
    wait(self->m_pager->commit());
    debug_printf(" %s: Committed version % " PRId64 " \n ", self->m_name.c_str(), commitVersion);

    // Everything is durable, so the committed mutation buffer can be dropped.  It is expected
    // to be the oldest buffer in the map by now.
    auto oldestBuffer = self->m_mutationBuffers.begin();
    ASSERT(bufferStartVersion == oldestBuffer->first);
    self->m_mutationBuffers.erase(oldestBuffer);

    self->m_lastCommittedVersion = commitVersion;
    ++self->counts.commits;
    printf(" \n Committed: %s \n ", self->counts.toString(true).c_str());

    // Release whichever commit is waiting on this one.
    commitDone.send(Void());

    return Void();
}
2019-02-21 18:46:30 +08:00
// InternalCursor is for seeking to and iterating over the 'internal' records (not user-visible) in the Btree.
// These records are versioned and they can represent deletedness or partial values.
struct InternalCursor {
private :
// Each InternalCursor's position is represented by a reference counted PageCursor, which links
// to its parent PageCursor, up to a PageCursor representing a cursor on the root page.
// PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead
struct PageCursor : ReferenceCounted < PageCursor > , FastAllocated < PageCursor > {
Reference < PageCursor > parent ;
LogicalPageID pageID ; // Only needed for debugging purposes
Reference < const IPage > page ;
BTreePage : : BinaryTree : : Cursor cursor ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
PageCursor ( LogicalPageID id , Reference < const IPage > page , Reference < PageCursor > parent = { } )
: pageID ( id ) , page ( page ) , parent ( parent ) , cursor ( getReader ( ) . getCursor ( ) )
{
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
PageCursor ( const PageCursor & toCopy ) : parent ( toCopy . parent ) , pageID ( toCopy . pageID ) , page ( toCopy . page ) , cursor ( toCopy . cursor ) {
}
// Convenience method for copying a PageCursor
Reference < PageCursor > copy ( ) const {
return Reference < PageCursor > ( new PageCursor ( * this ) ) ;
}
// Multiple InternalCursors can share a Page
BTreePage : : BinaryTree : : Reader & getReader ( ) const {
return * ( BTreePage : : BinaryTree : : Reader * ) page - > userData ;
}
bool isLeaf ( ) const {
const BTreePage * p = ( ( const BTreePage * ) page - > begin ( ) ) ;
return p - > isLeaf ( ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
Future < Reference < PageCursor > > getChild ( Reference < IPagerSnapshot > pager , int usablePageSizeOverride ) {
ASSERT ( ! isLeaf ( ) ) ;
BTreePage : : BinaryTree : : Cursor next = cursor ;
next . moveNext ( ) ;
const RedwoodRecordRef & rec = cursor . get ( ) ;
2019-06-04 19:03:52 +08:00
LogicalPageID id = rec . getPageID ( ) ;
2019-02-21 18:46:30 +08:00
Future < Reference < const IPage > > child = readPage ( pager , id , usablePageSizeOverride , & rec , & next . getOrUpperBound ( ) ) ;
return map ( child , [ = ] ( Reference < const IPage > page ) {
return Reference < PageCursor > ( new PageCursor ( id , page , Reference < PageCursor > : : addRef ( this ) ) ) ;
} ) ;
}
std : : string toString ( ) const {
2019-06-06 11:58:47 +08:00
return format ( " PageID=%u, %s " , pageID , cursor . valid ( ) ? cursor . get ( ) . toString ( ) . c_str ( ) : " <invalid> " ) ;
2019-02-21 18:46:30 +08:00
}
} ;
LogicalPageID rootPageID ;
int usablePageSizeOverride ;
Reference < IPagerSnapshot > pager ;
Reference < PageCursor > pageCursor ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
public :
InternalCursor ( ) {
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
InternalCursor ( Reference < IPagerSnapshot > pager , LogicalPageID root , int usablePageSizeOverride )
: pager ( pager ) , rootPageID ( root ) , usablePageSizeOverride ( usablePageSizeOverride ) {
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2018-06-12 16:43:19 +08:00
std : : string r ;
2019-04-30 08:00:29 +08:00
2019-02-21 18:46:30 +08:00
Reference < PageCursor > c = pageCursor ;
2019-04-30 08:00:29 +08:00
int maxDepth = 0 ;
2019-02-21 18:46:30 +08:00
while ( c ) {
2019-04-30 08:00:29 +08:00
c = c - > parent ;
+ + maxDepth ;
}
c = pageCursor ;
int depth = maxDepth ;
while ( c ) {
r = format ( " [%d/%d: %s] " , depth - - , maxDepth , c - > toString ( ) . c_str ( ) ) + r ;
2019-02-21 18:46:30 +08:00
c = c - > parent ;
2018-06-12 16:43:19 +08:00
}
return r ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is a valid leaf page record
bool valid ( ) const {
return pageCursor & & pageCursor - > isLeaf ( ) & & pageCursor - > cursor . valid ( ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is valid() and has a present record value
bool present ( ) {
return valid ( ) & & pageCursor - > cursor . get ( ) . value . present ( ) ;
}
2018-07-15 04:37:52 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is present() and has an effective version <= v
bool presentAtVersion ( Version v ) {
return present ( ) & & pageCursor - > cursor . get ( ) . version < = v ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// Returns true if cursor position is present() and has an effective version <= v
bool validAtVersion ( Version v ) {
return valid ( ) & & pageCursor - > cursor . get ( ) . version < = v ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
const RedwoodRecordRef & get ( ) const {
return pageCursor - > cursor . get ( ) ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// Ensure that pageCursor is not shared with other cursors so we can modify it
void ensureUnshared ( ) {
if ( ! pageCursor - > isSoleOwner ( ) ) {
pageCursor = pageCursor - > copy ( ) ;
2017-09-15 20:19:39 +08:00
}
}
2019-02-21 18:46:30 +08:00
Future < Void > moveToRoot ( ) {
// If pageCursor exists follow parent links to the root
if ( pageCursor ) {
while ( pageCursor - > parent ) {
pageCursor = pageCursor - > parent ;
}
return Void ( ) ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// Otherwise read the root page
Future < Reference < const IPage > > root = readPage ( pager , rootPageID , usablePageSizeOverride , & dbBegin , & dbEnd ) ;
return map ( root , [ = ] ( Reference < const IPage > p ) {
pageCursor = Reference < PageCursor > ( new PageCursor ( rootPageID , p ) ) ;
return Void ( ) ;
} ) ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
ACTOR Future < bool > seekLessThanOrEqual_impl ( InternalCursor * self , RedwoodRecordRef query ) {
Future < Void > f = self - > moveToRoot ( ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// f will almost always be ready
if ( ! f . isReady ( ) ) {
wait ( f ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
self - > ensureUnshared ( ) ;
2018-06-12 16:43:19 +08:00
2017-09-15 20:19:39 +08:00
loop {
2019-05-22 10:16:32 +08:00
bool success = self - > pageCursor - > cursor . seekLessThanOrEqual ( query ) ;
2017-09-15 20:19:39 +08:00
2019-05-22 10:16:32 +08:00
// Skip backwards over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move again
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = self - > pageCursor - > cursor . movePrev ( ) ;
}
2017-09-15 20:19:39 +08:00
}
2019-05-22 10:16:32 +08:00
if ( success ) {
2019-02-21 18:46:30 +08:00
// If we found a record <= query at a leaf page then return success
if ( self - > pageCursor - > isLeaf ( ) ) {
return true ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager , self - > usablePageSizeOverride ) ) ;
self - > pageCursor = child ;
2017-09-15 20:19:39 +08:00
}
else {
2019-02-21 18:46:30 +08:00
// No records <= query on this page, so move to immediate previous record at leaf level
bool success = wait ( self - > move ( false ) ) ;
return success ;
2017-09-15 20:19:39 +08:00
}
}
}
2019-02-21 18:46:30 +08:00
Future < bool > seekLTE ( RedwoodRecordRef query ) {
return seekLessThanOrEqual_impl ( this , query ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
ACTOR Future < bool > move_impl ( InternalCursor * self , bool forward ) {
// Try to move pageCursor, if it fails to go parent, repeat until it works or root cursor can't be moved
while ( 1 ) {
self - > ensureUnshared ( ) ;
bool success = self - > pageCursor - > cursor . valid ( ) & & ( forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ) ;
2017-09-15 20:19:39 +08:00
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move again
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
2018-06-12 16:43:19 +08:00
}
}
2018-06-08 18:32:34 +08:00
2019-02-21 18:46:30 +08:00
// Stop if successful or there's no parent to move to
if ( success | | ! self - > pageCursor - > parent ) {
2017-09-15 20:19:39 +08:00
break ;
2018-06-08 18:32:34 +08:00
}
2019-02-21 18:46:30 +08:00
// Move to parent
self - > pageCursor = self - > pageCursor - > parent ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// If pageCursor not valid we've reached an end of the tree
if ( ! self - > pageCursor - > cursor . valid ( ) ) {
return false ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
// While not on a leaf page, move down to get to one.
while ( ! self - > pageCursor - > isLeaf ( ) ) {
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
while ( ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
bool success = forward ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
if ( ! success ) {
return false ;
}
}
2019-02-21 18:46:30 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager , self - > usablePageSizeOverride ) ) ;
2019-06-18 09:55:49 +08:00
forward ? child - > cursor . moveFirst ( ) : child - > cursor . moveLast ( ) ;
2019-02-21 18:46:30 +08:00
self - > pageCursor = child ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
return true ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
Future < bool > move ( bool forward ) {
return move_impl ( this , forward ) ;
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
Future < bool > moveNext ( ) {
return move_impl ( this , true ) ;
}
Future < bool > movePrev ( ) {
return move_impl ( this , false ) ;
}
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
// Move to the first or last record of the database.
ACTOR Future < bool > move_end ( InternalCursor * self , bool begin ) {
Future < Void > f = self - > moveToRoot ( ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// f will almost always be ready
if ( ! f . isReady ( ) ) {
wait ( f ) ;
2018-06-12 16:43:19 +08:00
}
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
self - > ensureUnshared ( ) ;
2018-06-08 18:32:34 +08:00
2019-02-21 18:46:30 +08:00
loop {
// Move to first or last record in the page
bool success = begin ? self - > pageCursor - > cursor . moveFirst ( ) : self - > pageCursor - > cursor . moveLast ( ) ;
2019-05-22 10:16:32 +08:00
// Skip over internal page entries that do not link to child pages
if ( ! self - > pageCursor - > isLeaf ( ) ) {
// While record has no value, move past it
while ( success & & ! self - > pageCursor - > cursor . get ( ) . value . present ( ) ) {
success = begin ? self - > pageCursor - > cursor . moveNext ( ) : self - > pageCursor - > cursor . movePrev ( ) ;
}
}
2019-02-21 18:46:30 +08:00
// If it worked, return true if we've reached a leaf page otherwise go to the next child
if ( success ) {
if ( self - > pageCursor - > isLeaf ( ) ) {
return true ;
}
2019-05-22 10:16:32 +08:00
2019-02-21 18:46:30 +08:00
Reference < PageCursor > child = wait ( self - > pageCursor - > getChild ( self - > pager , self - > usablePageSizeOverride ) ) ;
self - > pageCursor = child ;
}
else {
return false ;
}
}
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
Future < bool > moveFirst ( ) {
return move_end ( this , true ) ;
}
Future < bool > moveLast ( ) {
return move_end ( this , false ) ;
2017-09-15 20:19:39 +08:00
}
2019-02-21 18:46:30 +08:00
2017-09-15 20:19:39 +08:00
} ;
// Cursor is for reading and interating over user visible KV pairs at a specific version
2019-02-21 18:46:30 +08:00
// KeyValueRefs returned become invalid once the cursor is moved
class Cursor : public IStoreCursor , public ReferenceCounted < Cursor > , public FastAllocated < Cursor > , NonCopyable {
2017-06-10 05:56:41 +08:00
public :
2019-04-30 08:00:29 +08:00
Cursor ( Reference < IPagerSnapshot > pageSource , LogicalPageID root , Version recordVersion , int usablePageSizeOverride )
: m_version ( recordVersion ) ,
2019-02-21 18:46:30 +08:00
m_cur1 ( pageSource , root , usablePageSizeOverride ) ,
m_cur2 ( m_cur1 )
{
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
void addref ( ) { ReferenceCounted < Cursor > : : addref ( ) ; }
void delref ( ) { ReferenceCounted < Cursor > : : delref ( ) ; }
2017-09-09 16:29:25 +08:00
2019-02-21 18:46:30 +08:00
private :
Version m_version ;
// If kv is valid
// - kv.key references memory held by cur1
// - If cur1 points to a non split KV pair
// - kv.value references memory held by cur1
// - cur2 points to the next internal record after cur1
// Else
// - kv.value references memory in arena
// - cur2 points to the first internal record of the split KV pair
InternalCursor m_cur1 ;
InternalCursor m_cur2 ;
Arena m_arena ;
Optional < KeyValueRef > m_kv ;
public :
virtual Future < Void > findEqual ( KeyRef key ) { return find_impl ( this , key , true , 0 ) ; }
virtual Future < Void > findFirstEqualOrGreater ( KeyRef key , bool needValue , int prefetchNextBytes ) { return find_impl ( this , key , needValue , 1 ) ; }
virtual Future < Void > findLastLessOrEqual ( KeyRef key , bool needValue , int prefetchPriorBytes ) { return find_impl ( this , key , needValue , - 1 ) ; }
2017-09-09 16:29:25 +08:00
2019-02-21 18:46:30 +08:00
virtual Future < Void > next ( bool needValue ) { return move ( this , true , needValue ) ; }
virtual Future < Void > prev ( bool needValue ) { return move ( this , false , needValue ) ; }
2017-06-10 05:56:41 +08:00
virtual bool isValid ( ) {
return m_kv . present ( ) ;
}
virtual KeyRef getKey ( ) {
return m_kv . get ( ) . key ;
}
2019-02-21 18:46:30 +08:00
2017-06-10 05:56:41 +08:00
//virtual StringRef getCompressedKey() = 0;
virtual ValueRef getValue ( ) {
return m_kv . get ( ) . value ;
}
2018-09-19 15:32:39 +08:00
// TODO: Either remove this method or change the contract so that key and value strings returned are still valid after the cursor is
// moved and allocate them in some arena that this method resets.
2017-06-10 05:56:41 +08:00
virtual void invalidateReturnedStrings ( ) {
}
2019-02-21 18:46:30 +08:00
std : : string toString ( ) const {
2018-06-12 16:43:19 +08:00
std : : string r ;
2019-05-29 21:23:32 +08:00
r + = format ( " Cursor(%p) ver: % " PRId64 " " , this , m_version ) ;
2019-02-24 19:47:32 +08:00
if ( m_kv . present ( ) ) {
r + = format ( " KV: '%s' -> '%s' \n " , m_kv . get ( ) . key . printable ( ) . c_str ( ) , m_kv . get ( ) . value . printable ( ) . c_str ( ) ) ;
}
else {
2019-04-30 08:00:29 +08:00
r + = " KV: <np> \n " ;
2019-02-24 19:47:32 +08:00
}
2019-02-21 18:46:30 +08:00
r + = format ( " Cur1: %s \n " , m_cur1 . toString ( ) . c_str ( ) ) ;
r + = format ( " Cur2: %s \n " , m_cur2 . toString ( ) . c_str ( ) ) ;
2019-02-24 19:47:32 +08:00
2018-06-12 16:43:19 +08:00
return r ;
}
2017-09-15 20:19:39 +08:00
private :
2018-07-23 18:09:13 +08:00
// find key in tree closest to or equal to key (at this cursor's version)
2017-09-15 20:19:39 +08:00
// for less than or equal use cmp < 0
// for greater than or equal use cmp > 0
// for equal use cmp == 0
2019-02-21 18:46:30 +08:00
ACTOR static Future < Void > find_impl ( Cursor * self , KeyRef key , bool needValue , int cmp ) {
2018-07-23 18:09:13 +08:00
// Search for the last key at or before (key, version, \xff)
2019-05-22 10:16:32 +08:00
state RedwoodRecordRef query ( key , self - > m_version , { } , 0 , std : : numeric_limits < int32_t > : : max ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_kv . reset ( ) ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
wait ( success ( self - > m_cur1 . seekLTE ( query ) ) ) ;
debug_printf ( " find%sE(%s): %s \n " , cmp > 0 ? " GT " : ( cmp = = 0 ? " " : " LT " ) , query . toString ( ) . c_str ( ) , self - > toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2019-02-21 18:46:30 +08:00
// If we found the target key with a present value then return it as it is valid for any cmp type
if ( self - > m_cur1 . present ( ) & & self - > m_cur1 . get ( ) . key = = key ) {
debug_printf ( " Target key found, reading full KV pair. Cursor: %s \n " , self - > toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > readFullKVPair ( self ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-09-09 16:29:25 +08:00
}
2019-02-21 18:46:30 +08:00
// Mode is ==, so if we're still here we didn't find it.
2017-09-15 20:19:39 +08:00
if ( cmp = = 0 ) {
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2019-02-21 18:46:30 +08:00
// Mode is >=, so if we're here we have to go to the next present record at the target version
// because the seek done above was <= query
2017-09-15 20:19:39 +08:00
if ( cmp > 0 ) {
2019-02-21 18:46:30 +08:00
// icur is at a record < query or invalid.
// If cursor is invalid, try to go to start of tree
if ( ! self - > m_cur1 . valid ( ) ) {
bool valid = wait ( self - > m_cur1 . moveFirst ( ) ) ;
if ( ! valid ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
}
else {
loop {
bool valid = wait ( self - > m_cur1 . move ( true ) ) ;
if ( ! valid ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
if ( self - > m_cur1 . get ( ) . key > key ) {
break ;
}
}
2017-09-16 16:45:39 +08:00
}
2019-02-21 18:46:30 +08:00
2017-09-16 16:45:39 +08:00
// Get the next present key at the target version. Handles invalid cursor too.
2018-09-20 18:39:55 +08:00
wait ( self - > next ( needValue ) ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-16 16:45:39 +08:00
else if ( cmp < 0 ) {
2019-02-21 18:46:30 +08:00
// Mode is <=, which is the same as the seekLTE(query)
if ( ! self - > m_cur1 . valid ( ) ) {
self - > m_kv . reset ( ) ;
return Void ( ) ;
}
2017-09-17 19:38:01 +08:00
// Move to previous present kv pair at the target version
2018-09-20 18:39:55 +08:00
wait ( self - > prev ( needValue ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2019-02-21 18:46:30 +08:00
// TODO: use needValue
ACTOR static Future < Void > move ( Cursor * self , bool fwd , bool needValue ) {
debug_printf ( " Cursor::move(%d): Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
ASSERT ( self - > m_cur1 . valid ( ) ) ;
2017-09-15 20:19:39 +08:00
2019-02-21 18:46:30 +08:00
// If kv is present then the key/version at cur1 was already returned so move to a new key
// Move cur1 until failure or a new key is found, keeping prior record visited in cur2
2017-09-16 16:45:39 +08:00
if ( self - > m_kv . present ( ) ) {
2019-02-21 18:46:30 +08:00
ASSERT ( self - > m_cur1 . valid ( ) ) ;
loop {
self - > m_cur2 = self - > m_cur1 ;
bool valid = wait ( self - > m_cur1 . move ( fwd ) ) ;
if ( ! valid | | self - > m_cur1 . get ( ) . key ! = self - > m_cur2 . get ( ) . key ) {
break ;
}
2017-09-16 16:45:39 +08:00
}
}
2019-02-21 18:46:30 +08:00
// Given two consecutive cursors c1 and c2, c1 represents a returnable record if
// c1.presentAtVersion(v) || (!c2.validAtVersion() || c2.get().key != c1.get().key())
// Note the distinction between 'present' and 'valid'. Present means the value for the key
// exists at the version (but could be the empty string) while valid just means the internal
// record is in effect at that version but it could indicate that the key was cleared and
// no longer exists from the user's perspective at that version
//
2019-02-24 19:47:32 +08:00
// cur2 must be the record immediately after cur1
// TODO: This may already be the case, store state to track this condition and avoid the reset here
if ( self - > m_cur1 . valid ( ) ) {
2019-02-21 18:46:30 +08:00
self - > m_cur2 = self - > m_cur1 ;
wait ( success ( self - > m_cur2 . move ( true ) ) ) ;
}
while ( self - > m_cur1 . valid ( ) ) {
if ( self - > m_cur1 . presentAtVersion ( self - > m_version ) & &
( ! self - > m_cur2 . validAtVersion ( self - > m_version ) | |
self - > m_cur2 . get ( ) . key ! = self - > m_cur1 . get ( ) . key )
2017-09-16 08:27:13 +08:00
) {
2018-09-20 18:39:55 +08:00
wait ( readFullKVPair ( self ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
if ( fwd ) {
// Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Moving forward, Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_cur1 = self - > m_cur2 ;
wait ( success ( self - > m_cur2 . move ( true ) ) ) ;
}
else {
// Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Moving backward, Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2019-02-21 18:46:30 +08:00
self - > m_cur2 = self - > m_cur1 ;
wait ( success ( self - > m_cur1 . move ( false ) ) ) ;
}
2017-06-10 05:56:41 +08:00
}
2017-09-16 08:27:13 +08:00
2019-02-21 18:46:30 +08:00
self - > m_kv . reset ( ) ;
2019-02-24 19:47:32 +08:00
debug_printf ( " Cursor::move(%d): Exit, end of db reached. Cursor = %s \n " , fwd , self - > toString ( ) . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2019-02-21 18:46:30 +08:00
// Loads the complete key-value pair for the record under m_cur1 into m_kv.
// m_arena is replaced with a fresh Arena first.
//  - Record is not multi-part: m_kv references the record's own key and value.
//  - Record is multi-part: a buffer of chunk.total bytes is allocated in m_arena, each chunk's
//    value is copied to its chunk.start offset, and m_cur1 is moved backward until every byte
//    has been copied. m_kv then pairs the key of the last chunk visited with that buffer.
// The starting record of a multi-part value must be the final chunk (start + size == total).
ACTOR static Future<Void> readFullKVPair(Cursor* self) {
    self->m_arena = Arena();
    const RedwoodRecordRef& first = self->m_cur1.get();
    debug_printf(" readFullKVPair: Starting at %s \n ", self->toString().c_str());

    // Single-part value: nothing to assemble, reference the cursor's record directly.
    if (!rec_isMultiPartGuard(first)) {
        self->m_kv = KeyValueRef(first.key, first.value.get());
        debug_printf(" readFullKVPair: Unsplit, exit. %s \n ", self->toString().c_str());
        return Void();
    }

    // Multi-part value: the chunks are gathered into one contiguous buffer owned by m_arena.
    ASSERT(first.chunk.start + first.value.get().size() == first.chunk.total);
    debug_printf(" readFullKVPair: Split, totalsize %d %s \n ", first.chunk.total, self->toString().c_str());

    state int bytesLeft = first.chunk.total;
    state StringRef dst = makeString(bytesLeft, self->m_arena);

    loop {
        const RedwoodRecordRef& part = self->m_cur1.get();
        debug_printf(" readFullKVPair: Adding chunk %s \n ", part.toString().c_str());

        // Copy this chunk to the offset it occupies within the full value.
        int chunkLen = part.value.get().size();
        memcpy(mutateString(dst) + part.chunk.start, part.value.get().begin(), chunkLen);
        bytesLeft -= chunkLen;

        if (bytesLeft == 0) {
            self->m_kv = KeyValueRef(part.key, dst);
            return Void();
        }
        ASSERT(bytesLeft > 0);

        // More chunks remain; step the cursor to the preceding record.
        bool moved = wait(self->m_cur1.move(false));
        ASSERT(moved);
    }
}
} ;
2019-02-21 18:46:30 +08:00
2017-06-10 05:56:41 +08:00
} ;
2019-02-21 18:46:30 +08:00
// Out-of-class definitions of VersionedBTree's static members.
// dbBegin is constructed from an empty StringRef and 0.
// NOTE(review): presumably the lower boundary record of the keyspace - confirm against the class declaration.
RedwoodRecordRef VersionedBTree : : dbBegin ( StringRef ( ) , 0 ) ;
2019-05-30 17:10:07 +08:00
// dbEnd is constructed from a literal key made of 0xff bytes.
// NOTE(review): presumably the upper boundary record of the keyspace - confirm against the class declaration.
RedwoodRecordRef VersionedBTree : : dbEnd ( LiteralStringRef ( " \xff \xff \xff \xff \xff " ) ) ;
2019-03-15 15:46:09 +08:00
// Shared counters object; this file increments counts.gets and counts.getRanges on reads.
VersionedBTree : : Counts VersionedBTree : : counts ;
2017-08-23 02:30:44 +08:00
2017-10-02 18:32:22 +08:00
// Waits for f and returns its result. When f fails with any error other than
// actor_cancelled, the error is also sent to `error` (provided it can still be set)
// before being rethrown to the caller.
ACTOR template <class T>
Future<T> catchError(Promise<Void> error, Future<T> f) {
    try {
        T value = wait(f);
        return value;
    } catch (Error& err) {
        bool wasCancelled = err.code() == error_code_actor_cancelled;
        if (!wasCancelled && error.canBeSet()) {
            error.sendError(err);
        }
        throw;
    }
}
2017-09-22 14:51:55 +08:00
class KeyValueStoreRedwoodUnversioned : public IKeyValueStore {
2017-09-21 19:43:49 +08:00
public :
2017-09-22 14:51:55 +08:00
KeyValueStoreRedwoodUnversioned ( std : : string filePrefix , UID logID ) : m_filePrefix ( filePrefix ) {
2018-10-25 06:57:06 +08:00
// TODO: This constructor should really just take an IVersionedStore
IPager * pager = new IndirectShadowPager ( filePrefix ) ;
2019-04-30 08:00:29 +08:00
m_tree = new VersionedBTree ( pager , filePrefix , true , pager - > getUsablePageSize ( ) ) ;
2018-10-25 06:57:06 +08:00
m_init = catchError ( init_impl ( this ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual Future < Void > init ( ) {
return m_init ;
}
ACTOR Future < Void > init_impl ( KeyValueStoreRedwoodUnversioned * self ) {
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInit " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > m_tree - > init ( ) ) ;
2017-09-21 19:43:49 +08:00
Version v = wait ( self - > m_tree - > getLatestVersion ( ) ) ;
self - > m_tree - > setWriteVersion ( v + 1 ) ;
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInitComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2017-09-21 19:43:49 +08:00
return Void ( ) ;
}
2017-10-02 18:32:22 +08:00
ACTOR void shutdown ( KeyValueStoreRedwoodUnversioned * self , bool dispose ) {
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdown " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2018-10-25 06:57:06 +08:00
if ( self - > m_error . canBeSet ( ) ) {
self - > m_error . sendError ( actor_cancelled ( ) ) ; // Ideally this should be shutdown_in_progress
}
2017-09-23 08:18:28 +08:00
self - > m_init . cancel ( ) ;
2018-10-25 06:57:06 +08:00
Future < Void > closedFuture = self - > m_tree - > onClosed ( ) ;
2017-10-02 18:32:22 +08:00
if ( dispose )
2018-10-25 06:57:06 +08:00
self - > m_tree - > dispose ( ) ;
2017-10-02 18:32:22 +08:00
else
2018-10-25 06:57:06 +08:00
self - > m_tree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-09-21 19:43:49 +08:00
self - > m_closed . send ( Void ( ) ) ;
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdownComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2017-10-02 18:32:22 +08:00
delete self ;
2017-09-21 19:43:49 +08:00
}
virtual void close ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , false ) ;
2017-09-21 19:43:49 +08:00
}
virtual void dispose ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , true ) ;
2017-09-21 19:43:49 +08:00
}
virtual Future < Void > onClosed ( ) {
return m_closed . getFuture ( ) ;
}
Future < Void > commit ( bool sequential = false ) {
2017-10-10 04:24:16 +08:00
Future < Void > c = m_tree - > commit ( ) ;
m_tree - > setWriteVersion ( m_tree - > getWriteVersion ( ) + 1 ) ;
2018-10-25 06:57:06 +08:00
return catchError ( c ) ;
2017-09-21 19:43:49 +08:00
}
virtual KeyValueStoreType getType ( ) {
2017-09-22 14:51:55 +08:00
return KeyValueStoreType : : SSD_REDWOOD_V1 ;
2017-09-21 19:43:49 +08:00
}
virtual StorageBytes getStorageBytes ( ) {
2018-10-25 06:57:06 +08:00
return m_tree - > getStorageBytes ( ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-25 06:57:06 +08:00
virtual Future < Void > getError ( ) {
return delayed ( m_error . getFuture ( ) ) ;
} ;
2017-09-21 19:43:49 +08:00
void clear ( KeyRangeRef range , const Arena * arena = 0 ) {
2019-03-15 15:46:09 +08:00
debug_printf ( " CLEAR %s \n " , printable ( range ) . c_str ( ) ) ;
2017-09-21 19:43:49 +08:00
m_tree - > clear ( range ) ;
}
virtual void set ( KeyValueRef keyValue , const Arena * arena = NULL ) {
2019-03-15 15:46:09 +08:00
debug_printf ( " SET %s \n " , keyValue . key . printable ( ) . c_str ( ) ) ;
2017-09-21 19:43:49 +08:00
m_tree - > set ( keyValue ) ;
}
2019-03-15 15:46:09 +08:00
virtual Future < Standalone < VectorRef < KeyValueRef > > > readRange ( KeyRangeRef keys , int rowLimit = 1 < < 30 , int byteLimit = 1 < < 30 ) {
debug_printf ( " READRANGE %s \n " , printable ( keys ) . c_str ( ) ) ;
return catchError ( readRange_impl ( this , keys , rowLimit , byteLimit ) ) ;
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Standalone < VectorRef < KeyValueRef > > > readRange_impl ( KeyValueStoreRedwoodUnversioned * self , KeyRange keys , int rowLimit , int byteLimit ) {
2019-03-15 15:46:09 +08:00
self - > m_tree - > counts . getRanges + + ;
2017-09-21 19:43:49 +08:00
state Standalone < VectorRef < KeyValueRef > > result ;
state int accumulatedBytes = 0 ;
ASSERT ( byteLimit > 0 ) ;
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2017-10-02 18:32:22 +08:00
2017-09-21 19:43:49 +08:00
if ( rowLimit > = 0 ) {
2018-09-20 18:39:55 +08:00
wait ( cur - > findFirstEqualOrGreater ( keys . begin , true , 0 ) ) ;
2017-09-21 19:43:49 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) < keys . end ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2018-10-25 06:57:06 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit ) {
2017-09-21 19:43:49 +08:00
break ;
2018-10-25 06:57:06 +08:00
}
2018-09-20 18:39:55 +08:00
wait ( cur - > next ( true ) ) ;
2017-09-21 19:43:49 +08:00
}
} else {
2018-09-20 18:39:55 +08:00
wait ( cur - > findLastLessOrEqual ( keys . end , true , 0 ) ) ;
2017-09-21 19:43:49 +08:00
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = keys . end )
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-21 19:43:49 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = keys . begin ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2018-10-25 06:57:06 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit ) {
2017-09-21 19:43:49 +08:00
break ;
2018-10-25 06:57:06 +08:00
}
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-21 19:43:49 +08:00
}
}
return result ;
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Optional < Value > > readValue_impl ( KeyValueStoreRedwoodUnversioned * self , Key key , Optional < UID > debugID ) {
2019-03-15 15:46:09 +08:00
self - > m_tree - > counts . gets + + ;
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findEqual ( key ) ) ;
2017-10-02 18:32:22 +08:00
if ( cur - > isValid ( ) ) {
2017-09-21 19:43:49 +08:00
return cur - > getValue ( ) ;
2017-10-02 18:32:22 +08:00
}
2017-09-21 19:43:49 +08:00
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValue ( KeyRef key , Optional < UID > debugID = Optional < UID > ( ) ) {
2018-10-25 06:57:06 +08:00
return catchError ( readValue_impl ( this , key , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Optional < Value > > readValuePrefix_impl ( KeyValueStoreRedwoodUnversioned * self , Key key , int maxLength , Optional < UID > debugID ) {
2019-03-15 15:46:09 +08:00
self - > m_tree - > counts . gets + + ;
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findEqual ( key ) ) ;
2017-09-21 19:43:49 +08:00
if ( cur - > isValid ( ) ) {
Value v = cur - > getValue ( ) ;
int len = std : : min ( v . size ( ) , maxLength ) ;
return Value ( cur - > getValue ( ) . substr ( 0 , len ) ) ;
}
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValuePrefix ( KeyRef key , int maxLength , Optional < UID > debugID = Optional < UID > ( ) ) {
2018-10-25 06:57:06 +08:00
return catchError ( readValuePrefix_impl ( this , key , maxLength , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual ~ KeyValueStoreRedwoodUnversioned ( ) {
2017-09-21 19:43:49 +08:00
} ;
private :
std : : string m_filePrefix ;
VersionedBTree * m_tree ;
Future < Void > m_init ;
Promise < Void > m_closed ;
2017-10-02 18:32:22 +08:00
Promise < Void > m_error ;
2018-10-25 06:57:06 +08:00
template < typename T > inline Future < T > catchError ( Future < T > f ) {
return : : catchError ( m_error , f ) ;
}
2017-09-21 19:43:49 +08:00
} ;
2017-09-22 14:51:55 +08:00
// Factory for the Redwood V1 store: allocates a KeyValueStoreRedwoodUnversioned for the
// given filename and log ID and returns it through the IKeyValueStore interface.
IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename, UID logID) {
    IKeyValueStore* store = new KeyValueStoreRedwoodUnversioned(filename, logID);
    return store;
}
2018-09-28 07:07:29 +08:00
int randomSize ( int max ) {
2019-06-25 11:17:49 +08:00
int n = pow ( deterministicRandom ( ) - > random01 ( ) , 3 ) * max ;
2018-09-28 07:07:29 +08:00
return n ;
}
2017-09-21 19:43:49 +08:00
2019-06-24 16:05:16 +08:00
// Allocates a len-byte string in arena and fills every byte with
// randomInt(firstChar, lastChar + 1) narrowed to uint8_t.
StringRef randomString(Arena& arena, int len, char firstChar = ' a ', char lastChar = ' z ') {
    // The upper bound handed to randomInt is one past lastChar.
    ++lastChar;
    StringRef result = makeString(len, arena);
    uint8_t* bytes = (uint8_t*)result.begin();
    int pos = 0;
    while (pos < len) {
        bytes[pos] = (uint8_t)deterministicRandom()->randomInt(firstChar, lastChar);
        ++pos;
    }
    return result;
}
// Standalone variant: builds the random string inside the returned object's own arena.
Standalone<StringRef> randomString(int len, char firstChar = ' a ', char lastChar = ' z ') {
    Standalone<StringRef> owned;
    // Assign only the StringRef part so the bytes allocated in owned.arena() stay attached to it.
    StringRef& contents = (StringRef&)owned;
    contents = randomString(owned.arena(), len, firstChar, lastChar);
    return owned;
}
// Builds a random KeyValue whose key and value live in the KeyValue's own arena.
//   maxKeySize   - key length is randomSize(1 + maxKeySize)
//   maxValueSize - value length is randomSize(maxValueSize), or 0 when maxValueSize <= 0
// Key and value bytes come from randomString() with separate character ranges. The value is
// left unset when its chosen length is 0.
//
// Earlier code overwrote both strings byte-by-byte right after randomString() had filled
// them, which doubled the random draws and silently narrowed the character ranges; the
// strings are now generated once.
KeyValue randomKV(int maxKeySize = 10, int maxValueSize = 5) {
    int kLen = randomSize(1 + maxKeySize);
    int vLen = maxValueSize > 0 ? randomSize(maxValueSize) : 0;

    KeyValue kv;
    kv.key = randomString(kv.arena(), kLen, ' a ', ' m ');

    if (vLen > 0) {
        kv.value = randomString(kv.arena(), vLen, ' n ', ' z ');
    }

    return kv;
}
2019-04-30 08:00:29 +08:00
ACTOR Future < int > verifyRange ( VersionedBTree * btree , Key start , Key end , Version v , std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > * written , int * pErrorCount ) {
2017-09-15 20:19:39 +08:00
state int errors = 0 ;
if ( end < = start )
end = keyAfter ( start ) ;
2017-09-16 16:45:39 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator i = written - > lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
2017-09-15 20:19:39 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iEnd = written - > upper_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
2017-09-16 08:27:13 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iLast ;
2017-09-15 20:19:39 +08:00
2017-09-17 19:38:01 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( v ) ;
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Start cur=%p \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur . getPtr ( ) ) ;
2017-09-17 19:38:01 +08:00
// Randomly use the cursor for something else first.
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > coinflip ( ) ) {
2017-09-21 15:58:56 +08:00
state Key randomKey = randomKV ( ) . key ;
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Dummy seek to '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , randomKey . toString ( ) . c_str ( ) ) ;
2019-05-11 05:01:52 +08:00
wait ( deterministicRandom ( ) - > coinflip ( ) ? cur - > findFirstEqualOrGreater ( randomKey , true , 0 ) : cur - > findLastLessOrEqual ( randomKey , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
}
2018-06-14 19:15:14 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s): Actual seek \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findFirstEqualOrGreater ( start , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
state std : : vector < KeyValue > results ;
2017-09-15 20:19:39 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) < end ) {
// Find the next written kv pair that would be present at this version
while ( 1 ) {
iLast = i ;
2017-09-16 08:27:13 +08:00
if ( i = = iEnd )
break ;
+ + i ;
2019-05-22 10:16:32 +08:00
2017-09-16 08:27:13 +08:00
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
2019-05-22 10:16:32 +08:00
) {
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRange(@% " PRId64 " , %s, %s) Found key in written map: %s \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
break ;
2019-05-22 10:16:32 +08:00
}
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( iLast = = iEnd ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( cur - > getKey ( ) ! = iLast - > first . first ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
if ( cur - > getValue ( ) ! = iLast - > second . get ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , iLast - > second . get ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-17 19:38:01 +08:00
2019-05-22 10:16:32 +08:00
ASSERT ( errors = = 0 ) ;
2017-09-17 19:38:01 +08:00
results . push_back ( KeyValue ( KeyValueRef ( cur - > getKey ( ) , cur - > getValue ( ) ) ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > next ( true ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
// Make sure there are no further written kv pairs that would be present at this version.
while ( 1 ) {
iLast = i ;
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
}
if ( iLast ! = iEnd ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has @% " PRId64 " '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . second , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
}
2017-09-16 16:45:39 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s): start \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) ) ;
2019-04-30 08:00:29 +08:00
// Randomly use a new cursor for the reverse range read but only if version history is available
2019-06-25 11:17:49 +08:00
if ( ! btree - > isSingleVersion ( ) & & deterministicRandom ( ) - > coinflip ( ) ) {
2017-09-17 19:38:01 +08:00
cur = btree - > readAtVersion ( v ) ;
}
// Now read the range from the tree in reverse order and compare to the saved results
2018-09-20 18:39:55 +08:00
wait ( cur - > findLastLessOrEqual ( end , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = end )
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
state std : : vector < KeyValue > : : const_reverse_iterator r = results . rbegin ( ) ;
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = start ) {
if ( r = = results . rend ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getKey ( ) ! = r - > key ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getValue ( ) ! = r - > value ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , r - > value . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
+ + r ;
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
}
if ( r ! = results . rend ( ) ) {
2019-02-24 19:47:32 +08:00
+ + errors ;
+ + * pErrorCount ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
}
2017-09-15 20:19:39 +08:00
return errors ;
}
2019-02-24 19:47:32 +08:00
// Point-reads every (key, version) entry of `written` whose version is at or below
// maxCommittedVersion, each at its own version, and checks the outcome: a set must be found
// with the written value, a clear must not be found. Each failure is printed, added to
// *pErrorCount and included in the returned count.
ACTOR Future<int> verifyAll(VersionedBTree* btree, Version maxCommittedVersion, std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount) {
    state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator entry = written->cbegin();
    state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator entriesEnd = written->cend();
    state int errors = 0;

    while (entry != entriesEnd) {
        state std::string key = entry->first.first;
        state Version ver = entry->first.second;

        // Entries above maxCommittedVersion are skipped.
        if (ver <= maxCommittedVersion) {
            state Optional<std::string> val = entry->second;
            state Reference<IStoreCursor> cur = btree->readAtVersion(ver);

            debug_printf(" Verifying @% " PRId64 " '%s' \n ", ver, key.c_str());
            state Arena arena;
            wait(cur->findEqual(KeyRef(arena, key)));

            if (!val.present()) {
                // The entry is a clear: the key must not be readable at this version.
                if (cur->isValid() && cur->getKey() == key) {
                    ++errors;
                    ++*pErrorCount;
                    printf(" Verify ERROR: cleared_key_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), cur->getValue().toString().c_str(), ver);
                }
            } else if (!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) {
                // The entry is a set: report which part of the lookup went wrong.
                ++errors;
                ++*pErrorCount;
                if (!cur->isValid())
                    printf(" Verify ERROR: key_not_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), val.get().c_str(), ver);
                else if (cur->getKey() != key)
                    printf(" Verify ERROR: key_incorrect: found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), key.c_str(), ver);
                else if (cur->getValue() != val.get())
                    printf(" Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), ver);
            }
        }
        ++entry;
    }
    return errors;
}
2019-05-22 10:16:32 +08:00
// Verification driver. For every version received on vStream:
//  - multi-version tree: runs verifyAll through that version plus verifyRange over a random
//    key range at a random version in [1, v];
//  - single-version tree: runs verifyRange over the whole keyspace and over a random key
//    range, both at the tree's last committed version.
// With serial set, each check is awaited before the next one starts; otherwise both run
// concurrently. The loop stops when *pErrorCount becomes non-zero or the stream ends;
// end_of_stream is swallowed and any other error propagates.
ACTOR Future<Void> verify(VersionedBTree* btree, FutureStream<Version> vStream, std::map<std::pair<std::string, Version>, Optional<std::string>>* written, int* pErrorCount, bool serial) {
    state Future<int> fullCheck;
    state Future<int> rangeCheck;

    try {
        loop {
            state Version v = waitNext(vStream);

            if (!btree->isSingleVersion()) {
                debug_printf(" Verifying through version % " PRId64 " \n ", v);
                fullCheck = verifyAll(btree, v, written, pErrorCount);
                if (serial) {
                    wait(success(fullCheck));
                }
                rangeCheck = verifyRange(btree, randomKV().key, randomKV().key, deterministicRandom()->randomInt(1, v + 1), written, pErrorCount);
                if (serial) {
                    wait(success(rangeCheck));
                }
            } else {
                // Only the latest committed version can be checked here.
                v = btree->getLastCommittedVersion();
                debug_printf(" Verifying at latest committed version % " PRId64 " \n ", v);
                fullCheck = verifyRange(btree, LiteralStringRef(" "), LiteralStringRef(" \xff \xff "), v, written, pErrorCount);
                if (serial) {
                    wait(success(fullCheck));
                }
                rangeCheck = verifyRange(btree, randomKV().key, randomKV().key, v, written, pErrorCount);
                if (serial) {
                    wait(success(rangeCheck));
                }
            }

            wait(success(fullCheck) && success(rangeCheck));

            debug_printf(" Verified through version % " PRId64 " , %d errors \n ", v, *pErrorCount);

            if (*pErrorCount != 0)
                break;
        }
    } catch (Error& err) {
        // The stream ending is the normal way for this actor to finish.
        if (err.code() != error_code_end_of_stream) {
            throw;
        }
    }
    return Void();
}
2018-10-05 14:46:37 +08:00
// Endless background reader: repeatedly seeks to a random key and steps forward over up to
// randomInt(0, 100) records, yielding between steps. Errors are neither trapped nor reported.
// The cursor is replaced when it is unset or when random01() exceeds .1; on a multi-version
// tree the replacement reads at a random version in [1, last committed].
ACTOR Future<Void> randomReader(VersionedBTree* btree) {
    state Reference<IStoreCursor> cur;
    loop {
        wait(yield());

        if (!cur || deterministicRandom()->random01() > .1) {
            Version readVersion = btree->getLastCommittedVersion();
            if (!btree->isSingleVersion()) {
                readVersion = deterministicRandom()->randomInt(1, readVersion + 1);
            }
            cur = btree->readAtVersion(readVersion);
        }

        state KeyValue kv = randomKV(10, 0);
        wait(cur->findFirstEqualOrGreater(kv.key, true, 0));

        state int remaining = deterministicRandom()->randomInt(0, 100);
        while (cur->isValid() && remaining > 0) {
            --remaining;
            wait(success(cur->next(true)));
            wait(yield());
        }
    }
}
2018-08-29 04:46:14 +08:00
2019-02-21 18:46:30 +08:00
struct IntIntPair {
IntIntPair ( ) { }
IntIntPair ( int k , int v ) : k ( k ) , v ( v ) { }
IntIntPair ( Arena & arena , const IntIntPair & toCopy ) {
* this = toCopy ;
}
struct Delta {
2019-05-29 21:23:32 +08:00
bool prefixSource ;
2019-02-21 18:46:30 +08:00
int dk ;
int dv ;
2019-05-29 21:23:32 +08:00
IntIntPair apply ( const IntIntPair & base , Arena & arena ) {
return { base . k + dk , base . v + dv } ;
}
void setPrefixSource ( bool val ) {
prefixSource = val ;
}
bool getPrefixSource ( ) const {
return prefixSource ;
2019-02-21 18:46:30 +08:00
}
int size ( ) const {
return sizeof ( Delta ) ;
}
std : : string toString ( ) const {
2019-05-29 21:23:32 +08:00
return format ( " DELTA{prefixSource=%d dk=%d(0x%x) dv = % d ( 0 x % x ) } " , prefixSource, dk, dk, dv, dv) ;
2019-02-21 18:46:30 +08:00
}
} ;
int compare ( const IntIntPair & rhs ) const {
//printf("compare %s to %s\n", toString().c_str(), rhs.toString().c_str());
return k - rhs . k ;
}
bool operator = = ( const IntIntPair & rhs ) const {
return k = = rhs . k ;
}
2019-05-29 21:23:32 +08:00
int getCommonPrefixLen ( const IntIntPair & other , int skip ) const {
return 0 ;
}
2019-02-21 18:46:30 +08:00
int deltaSize ( const IntIntPair & base ) const {
return sizeof ( Delta ) ;
}
2019-05-29 21:23:32 +08:00
int writeDelta ( Delta & d , const IntIntPair & base , int commonPrefix = - 1 ) const {
d . dk = k - base . k ;
d . dv = v - base . v ;
return sizeof ( Delta ) ;
2019-02-21 18:46:30 +08:00
}
int k ;
int v ;
std : : string toString ( ) const {
return format ( " {k=%d(0x%x) v = % d ( 0 x % x ) } " , k, k, v, v) ;
}
} ;
2019-05-29 21:23:32 +08:00
int getCommonIntFieldPrefix2 ( const RedwoodRecordRef & a , const RedwoodRecordRef & b ) {
RedwoodRecordRef : : byte aFields [ RedwoodRecordRef : : intFieldArraySize ] ;
RedwoodRecordRef : : byte bFields [ RedwoodRecordRef : : intFieldArraySize ] ;
a . serializeIntFields ( aFields ) ;
b . serializeIntFields ( bFields ) ;
//printf("a: %s\n", StringRef(aFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str());
//printf("b: %s\n", StringRef(bFields, RedwoodRecordRef::intFieldArraySize).toHexString().c_str());
int i = 0 ;
while ( i < RedwoodRecordRef : : intFieldArraySize & & aFields [ i ] = = bFields [ i ] ) {
+ + i ;
}
//printf("%d\n", i);
return i ;
}
// Round-trip check of RedwoodRecordRef delta encoding: writes the delta of rec against base
// into a stack buffer, applies it back onto base, and requires both that the decoded record
// equals rec and that the written size equals deltaSize(base, false). On failure it prints
// all intermediate values and asserts.
void deltaTest(RedwoodRecordRef rec, RedwoodRecordRef base) {
    // The Delta is laid directly over this raw buffer.
    char buf[500];
    RedwoodRecordRef::Delta& delta = *(RedwoodRecordRef::Delta*)buf;
    Arena mem;

    int expectedSize = rec.deltaSize(base, false);
    int deltaSize = rec.writeDelta(delta, base);
    RedwoodRecordRef decoded = delta.apply(base, mem);

    bool failed = decoded != rec || expectedSize != deltaSize;
    if (!failed) {
        return;
    }

    printf(" \n ");
    printf(" Base: %s \n ", base.toString().c_str());
    printf(" ExpectedSize: %d \n ", expectedSize);
    printf(" DeltaSize: %d \n ", deltaSize);
    printf(" Delta: %s \n ", delta.toString().c_str());
    printf(" Record: %s \n ", rec.toString().c_str());
    printf(" Decoded: %s \n ", decoded.toString().c_str());
    printf(" RedwoodRecordRef::Delta test failure! \n ");
    ASSERT(false);
}
2019-05-30 07:26:58 +08:00
// Produces a random RedwoodRecordRef whose key/value memory is owned by the returned Standalone.
//  - key: taken from randomKV(3, 10)
//  - value: taken from the same KeyValue when random01() < .9, otherwise left unset
//  - version: 0 on a coin flip, otherwise randomInt64(0, max Version)
//  - chunk: on a coin flip, total = randomInt(1, 100000) and start = randomInt(0, total)
// NOTE(review): maxKeySize and maxValueSize are not consulted; sizes are fixed by the
// randomKV(3, 10) call below. Confirm whether the parameters were meant to be passed through.
Standalone<RedwoodRecordRef> randomRedwoodRecordRef(int maxKeySize = 3, int maxValueSize = 255) {
    RedwoodRecordRef rec;
    KeyValue source = randomKV(3, 10);
    rec.key = source.key;

    bool withValue = deterministicRandom()->random01() < .9;
    if (withValue) {
        rec.value = source.value;
    }

    if (deterministicRandom()->coinflip()) {
        rec.version = 0;
    } else {
        rec.version = deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max());
    }

    bool chunked = deterministicRandom()->coinflip();
    if (chunked) {
        rec.chunk.total = deterministicRandom()->randomInt(1, 100000);
        rec.chunk.start = deterministicRandom()->randomInt(0, rec.chunk.total);
    }

    // Reuse the KeyValue's arena so the key and value bytes outlive `source`.
    return Standalone<RedwoodRecordRef>(rec, source.arena());
}
2019-05-29 21:23:32 +08:00
// Unit test for RedwoodRecordRef.  In order it exercises: the page ID accessors, Delta handling
// through deltaTest() on fixed records, the var-int Writer/Reader, getCommonIntFieldPrefix()
// against getCommonIntFieldPrefix2(), a series of timing loops that print throughput numbers,
// and finally deltaTest() on randomly generated record pairs.
TEST_CASE(" !/redwood/correctness/unit/RedwoodRecordRef ") {
	// The page ID must be carried over by copy assignment and copy construction, and the copies
	// must not change when the source record is modified afterwards.
	{
		LogicalPageID pageID = 1;

		RedwoodRecordRef original;
		original.setPageID(pageID);
		ASSERT(original.getPageID() == pageID);

		RedwoodRecordRef assigned;
		assigned = original;
		ASSERT(assigned.getPageID() == pageID);

		RedwoodRecordRef copied(original);
		ASSERT(copied.getPageID() == pageID);

		original.setPageID(pageID + 1);
		ASSERT(assigned.getPageID() == pageID);
		ASSERT(copied.getPageID() == pageID);
	}

	// deltaTest() on hand-picked record pairs.
	deltaTest(RedwoodRecordRef(LiteralStringRef(" "), 0, LiteralStringRef(" "), 0, 0),
	          RedwoodRecordRef(LiteralStringRef(" "), 0, LiteralStringRef(" "), 0, 0));

	deltaTest(RedwoodRecordRef(LiteralStringRef(" abc "), 0, LiteralStringRef(" "), 0, 0),
	          RedwoodRecordRef(LiteralStringRef(" abc "), 0, LiteralStringRef(" "), 0, 0));

	deltaTest(RedwoodRecordRef(LiteralStringRef(" abc "), 0, LiteralStringRef(" "), 0, 0),
	          RedwoodRecordRef(LiteralStringRef(" abcd "), 0, LiteralStringRef(" "), 0, 0));

	deltaTest(RedwoodRecordRef(LiteralStringRef(" abc "), 2, LiteralStringRef(" "), 0, 0),
	          RedwoodRecordRef(LiteralStringRef(" abc "), 2, LiteralStringRef(" "), 0, 0));

	deltaTest(RedwoodRecordRef(LiteralStringRef(" abc "), 2, LiteralStringRef(" "), 0, 0),
	          RedwoodRecordRef(LiteralStringRef(" ab "), 2, LiteralStringRef(" "), 1, 3));

	deltaTest(RedwoodRecordRef(LiteralStringRef(" abc "), 2, LiteralStringRef(" "), 5, 0),
	          RedwoodRecordRef(LiteralStringRef(" abc "), 2, LiteralStringRef(" "), 5, 1));

	// Var-ints written through a Writer must be read back unchanged by a Reader on the same buffer.
	RedwoodRecordRef::byte varIntBuffer[100];
	RedwoodRecordRef::Writer writer(varIntBuffer);
	RedwoodRecordRef::Reader reader(varIntBuffer);
	writer.writeVarInt(1);
	writer.writeVarInt(128);
	writer.writeVarInt(32000);
	ASSERT(reader.readVarInt() == 1);
	ASSERT(reader.readVarInt() == 128);
	ASSERT(reader.readVarInt() == 32000);

	// Common prefix of the integer fields: the member function is checked against a fixed
	// expected length and against getCommonIntFieldPrefix2() for the same pair of records.
	RedwoodRecordRef lhs;
	RedwoodRecordRef rhs;

	lhs.version = 0x12345678;
	rhs.version = 0x12995678;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 5);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = 0x12345678;
	rhs.version = 0x12345678;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 14);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = invalidVersion;
	rhs.version = 0;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 0);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = 0x12345678;
	rhs.version = 0x12345678;
	lhs.chunk.total = 4;
	rhs.chunk.total = 4;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 14);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = 0x12345678;
	rhs.version = 0x12345678;
	lhs.chunk.start = 4;
	rhs.chunk.start = 4;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 14);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = 0x12345678;
	rhs.version = 0x12345678;
	lhs.chunk.start = 4;
	rhs.chunk.start = 5;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 13);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	lhs.version = 0x12345678;
	rhs.version = 0x12345678;
	lhs.chunk.total = 256;
	rhs.chunk.total = 512;
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == 9);
	ASSERT(lhs.getCommonIntFieldPrefix(rhs) == getCommonIntFieldPrefix2(lhs, rhs));

	Arena mem;

	// Timing loops.  Each one accumulates the operation's results into a running sum that is
	// printed together with the measured rate.

	// getCommonIntFieldPrefix()
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 1e9;
		for (uint64_t n = 0; n < iterations; ++n) {
			lhs.chunk.total = n & 0xffffff;
			rhs.chunk.total = n & 0xffffff;
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			sum += lhs.getCommonIntFieldPrefix(rhs);
		}
		printf(" % " PRId64 " getCommonIntFieldPrefix() %g M/s \n ", sum, iterations / (timer() - began) / 1e6);
	}

	// The remaining loops run with both records sharing the same long key.
	lhs.key = LiteralStringRef(" alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf ");
	rhs.key = LiteralStringRef(" alksdfjaklsdfjlkasdjflkasdjfklajsdflk;ajsdflkajdsflkjadsf ");

	// serializeIntFields()
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 1e9;
		for (uint64_t n = 0; n < iterations; ++n) {
			RedwoodRecordRef::byte serialized[RedwoodRecordRef::intFieldArraySize];
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			lhs.serializeIntFields(serialized);
			sum += serialized[RedwoodRecordRef::intFieldArraySize - 1];
		}
		printf(" % " PRId64 " serializeIntFields() %g M/s \n ", sum, iterations / (timer() - began) / 1e6);
	}

	// getCommonPrefixLen() with skip = 50
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 100e6;
		for (uint64_t n = 0; n < iterations; ++n) {
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			sum += lhs.getCommonPrefixLen(rhs, 50);
		}
		printf(" % " PRId64 " getCommonPrefixLen(skip=50) %g M/s \n ", sum, iterations / (timer() - began) / 1e6);
	}

	// getCommonPrefixLen() with skip = 0
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 100e6;
		for (uint64_t n = 0; n < iterations; ++n) {
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			sum += lhs.getCommonPrefixLen(rhs, 0);
		}
		printf(" % " PRId64 " getCommonPrefixLen(skip=0) %g M/s \n ", sum, iterations / (timer() - began) / 1e6);
	}

	// Scratch space that the two writeDelta() loops write their Delta into.
	char deltaBuffer[1000];
	RedwoodRecordRef::Delta& delta = *(RedwoodRecordRef::Delta*)deltaBuffer;

	// writeDelta() with the common prefix length computed once, up front
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 100e6;
		int commonPrefix = lhs.getCommonPrefixLen(rhs, 0);
		for (uint64_t n = 0; n < iterations; ++n) {
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			sum += lhs.writeDelta(delta, rhs, commonPrefix);
		}
		printf(" % " PRId64 " writeDelta(commonPrefix=%d) %g M/s \n ", sum, commonPrefix, iterations / (timer() - began) / 1e6);
	}

	// writeDelta() without a precomputed common prefix length
	{
		double began = timer();
		uint64_t sum = 0;
		uint64_t iterations = 10e6;
		for (uint64_t n = 0; n < iterations; ++n) {
			lhs.chunk.start = n & 0xffffff;
			rhs.chunk.start = (n + 1) & 0xffffff;
			sum += lhs.writeDelta(delta, rhs);
		}
		printf(" % " PRId64 " writeDelta() %g M/s \n ", sum, iterations / (timer() - began) / 1e6);
	}

	// deltaTest() on pairs of random records
	{
		double began = timer();
		uint64_t iterations = 1e6;
		for (uint64_t n = 0; n < iterations; ++n) {
			Standalone<RedwoodRecordRef> a = randomRedwoodRecordRef();
			Standalone<RedwoodRecordRef> b = randomRedwoodRecordRef();
			deltaTest(a, b);
		}
		printf(" Random deltaTest() %g M/s \n ", iterations / (timer() - began) / 1e6);
	}

	return Void();
}
// Builds a DeltaTree over N random RedwoodRecordRefs (sorted) and verifies that
//  - a forward cursor and a reverse cursor walk exactly the sorted items in lock step, and
//  - seekLessThanOrEqual() for an item that is in the tree lands on that item.
// The elapsed time of the seek loop is printed at the end.
//
// The tree is built inside a std::vector<uint8_t> buffer so the memory is released on every exit
// path, including a failing ASSERT.
TEST_CASE(" !/redwood/correctness/unit/deltaTree/RedwoodRecordRef ") {
	const int N = 200;

	// Boundary records handed to build() and to the Reader.
	RedwoodRecordRef prev;
	RedwoodRecordRef next(LiteralStringRef(" \xff \xff \xff \xff "));

	// Key/value bytes of the generated records live in this arena.
	Arena arena;
	std::vector<RedwoodRecordRef> items;
	for (int i = 0; i < N; ++i) {
		std::string k = deterministicRandom()->randomAlphaNumeric(30);
		std::string v = deterministicRandom()->randomAlphaNumeric(30);

		RedwoodRecordRef rec;
		rec.key = StringRef(arena, k);
		rec.version = deterministicRandom()->coinflip()
		                  ? deterministicRandom()->randomInt64(0, std::numeric_limits<Version>::max())
		                  : invalidVersion;
		if (deterministicRandom()->coinflip()) {
			rec.value = StringRef(arena, v);
			if (deterministicRandom()->coinflip()) {
				rec.chunk.start = deterministicRandom()->randomInt(0, 100000);
				rec.chunk.total = rec.chunk.start + v.size() + deterministicRandom()->randomInt(0, 100000);
			}
		}
		items.push_back(rec);
		//printf("i=%d %s\n", i, items.back().toString().c_str());
	}
	std::sort(items.begin(), items.end());

	// Backing storage for the tree, sized as before (N * 100 bytes) but owned by a vector.
	std::vector<uint8_t> treeBuffer(N * 100);
	DeltaTree<RedwoodRecordRef>* tree = (DeltaTree<RedwoodRecordRef>*)treeBuffer.data();

	// data() + size() forms the end pointer without indexing past the last element.
	tree->build(items.data(), items.data() + items.size(), &prev, &next);

	printf(" Count=%d Size=%d InitialDepth=%d \n ", (int)items.size(), (int)tree->size(), (int)tree->initialDepth);
	debug_printf(" Data(%p): %s \n ", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str());

	DeltaTree<RedwoodRecordRef>::Reader r(tree, &prev, &next);
	DeltaTree<RedwoodRecordRef>::Cursor fwd = r.getCursor();
	DeltaTree<RedwoodRecordRef>::Cursor rev = r.getCursor();

	ASSERT(fwd.moveFirst());
	ASSERT(rev.moveLast());

	// Walk both cursors towards each other's starting end, comparing against the sorted items.
	int i = 0;
	while (1) {
		if (fwd.get() != items[i]) {
			printf(" forward iterator i=%d \n %s found \n %s expected \n ", i, fwd.get().toString().c_str(), items[i].toString().c_str());
			printf(" Delta: %s \n ", fwd.node->raw->delta().toString().c_str());
			ASSERT(false);
		}
		if (rev.get() != items[items.size() - 1 - i]) {
			printf(" reverse iterator i=%d \n %s found \n %s expected \n ", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str());
			printf(" Delta: %s \n ", rev.node->raw->delta().toString().c_str());
			ASSERT(false);
		}
		++i;
		// Both cursors must run out of items at the same step.
		ASSERT(fwd.moveNext() == rev.movePrev());
		ASSERT(fwd.valid() == rev.valid());
		if (!fwd.valid()) {
			break;
		}
	}
	ASSERT(i == items.size());

	// Timed loop: seek to randomly chosen existing items and require an exact match.
	double start = timer();
	DeltaTree<RedwoodRecordRef>::Cursor c = r.getCursor();
	for (int n = 0; n < 20000000; ++n) {
		const RedwoodRecordRef& query = items[deterministicRandom()->randomInt(0, items.size())];
		if (!c.seekLessThanOrEqual(query)) {
			printf(" Not found! query=%s \n ", query.toString().c_str());
			ASSERT(false);
		}
		if (c.get() != query) {
			printf(" Found incorrect node! query=%s found=%s \n ", query.toString().c_str(), c.get().toString().c_str());
			ASSERT(false);
		}
	}
	double elapsed = timer() - start;
	printf(" Elapsed %f \n ", elapsed);

	return Void();
}
2019-05-29 21:23:32 +08:00
// Builds a DeltaTree over N IntIntPairs {i * 10, i * 1000} and verifies that
//  - a forward cursor and a reverse cursor walk exactly the items in lock step, and
//  - seekLessThanOrEqual() for a random key k lands on the item whose key is k rounded down to a
//    multiple of 10.
// The elapsed time of the seek loop is printed at the end.
//
// The tree is built inside a std::vector<uint8_t> buffer so the memory is released on every exit
// path, including a failing ASSERT.
TEST_CASE(" !/redwood/correctness/unit/deltaTree/IntIntPair ") {
	const int N = 200;

	// Boundary records handed to build() and to the Reader.
	IntIntPair prev = { 0, 0 };
	IntIntPair next = { 1000, 0 };

	std::vector<IntIntPair> items;
	for (int i = 0; i < N; ++i) {
		items.push_back({ i * 10, i * 1000 });
		//printf("i=%d %s\n", i, items.back().toString().c_str());
	}

	// Backing storage for the tree, sized as before (10000 bytes) but owned by a vector.
	std::vector<uint8_t> treeBuffer(10000);
	DeltaTree<IntIntPair>* tree = (DeltaTree<IntIntPair>*)treeBuffer.data();

	// data() + size() forms the end pointer without indexing past the last element.
	tree->build(items.data(), items.data() + items.size(), &prev, &next);

	printf(" Count=%d Size=%d InitialDepth=%d \n ", (int)items.size(), (int)tree->size(), (int)tree->initialDepth);
	debug_printf(" Data(%p): %s \n ", tree, StringRef((uint8_t*)tree, tree->size()).toHexString().c_str());

	DeltaTree<IntIntPair>::Reader r(tree, &prev, &next);
	DeltaTree<IntIntPair>::Cursor fwd = r.getCursor();
	DeltaTree<IntIntPair>::Cursor rev = r.getCursor();

	ASSERT(fwd.moveFirst());
	ASSERT(rev.moveLast());

	// Walk both cursors towards each other's starting end, comparing against the items.
	int i = 0;
	while (1) {
		if (fwd.get() != items[i]) {
			printf(" forward iterator i=%d \n %s found \n %s expected \n ", i, fwd.get().toString().c_str(), items[i].toString().c_str());
			ASSERT(false);
		}
		if (rev.get() != items[items.size() - 1 - i]) {
			printf(" reverse iterator i=%d \n %s found \n %s expected \n ", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str());
			ASSERT(false);
		}
		++i;
		// Both cursors must run out of items at the same step.
		ASSERT(fwd.moveNext() == rev.movePrev());
		ASSERT(fwd.valid() == rev.valid());
		if (!fwd.valid()) {
			break;
		}
	}
	ASSERT(i == items.size());

	// Timed loop: seek to random keys below items.size() * 10 and check which item is found.
	DeltaTree<IntIntPair>::Cursor c = r.getCursor();
	double start = timer();
	for (int n = 0; n < 20000000; ++n) {
		IntIntPair p({ deterministicRandom()->randomInt(0, items.size() * 10), 0 });
		if (!c.seekLessThanOrEqual(p)) {
			printf(" Not found! query=%s \n ", p.toString().c_str());
			ASSERT(false);
		}
		// Item keys are multiples of 10, so the expected hit is p.k rounded down to one.
		if (c.get().k != (p.k - (p.k % 10))) {
			printf(" Found incorrect node! query=%s found=%s \n ", p.toString().c_str(), c.get().toString().c_str());
			ASSERT(false);
		}
	}
	double elapsed = timer() - start;
	printf(" Elapsed %f \n ", elapsed);

	return Void();
}
struct SimpleCounter {
SimpleCounter ( ) : x ( 0 ) , xt ( 0 ) , t ( timer ( ) ) , start ( t ) { }
void operator + = ( int n ) { x + = n ; }
void operator + + ( ) { x + + ; }
int64_t get ( ) { return x ; }
double rate ( ) {
double t2 = timer ( ) ;
int r = ( x - xt ) / ( t2 - t ) ;
xt = x ;
t = t2 ;
return r ;
}
double avgRate ( ) { return x / ( timer ( ) - start ) ; }
int64_t x ;
double t ;
double start ;
int64_t xt ;
2019-05-29 21:23:32 +08:00
std : : string toString ( ) { return format ( " % " PRId64 " /%.2f/%.2f " , x , rate ( ) / 1e6 , avgRate ( ) / 1e6 ) ; }
2019-02-21 18:46:30 +08:00
} ;
2019-05-29 21:23:32 +08:00
// Randomized correctness test for VersionedBTree.
// Applies random sets and range clears to the tree while mirroring every mutation in the
// in-memory map `written`, commits in batches of random size, and sends each committed version
// to a verify() actor through `committedVersions`.  When not in serial mode the tree is
// occasionally closed and reopened from disk.  The test throws internal_error() if errorCount
// becomes non-zero.
TEST_CASE ( " !/redwood/correctness/btree " ) {
2018-09-20 10:16:18 +08:00
state bool useDisk = true ; // MemoryPager is not being maintained currently.
2017-07-14 13:11:48 +08:00
2018-07-05 12:12:09 +08:00
state std : : string pagerFile = " unittest_pageFile " ;
2018-10-15 18:43:43 +08:00
IPager * pager ;
2018-08-29 04:46:14 +08:00
2019-06-25 11:17:49 +08:00
// serialTest: after every commit, wait for verification to finish before continuing.
// shortTest: use a small page size, commit size and mutation byte target.
state bool serialTest = deterministicRandom ( ) - > coinflip ( ) ;
state bool shortTest = deterministicRandom ( ) - > coinflip ( ) ;
2019-05-22 10:16:32 +08:00
state bool singleVersion = true ; // Multi-version mode is broken / not finished
2019-05-22 13:19:14 +08:00
state double startTime = now ( ) ;
2019-05-22 10:16:32 +08:00
printf ( " serialTest: %d shortTest: %d singleVersion: %d \n " , serialTest , shortTest , singleVersion ) ;
2018-10-15 18:43:43 +08:00
if ( useDisk ) {
2019-02-24 19:47:32 +08:00
printf ( " Deleting existing test data... \n " ) ;
2018-10-15 18:43:43 +08:00
deleteFile ( pagerFile ) ;
deleteFile ( pagerFile + " 0.pagerlog " ) ;
deleteFile ( pagerFile + " 1.pagerlog " ) ;
2017-09-22 14:51:55 +08:00
pager = new IndirectShadowPager ( pagerFile ) ;
2018-10-15 18:43:43 +08:00
}
2017-07-14 13:11:48 +08:00
else
pager = createMemoryPager ( ) ;
2019-02-24 19:47:32 +08:00
printf ( " Initializing... \n " ) ;
2019-06-25 11:17:49 +08:00
state int pageSize = shortTest ? 200 : ( deterministicRandom ( ) - > coinflip ( ) ? pager - > getUsablePageSize ( ) : deterministicRandom ( ) - > randomInt ( 200 , 400 ) ) ;
2019-04-30 08:00:29 +08:00
state VersionedBTree * btree = new VersionedBTree ( pager , pagerFile , singleVersion , pageSize ) ;
2018-09-20 18:39:55 +08:00
wait ( btree - > init ( ) ) ;
2017-06-10 05:56:41 +08:00
2017-09-06 07:59:31 +08:00
// We must be able to fit at least any two keys plus overhead in a page to prevent
// a situation where the tree cannot be grown upward with decreasing level size.
2018-06-12 16:43:19 +08:00
// TODO: Handle arbitrarily large keys
2019-06-25 11:17:49 +08:00
state int maxKeySize = deterministicRandom ( ) - > randomInt ( 4 , pageSize * 2 ) ;
state int maxValueSize = deterministicRandom ( ) - > randomInt ( 0 , pageSize * 4 ) ;
2019-05-22 10:16:32 +08:00
state int maxCommitSize = shortTest ? 1000 : randomSize ( 10e6 ) ;
state int mutationBytesTarget = shortTest ? 5000 : randomSize ( 50e6 ) ;
2019-06-25 11:17:49 +08:00
state double clearChance = deterministicRandom ( ) - > random01 ( ) * .1 ;
2017-09-06 07:59:31 +08:00
2019-03-15 15:46:09 +08:00
printf ( " Using page size %d, max key size %d, max value size %d, clearchance %f, total mutation byte target %d \n " , pageSize , maxKeySize , maxValueSize , clearChance , mutationBytesTarget ) ;
2017-09-06 07:59:31 +08:00
2017-08-22 13:29:57 +08:00
// written: model of the expected tree contents, keyed by (key, version); an absent Optional
// marks a key cleared at that version.
// keys: every key that has ever been set, used to pick keys close to existing ones.
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > written ;
state std : : set < Key > keys ;
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
state Version lastVer = wait ( btree - > getLatestVersion ( ) ) ;
2019-05-05 01:52:02 +08:00
printf ( " Starting from version: % " PRId64 " \n " , lastVer ) ;
2017-07-14 13:11:48 +08:00
2017-06-10 05:56:41 +08:00
state Version version = lastVer + 1 ;
2018-09-28 07:07:29 +08:00
btree - > setWriteVersion ( version ) ;
2018-09-28 15:35:03 +08:00
2019-02-21 18:46:30 +08:00
state SimpleCounter mutationBytes ;
state SimpleCounter keyBytesInserted ;
state SimpleCounter valueBytesInserted ;
state SimpleCounter sets ;
state SimpleCounter rangeClears ;
state SimpleCounter keyBytesCleared ;
2018-09-28 15:35:03 +08:00
// NOTE(review): errorCount has no initializer here; it is handed to verify() by pointer and
// read below.  Confirm that verify() assigns it before the first read.
state int errorCount ;
2019-02-24 19:47:32 +08:00
state int mutationBytesThisCommit = 0 ;
state int mutationBytesTargetThisCommit = randomSize ( maxCommitSize ) ;
2017-07-26 07:10:19 +08:00
2018-09-28 15:35:03 +08:00
state PromiseStream < Version > committedVersions ;
2019-05-22 10:16:32 +08:00
state Future < Void > verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
state Future < Void > randomTask = serialTest ? Void ( ) : ( randomReader ( btree ) | | btree - > getError ( ) ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
state Future < Void > commit = Void ( ) ;
2019-02-21 18:46:30 +08:00
while ( mutationBytes . get ( ) < mutationBytesTarget ) {
2019-05-22 13:19:14 +08:00
// Time limit: once more than 600 seconds (by now()) have passed, lower the target to what
// has already been written so that this iteration ends with the final commit.
if ( now ( ) - startTime > 600 ) {
mutationBytesTarget = mutationBytes . get ( ) ;
}
2018-09-28 07:07:29 +08:00
// Sometimes advance the version
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < 0.10 ) {
2017-07-15 02:37:08 +08:00
+ + version ;
2017-06-10 05:56:41 +08:00
btree - > setWriteVersion ( version ) ;
}
2018-09-28 07:07:29 +08:00
// Sometimes do a clear range
2019-06-25 11:17:49 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < clearChance ) {
2018-09-28 07:07:29 +08:00
Key start = randomKV ( maxKeySize , 1 ) . key ;
2019-05-11 05:01:52 +08:00
Key end = ( deterministicRandom ( ) - > random01 ( ) < .01 ) ? keyAfter ( start ) : randomKV ( maxKeySize , 1 ) . key ;
2017-06-10 05:56:41 +08:00
2018-09-28 07:07:29 +08:00
// Sometimes replace start and/or end with a close actual (previously used) value
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .10 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( start ) ;
if ( i ! = keys . end ( ) )
start = * i ;
}
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .10 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( end ) ;
if ( i ! = keys . end ( ) )
end = * i ;
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
// Normalize so that start < end: an empty range becomes a single-key range and a
// reversed range is swapped.
if ( end = = start )
end = keyAfter ( start ) ;
else if ( end < start ) {
std : : swap ( end , start ) ;
}
2017-07-14 13:11:48 +08:00
2019-02-21 18:46:30 +08:00
+ + rangeClears ;
2018-09-28 07:07:29 +08:00
KeyRangeRef range ( start , end ) ;
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Clear '%s' to '%s' @% " PRId64 " \n " , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , version ) ;
2018-09-28 07:07:29 +08:00
// Mirror the clear in `written`: walk the entries in [start, end) and, for the last
// entry of each key, record a cleared (not present) entry at `version` if that last
// entry currently holds a value.
auto e = written . lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
if ( e ! = written . end ( ) ) {
auto last = e ;
auto eEnd = written . lower_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
while ( e ! = eEnd ) {
auto w = * e ;
+ + e ;
// If e key is different from last and last was present then insert clear for last's key at version
if ( last ! = eEnd & & ( ( e = = eEnd | | e - > first . first ! = last - > first . first ) & & last - > second . present ( ) ) ) {
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Clearing key '%s' @% " PRId64 " \n " , last - > first . first . c_str ( ) , version ) ;
2019-02-21 18:46:30 +08:00
keyBytesCleared + = last - > first . first . size ( ) ;
2019-02-24 19:47:32 +08:00
mutationBytes + = last - > first . first . size ( ) ;
mutationBytesThisCommit + = last - > first . first . size ( ) ;
2019-02-21 18:46:30 +08:00
2018-09-28 07:07:29 +08:00
// If the last set was at version then just make it not present
if ( last - > first . second = = version ) {
2019-02-21 18:46:30 +08:00
last - > second . reset ( ) ;
2018-09-28 07:07:29 +08:00
}
else {
2019-02-21 18:46:30 +08:00
written [ std : : make_pair ( last - > first . first , version ) ] . reset ( ) ;
2018-09-28 07:07:29 +08:00
}
}
last = e ;
}
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
btree - > clear ( range ) ;
2017-07-14 13:11:48 +08:00
}
2018-09-28 07:07:29 +08:00
else {
// Set a key
KeyValue kv = randomKV ( maxKeySize , maxValueSize ) ;
// Sometimes change key to a close previously used key
2019-05-11 05:01:52 +08:00
if ( deterministicRandom ( ) - > random01 ( ) < .01 ) {
2018-09-28 07:07:29 +08:00
auto i = keys . upper_bound ( kv . key ) ;
if ( i ! = keys . end ( ) )
kv . key = StringRef ( kv . arena ( ) , * i ) ;
}
2019-02-21 18:46:30 +08:00
2019-05-29 21:23:32 +08:00
debug_printf ( " Mutation: Set '%s' -> '%s' @% " PRId64 " \n " , kv . key . toString ( ) . c_str ( ) , kv . value . toString ( ) . c_str ( ) , version ) ;
2019-02-21 18:46:30 +08:00
+ + sets ;
2018-09-28 07:07:29 +08:00
keyBytesInserted + = kv . key . size ( ) ;
2019-02-21 18:46:30 +08:00
valueBytesInserted + = kv . value . size ( ) ;
2018-09-28 07:07:29 +08:00
mutationBytes + = ( kv . key . size ( ) + kv . value . size ( ) ) ;
2019-02-24 19:47:32 +08:00
mutationBytesThisCommit + = ( kv . key . size ( ) + kv . value . size ( ) ) ;
2019-02-21 18:46:30 +08:00
2018-09-28 07:07:29 +08:00
// Apply the set to the tree and to the model.
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
keys . insert ( kv . key ) ;
}
2019-02-24 19:47:32 +08:00
// Commit at end or after this commit's mutation bytes are reached
if ( mutationBytes . get ( ) > = mutationBytesTarget | | mutationBytesThisCommit > = mutationBytesTargetThisCommit ) {
// Wait for previous commit to finish
wait ( commit ) ;
2019-05-29 21:23:32 +08:00
printf ( " Committed. Next commit %d bytes, % " PRId64 " /%d (%.2f%%) Stats: Insert %.2f MB/s ClearedKeys %.2f MB/s Total %.2f \n " ,
2019-02-24 19:47:32 +08:00
mutationBytesThisCommit ,
mutationBytes . get ( ) ,
mutationBytesTarget ,
( double ) mutationBytes . get ( ) / mutationBytesTarget * 100 ,
( keyBytesInserted . rate ( ) + valueBytesInserted . rate ( ) ) / 1e6 ,
keyBytesCleared . rate ( ) / 1e6 ,
mutationBytes . rate ( ) / 1e6
) ;
Version v = version ; // Avoid capture of version as a member of *this
commit = map ( btree - > commit ( ) , [ = ] ( Void ) {
2018-10-02 07:51:57 +08:00
// Notify the background verifier that version is committed and therefore readable
committedVersions . send ( v ) ;
return Void ( ) ;
} ) ;
2019-05-22 10:16:32 +08:00
if ( serialTest ) {
// Wait for commit, wait for verification, then start new verification
wait ( commit ) ;
committedVersions . sendError ( end_of_stream ( ) ) ;
debug_printf ( " Waiting for verification to complete. \n " ) ;
wait ( verifyTask ) ;
committedVersions = PromiseStream < Version > ( ) ;
verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
}
2019-02-24 19:47:32 +08:00
// Start accounting for the next commit batch with a fresh random size target.
mutationBytesThisCommit = 0 ;
mutationBytesTargetThisCommit = randomSize ( maxCommitSize ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
// Recover from disk at random
2019-06-25 11:17:49 +08:00
if ( ! serialTest & & useDisk & & deterministicRandom ( ) - > random01 ( ) < .02 ) {
2018-10-02 07:51:57 +08:00
printf ( " Recovering from disk. \n " ) ;
// Wait for outstanding commit
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
// Stop and wait for the verifier task
committedVersions . sendError ( end_of_stream ( ) ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2018-10-15 18:43:43 +08:00
// Grab the closed future before calling close(), then wait for the close to finish.
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-28 07:07:29 +08:00
wait ( closedFuture ) ;
2018-10-25 06:57:06 +08:00
debug_printf ( " Reopening btree \n " ) ;
2018-10-15 18:43:43 +08:00
// Note: this `pager` is a new local that shadows the one declared at the top of the test.
IPager * pager = new IndirectShadowPager ( pagerFile ) ;
2019-04-30 08:00:29 +08:00
btree = new VersionedBTree ( pager , pagerFile , singleVersion , pageSize ) ;
2018-09-28 07:07:29 +08:00
wait ( btree - > init ( ) ) ;
// The reopened tree must report the version that was just committed.
Version v = wait ( btree - > getLatestVersion ( ) ) ;
ASSERT ( v = = version ) ;
2019-05-05 01:52:02 +08:00
printf ( " Recovered from disk. Latest version % " PRId64 " \n " , v ) ;
2017-06-10 05:56:41 +08:00
2018-09-28 15:35:03 +08:00
// Create new promise stream and start the verifier again
committedVersions = PromiseStream < Version > ( ) ;
2019-05-22 10:16:32 +08:00
verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount , serialTest ) ;
2018-10-15 18:43:43 +08:00
randomTask = randomReader ( btree ) | | btree - > getError ( ) ;
2017-06-10 05:56:41 +08:00
}
2018-09-28 15:35:03 +08:00
// Mutations after a commit go into a new version.
+ + version ;
btree - > setWriteVersion ( version ) ;
2018-09-28 07:07:29 +08:00
}
2018-06-08 18:32:34 +08:00
2019-02-24 19:47:32 +08:00
// Check for errors
if ( errorCount ! = 0 )
throw internal_error ( ) ;
2017-06-10 05:56:41 +08:00
}
2018-10-02 07:51:57 +08:00
// Shutdown: finish the last commit, end the version stream, and let verification complete.
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
committedVersions . sendError ( end_of_stream ( ) ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2019-02-24 19:47:32 +08:00
// Check for errors
if ( errorCount ! = 0 )
throw internal_error ( ) ;
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-07-14 13:11:48 +08:00
return Void ( ) ;
}
2019-06-24 16:05:16 +08:00
// Runs `count` sequential seeks against `btree` and prints the achieved seek rate.
// All seeks use one cursor opened at the version returned by getLatestVersion(); each seek
// calls findFirstEqualOrGreater() with a key produced by randomString().
//   btree: tree to read from
//   count: number of seeks to perform
ACTOR Future < Void > randomSeeks ( VersionedBTree * btree , int count ) {
2019-03-15 15:46:09 +08:00
state Version readVer = wait ( btree - > getLatestVersion ( ) ) ;
state int c = 0 ;
state double readStart = timer ( ) ;
2019-06-24 16:05:16 +08:00
printf ( " Executing %d random seeks \n " , count ) ;
2019-03-15 15:46:09 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( readVer ) ;
while ( c < count ) {
2019-06-24 16:05:16 +08:00
// k is a state variable so that it stays alive across the wait() below.
state Key k = randomString ( 20 , ' a ' , ' b ' ) ;
// Only completion of the seek matters here; its result is discarded via success().
wait ( success ( cur - > findFirstEqualOrGreater ( k , false , 0 ) ) ) ;
2019-03-15 15:46:09 +08:00
+ + c ;
}
double elapsed = timer ( ) - readStart ;
printf ( " Point read speed %d/s \n " , int ( count / elapsed ) ) ;
return Void ( ) ;
}
2018-10-06 13:13:22 +08:00
// Write-throughput test for VersionedBTree.
// Starting from a freshly created pager file, repeatedly sets batches of random key/value
// pairs at a new version, commits whenever more than 2e6 key/value bytes have accumulated, and
// prints per-commit and cumulative throughput until kvBytesTarget bytes have been handed to
// commits.  Afterwards three randomSeeks() actors run concurrently and the tree is closed.
TEST_CASE ( " !/redwood/performance/set " ) {
2018-10-15 18:43:43 +08:00
state std : : string pagerFile = " unittest_pageFile " ;
2019-02-24 19:47:32 +08:00
printf ( " Deleting old test data \n " ) ;
2018-10-15 18:43:43 +08:00
deleteFile ( pagerFile ) ;
deleteFile ( pagerFile + " 0.pagerlog " ) ;
deleteFile ( pagerFile + " 1.pagerlog " ) ;
2019-04-30 08:00:29 +08:00
2018-10-15 18:43:43 +08:00
IPager * pager = new IndirectShadowPager ( pagerFile ) ;
2019-04-30 08:00:29 +08:00
state bool singleVersion = true ;
state VersionedBTree * btree = new VersionedBTree ( pager , " unittest_pageFile " , singleVersion ) ;
2018-09-20 18:39:55 +08:00
wait ( btree - > init ( ) ) ;
2017-07-14 13:11:48 +08:00
2019-03-15 15:46:09 +08:00
// Workload parameters.
//   nodeCount:            upper bound for the random index written into the key suffix
//   maxChangesPerVersion: upper bound for the random number of sets per version
//   kvBytesTarget:        total key/value bytes to write before stopping
//   maxKeyPrefixSize:     upper bound on the random part of the key ahead of the 4-byte suffix
//   maxValueSize:         length of the value buffer that random-length values are cut from
//   maxConsecutiveRun:    upper bound on the number of consecutive indexes written per key
state int nodeCount = 1e9 ;
2019-06-24 16:05:16 +08:00
state int maxChangesPerVersion = 500000 ;
2019-04-30 08:00:29 +08:00
state int64_t kvBytesTarget = 200e6 ;
2019-06-24 16:05:16 +08:00
state int maxKeyPrefixSize = 50 ;
state int maxValueSize = 100 ;
state int maxConsecutiveRun = 1 ;
2018-06-15 08:52:25 +08:00
// kvBytes and records count what has been set since the last commit was started;
// kvBytesTotal accumulates kvBytes each time a commit is started.
state int64_t kvBytes = 0 ;
2019-03-15 15:46:09 +08:00
state int64_t kvBytesTotal = 0 ;
2018-06-15 08:52:25 +08:00
state int records = 0 ;
2018-10-02 07:51:57 +08:00
state Future < Void > commit = Void ( ) ;
2019-06-24 16:05:16 +08:00
state std : : string value ( maxValueSize , ' v ' ) ;
2017-07-14 13:11:48 +08:00
2019-02-24 19:47:32 +08:00
printf ( " Starting. \n " ) ;
state double intervalStart = timer ( ) ;
2019-06-24 16:05:16 +08:00
state double start = intervalStart ;
2019-02-24 19:47:32 +08:00
2019-03-15 15:46:09 +08:00
while ( kvBytesTotal < kvBytesTarget ) {
2017-07-14 13:11:48 +08:00
// Each pass of the loop writes at the version following the latest one.
Version lastVer = wait ( btree - > getLatestVersion ( ) ) ;
state Version version = lastVer + 1 ;
btree - > setWriteVersion ( version ) ;
2019-05-11 05:01:52 +08:00
int changes = deterministicRandom ( ) - > randomInt ( 0 , maxChangesPerVersion ) ;
2019-06-24 16:05:16 +08:00
while ( changes > 0 ) {
2017-07-14 13:11:48 +08:00
KeyValue kv ;
2019-06-25 11:17:49 +08:00
// The random key is at least sizeof(uint32_t) bytes long so that its last four bytes
// can be overwritten with the index below.
kv . key = randomString ( kv . arena ( ) , deterministicRandom ( ) - > randomInt ( sizeof ( uint32_t ) , maxKeyPrefixSize + sizeof ( uint32_t ) + 1 ) , ' a ' , ' b ' ) ;
int32_t index = deterministicRandom ( ) - > randomInt ( 0 , nodeCount ) ;
int runLength = deterministicRandom ( ) - > randomInt ( 1 , maxConsecutiveRun + 1 ) ;
2019-06-24 16:05:16 +08:00
while ( runLength > 0 & & changes > 0 ) {
// Overwrite the key's trailing four bytes with the big-endian index, then advance the index.
* ( uint32_t * ) ( kv . key . end ( ) - sizeof ( uint32_t ) ) = bigEndian32 ( index + + ) ;
2019-06-25 11:17:49 +08:00
// The value is a random-length prefix of the shared `value` buffer (not copied).
kv . value = StringRef ( ( uint8_t * ) value . data ( ) , deterministicRandom ( ) - > randomInt ( 0 , value . size ( ) ) ) ;
2019-06-24 16:05:16 +08:00
btree - > set ( kv ) ;
- - runLength ;
- - changes ;
kvBytes + = kv . key . size ( ) + kv . value . size ( ) ;
+ + records ;
2019-04-30 08:00:29 +08:00
}
2017-07-14 13:11:48 +08:00
}
2019-06-24 16:05:16 +08:00
// Start a commit once more than 2e6 key/value bytes are pending.
if ( kvBytes > 2e6 ) {
2018-10-02 07:51:57 +08:00
// Wait for the previous commit before starting the next one.
wait ( commit ) ;
2019-06-24 16:05:16 +08:00
printf ( " Cumulative %.2f MB keyValue bytes written at %.2f MB/s \n " , kvBytesTotal / 1e6 , kvBytesTotal / ( timer ( ) - start ) / 1e6 ) ;
2019-05-22 14:49:27 +08:00
2019-06-24 16:05:16 +08:00
// Avoid capturing via this to freeze counter values
2019-02-24 19:47:32 +08:00
int recs = records ;
int kvb = kvBytes ;
2019-06-18 09:55:49 +08:00
// Capturing intervalStart via this->intervalStart makes IDEs unhappy as they do not know about the actor state object
double * pIntervalStart = & intervalStart ;
2019-02-24 19:47:32 +08:00
commit = map ( btree - > commit ( ) , [ = ] ( Void result ) {
2019-06-18 09:55:49 +08:00
double elapsed = timer ( ) - * pIntervalStart ;
2019-02-24 19:47:32 +08:00
printf ( " Committed %d kvBytes in %d records in %f seconds, %.2f MB/s \n " , kvb , recs , elapsed , kvb / elapsed / 1e6 ) ;
2019-06-18 09:55:49 +08:00
// The next commit's interval is measured from the completion of this one.
* pIntervalStart = timer ( ) ;
2019-02-24 19:47:32 +08:00
return Void ( ) ;
} ) ;
2019-06-24 16:05:16 +08:00
kvBytesTotal + = kvBytes ;
2019-02-24 19:47:32 +08:00
kvBytes = 0 ;
2019-06-24 16:05:16 +08:00
records = 0 ;
2017-07-14 13:11:48 +08:00
}
}
2019-02-24 19:47:32 +08:00
// Let the last commit finish before reporting the final totals.
wait ( commit ) ;
2019-06-25 11:17:49 +08:00
printf ( " Cumulative %.2f MB keyValue bytes written at %.2f MB/s \n " , kvBytesTotal / 1e6 , kvBytesTotal / ( timer ( ) - start ) / 1e6 ) ;
2017-07-14 13:11:48 +08:00
2019-06-24 16:05:16 +08:00
// Three seek actors are started together and waited on as a group.
state int reads = 30000 ;
wait ( randomSeeks ( btree , reads ) & & randomSeeks ( btree , reads ) & & randomSeeks ( btree , reads ) ) ;
2017-07-14 13:11:48 +08:00
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-07-14 13:11:48 +08:00
2017-06-10 05:56:41 +08:00
return Void ( ) ;
}