2017-06-10 05:56:41 +08:00
/*
* VersionedBTree . actor . cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013 - 2018 Apple Inc . and the FoundationDB project authors
*
* Licensed under the Apache License , Version 2.0 ( the " License " ) ;
* you may not use this file except in compliance with the License .
* You may obtain a copy of the License at
*
* http : //www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS ,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
* See the License for the specific language governing permissions and
* limitations under the License .
*/
# include "flow/flow.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IVersionedStore.h"
# include "fdbserver/IPager.h"
2017-06-10 05:56:41 +08:00
# include "fdbclient/Tuple.h"
# include "flow/serialize.h"
# include "flow/genericactors.actor.h"
# include "flow/UnitTest.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/MemoryPager.h"
# include "fdbserver/IndirectShadowPager.h"
2017-06-10 05:56:41 +08:00
# include <map>
# include <vector>
2017-08-04 15:01:25 +08:00
# include "fdbclient/CommitTransaction.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/IKeyValueStore.h"
# include "fdbserver/PrefixTree.h"
2018-07-23 18:09:13 +08:00
# include <string.h>
2018-10-19 11:26:45 +08:00
# include "flow/actorcompiler.h"
2019-05-05 01:52:02 +08:00
# include <cinttypes>
2018-07-23 18:09:13 +08:00
// Convenience method for converting a Standalone to a Ref while adding its arena to another arena.
2018-07-25 17:29:17 +08:00
template < typename T > inline const Standalone < T > & dependsOn ( Arena & arena , const Standalone < T > & s ) {
2018-07-23 18:09:13 +08:00
arena . dependsOn ( s . arena ( ) ) ;
return s ;
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
struct BTreePage {
enum EPageFlags { IS_LEAF = 1 } ;
2017-06-10 05:56:41 +08:00
2018-10-19 11:26:45 +08:00
# pragma pack(push,4)
2018-06-08 18:32:34 +08:00
uint8_t flags ;
uint16_t count ;
uint32_t kvBytes ;
2018-09-19 15:32:39 +08:00
uint8_t extensionPageCount ;
LogicalPageID extensionPages [ 0 ] ;
2018-10-19 11:26:45 +08:00
# pragma pack(pop)
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
PrefixTree & tree ( ) {
return * ( PrefixTree * ) ( extensionPages + extensionPageCount ) ;
}
const PrefixTree & tree ( ) const {
return * ( const PrefixTree * ) ( extensionPages + extensionPageCount ) ;
}
static inline int GetHeaderSize ( int extensionPages = 0 ) {
return sizeof ( BTreePage ) + extensionPages + sizeof ( LogicalPageID ) ;
2018-06-14 19:15:14 +08:00
}
2018-08-29 04:46:14 +08:00
std : : string toString ( bool write , LogicalPageID id , Version ver , StringRef lowerBoundKey , StringRef upperBoundKey ) const {
2018-06-08 18:32:34 +08:00
std : : string r ;
2018-08-29 04:46:14 +08:00
r + = format ( " BTreePage op=%s id=%d ver=%lld ptr=%p flags=0x%X count=%d kvBytes=%d \n lowerBoundKey='%s' \n upperBoundKey='%s' " ,
write ? " write " : " read " , id , ver , this , ( int ) flags , ( int ) count , ( int ) kvBytes ,
2018-09-24 17:42:23 +08:00
lowerBoundKey . toHexString ( 20 ) . c_str ( ) , upperBoundKey . toHexString ( 20 ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
try {
if ( count > 0 ) {
2018-09-19 15:32:39 +08:00
PrefixTree : : Cursor c = tree ( ) . getCursor ( lowerBoundKey , upperBoundKey ) ;
2018-08-29 04:46:14 +08:00
c . moveFirst ( ) ;
ASSERT ( c . valid ( ) ) ;
do {
r + = " \n " ;
Tuple t ;
try {
t = Tuple : : unpack ( c . getKeyRef ( ) ) ;
for ( int i = 0 ; i < t . size ( ) ; + + i ) {
if ( i ! = 0 )
r + = " , " ;
if ( t . getType ( i ) = = Tuple : : ElementType : : BYTES )
r + = format ( " '%s' " , t . getString ( i ) . printable ( ) . c_str ( ) ) ;
if ( t . getType ( i ) = = Tuple : : ElementType : : INT )
r + = format ( " %lld " , t . getInt ( i , true ) ) ;
}
2019-03-27 00:58:54 +08:00
} catch ( Error & ) {
2018-07-18 18:19:35 +08:00
}
2018-09-24 17:42:23 +08:00
r + = format ( " ['%s'] " , c . getKeyRef ( ) . toHexString ( 20 ) . c_str ( ) ) ;
2018-07-18 18:19:35 +08:00
2018-08-29 04:46:14 +08:00
r + = " -> " ;
2018-12-15 06:43:50 +08:00
if ( flags & IS_LEAF )
2018-09-24 17:42:23 +08:00
r + = format ( " '%s' " , c . getValueRef ( ) . toHexString ( 20 ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
else
2018-09-19 15:32:39 +08:00
r + = format ( " Page id=%u " , * ( const uint32_t * ) c . getValueRef ( ) . begin ( ) ) ;
2017-06-10 05:56:41 +08:00
2018-08-29 04:46:14 +08:00
} while ( c . moveNext ( ) ) ;
}
2019-04-18 03:57:23 +08:00
} catch ( Error & e ) {
2018-08-29 04:46:14 +08:00
debug_printf ( " BTreePage::toString ERROR: %s \n " , e . what ( ) ) ;
debug_printf ( " BTreePage::toString partial result: %s \n " , r . c_str ( ) ) ;
throw ;
2018-06-08 18:32:34 +08:00
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
return r ;
2017-06-10 05:56:41 +08:00
}
2018-10-19 11:26:45 +08:00
} ;
2018-06-08 18:32:34 +08:00
// Initializes `page` as an empty BTreePage carrying `newFlags`: no records, no key/value bytes,
// no extension pages, and an empty prefix tree built in place.
// `pageSize` is accepted but not used by this function.
static void writeEmptyPage(Reference<IPage> page, uint8_t newFlags, int pageSize) {
	BTreePage &header = *(BTreePage *)page->begin();
	header.flags = newFlags;
	header.count = 0;
	header.kvBytes = 0;
	// tree() locates the prefix tree using extensionPageCount, so set it before building
	header.extensionPageCount = 0;
	header.tree().build(nullptr, nullptr, StringRef(), StringRef());
}
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// One encoded page produced by buildPages(), together with the lower boundary key of the
// key range that page covers.
struct BoundaryAndPage {
2018-08-29 04:46:14 +08:00
Key lowerBound ;
2018-09-19 15:32:39 +08:00
// firstPage is the first (or only) block of the encoded page. extPages holds the remaining
// blocks when buildPages() emits a page spanning several blocks, and is empty otherwise.
Reference < IPage > firstPage ;
std : : vector < Reference < IPage > > extPages ;
2018-08-29 04:46:14 +08:00
} ;
2018-07-10 17:24:01 +08:00
// Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages.
2018-06-08 18:32:34 +08:00
template < typename Allocator >
2018-09-19 15:32:39 +08:00
static std : : vector < BoundaryAndPage > buildPages ( bool minimalBoundaries , StringRef lowerBound , StringRef upperBound , std : : vector < PrefixTree : : EntryRef > entries , uint8_t newFlags , Allocator const & newBlockFn , int usableBlockSize ) {
// This is how much space for the binary tree exists in the page, after the header
int pageSize = usableBlockSize - ( BTreePage : : GetHeaderSize ( ) + PrefixTree : : GetHeaderSize ( ) ) ;
// Each new block adds (usableBlockSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize.
int netTreeBlockSize = usableBlockSize - sizeof ( LogicalPageID ) ;
2018-07-15 04:37:52 +08:00
2018-09-19 15:32:39 +08:00
int blockCount = 1 ;
std : : vector < BoundaryAndPage > pages ;
2018-06-08 18:32:34 +08:00
2018-07-15 04:37:52 +08:00
// TODO: Move all of this abstraction breaking stuff into PrefixTree in the form of a helper function or class.
int kvBytes = 0 ; // User key/value bytes in page
2018-08-29 04:46:14 +08:00
int compressedBytes = 0 ; // Conservative estimate of size of compressed page. TODO: Make this exactly right if possible
2018-06-15 08:52:25 +08:00
2018-06-08 18:32:34 +08:00
int start = 0 ;
2018-06-15 08:52:25 +08:00
int i = 0 ;
2018-06-08 18:32:34 +08:00
const int iEnd = entries . size ( ) ;
2018-08-29 04:46:14 +08:00
// Lower bound of the page being added to
Key pageLowerBound = lowerBound ;
Key pageUpperBound ;
while ( i < = iEnd ) {
bool end = i = = iEnd ;
bool flush = end ;
// If not the end, add i to the page if necessary
if ( end ) {
pageUpperBound = upperBound ;
}
else {
// Common prefix with previous record
const PrefixTree : : EntryRef & entry = entries [ i ] ;
int prefixLen = commonPrefixLength ( entry . key , ( i = = start ) ? pageLowerBound : entries [ i - 1 ] . key ) ;
int keySize = entry . key . size ( ) ;
int valueSize = entry . value . size ( ) ;
2018-09-19 15:32:39 +08:00
int spaceNeeded = valueSize + keySize - prefixLen + PrefixTree : : Node : : getMaxOverhead ( i , entry . key . size ( ) , entry . value . size ( ) ) ;
2018-08-29 04:46:14 +08:00
2018-09-19 15:32:39 +08:00
debug_printf ( " Trying to add record %d of %lu (i=%d) klen %d vlen %d prefixLen %d spaceNeeded %d usedSoFar %d/%d '%s' \n " ,
2018-08-29 04:46:14 +08:00
i + 1 , entries . size ( ) , i , keySize , valueSize , prefixLen ,
2018-09-19 15:32:39 +08:00
spaceNeeded , compressedBytes , pageSize , entry . key . toHexString ( 15 ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
int spaceAvailable = pageSize - compressedBytes ;
2018-09-19 15:32:39 +08:00
// Does it fit?
bool fits = spaceAvailable > = spaceNeeded ;
2018-08-29 04:46:14 +08:00
2018-09-19 15:32:39 +08:00
// If it doesn't fit, either end the current page or increase the page size
2018-08-29 04:46:14 +08:00
if ( ! fits ) {
2018-09-19 15:32:39 +08:00
// For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor
int minimumEntries = minimalBoundaries ? 1 : 4 ;
int count = i - start ;
// If not enough entries or page less than half full, increase page size to make the entry fit
if ( count < minimumEntries | | spaceAvailable > pageSize / 2 ) {
// Figure out how many additional whole or partial blocks are needed
int newBlocks = 1 + ( spaceNeeded - spaceAvailable - 1 ) / netTreeBlockSize ;
2018-09-24 17:42:23 +08:00
int newPageSize = pageSize + ( newBlocks * netTreeBlockSize ) ;
if ( newPageSize < = PrefixTree : : MaximumTreeSize ( ) ) {
blockCount + = newBlocks ;
pageSize = newPageSize ;
fits = true ;
}
2018-08-29 04:46:14 +08:00
}
2018-09-24 17:42:23 +08:00
if ( ! fits ) {
2018-09-19 15:32:39 +08:00
// Flush page
2018-08-29 04:46:14 +08:00
if ( minimalBoundaries ) {
// Note that prefixLen is guaranteed to be < entry.key.size() because entries are in increasing order and cannot repeat.
int len = prefixLen + 1 ;
if ( entry . key [ prefixLen ] = = 0 )
len = std : : min ( len + 1 , entry . key . size ( ) ) ;
pageUpperBound = entry . key . substr ( 0 , len ) ;
}
else {
pageUpperBound = entry . key ;
}
}
}
2018-07-15 04:37:52 +08:00
2018-09-19 15:32:39 +08:00
// If the record fits then add it to the page set
if ( fits ) {
2018-08-29 04:46:14 +08:00
kvBytes + = keySize + valueSize ;
2018-09-19 15:32:39 +08:00
compressedBytes + = spaceNeeded ;
2018-08-29 04:46:14 +08:00
+ + i ;
}
2018-07-15 04:37:52 +08:00
2018-08-29 04:46:14 +08:00
flush = ! fits ;
}
2018-07-15 04:37:52 +08:00
2018-08-29 04:46:14 +08:00
// If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above.
if ( flush ) {
end = i = = iEnd ; // i could have been moved above
2018-09-24 17:42:23 +08:00
debug_printf ( " Flushing page start=%d i=%d \n lower='%s' \n upper='%s' \n " , start , i , pageLowerBound . toHexString ( 20 ) . c_str ( ) , pageUpperBound . toHexString ( 20 ) . c_str ( ) ) ;
2018-08-29 04:46:14 +08:00
ASSERT ( pageLowerBound < = pageUpperBound ) ;
for ( int j = start ; j < i ; + + j ) {
debug_printf ( " %d: %s -> %s \n " , j , entries [ j ] . key . toHexString ( 15 ) . c_str ( ) , entries [ j ] . value . toHexString ( 15 ) . c_str ( ) ) ;
}
2018-09-19 15:32:39 +08:00
union {
BTreePage * btPage ;
uint8_t * btPageMem ;
} ;
if ( blockCount = = 1 ) {
Reference < IPage > page = newBlockFn ( ) ;
btPageMem = page - > mutate ( ) ;
pages . push_back ( { std : : move ( pageLowerBound ) , std : : move ( page ) } ) ;
}
else {
ASSERT ( blockCount > 1 ) ;
2018-09-20 09:54:50 +08:00
btPageMem = new uint8_t [ usableBlockSize * blockCount ] ;
# if VALGRIND
// Prevent valgrind errors caused by writing random unneeded bytes to disk.
memset ( btPageMem , 0 , usableBlockSize * blockCount ) ;
# endif
2018-09-19 15:32:39 +08:00
}
btPage - > flags = newFlags ;
btPage - > kvBytes = kvBytes ;
btPage - > count = i - start ;
btPage - > extensionPageCount = blockCount - 1 ;
int written = btPage - > tree ( ) . build ( & entries [ start ] , & entries [ i ] , pageLowerBound , pageUpperBound ) ;
2018-06-12 16:43:19 +08:00
if ( written > pageSize ) {
2018-09-19 15:32:39 +08:00
fprintf ( stderr , " ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d \n " , written , pageSize , blockCount , i - start , kvBytes , compressedBytes ) ;
2018-06-12 16:43:19 +08:00
ASSERT ( false ) ;
}
2018-09-19 15:32:39 +08:00
if ( blockCount ! = 1 ) {
Reference < IPage > page = newBlockFn ( ) ;
const uint8_t * rptr = btPageMem ;
memcpy ( page - > mutate ( ) , rptr , usableBlockSize ) ;
rptr + = usableBlockSize ;
std : : vector < Reference < IPage > > extPages ;
for ( int b = 1 ; b < blockCount ; + + b ) {
Reference < IPage > extPage = newBlockFn ( ) ;
//debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usableBlockSize);
memcpy ( extPage - > mutate ( ) , rptr , usableBlockSize ) ;
rptr + = usableBlockSize ;
extPages . push_back ( std : : move ( extPage ) ) ;
}
pages . push_back ( { std : : move ( pageLowerBound ) , std : : move ( page ) , std : : move ( extPages ) } ) ;
delete btPageMem ;
2018-08-29 04:46:14 +08:00
}
2018-09-19 15:32:39 +08:00
if ( end )
break ;
2018-06-08 18:32:34 +08:00
start = i ;
kvBytes = 0 ;
2018-08-29 04:46:14 +08:00
compressedBytes = 0 ;
2018-07-15 04:37:52 +08:00
pageLowerBound = pageUpperBound ;
2017-06-10 05:56:41 +08:00
}
}
2018-06-08 18:32:34 +08:00
//debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size());
return pages ;
}
2017-06-10 05:56:41 +08:00
2018-07-23 18:09:13 +08:00
// Internal key/value records represent either a cleared key at a version or a shard of a value of a key at a version.
// When constructing and packing these it is assumed that the key and value memory is being held elsewhere.
struct KeyVersionValueRef {
2018-07-23 18:49:40 +08:00
KeyVersionValueRef ( ) : version ( invalidVersion ) { }
2018-07-23 18:09:13 +08:00
// Cleared key at version
2018-07-23 18:49:40 +08:00
KeyVersionValueRef ( KeyRef key , Version ver , Optional < ValueRef > val = { } )
: key ( key ) , version ( ver ) , value ( val ) , valueIndex ( 0 )
2018-07-23 18:09:13 +08:00
{
2018-07-23 18:49:40 +08:00
if ( value . present ( ) )
2018-07-23 18:09:13 +08:00
valueTotalSize = value . get ( ) . size ( ) ;
}
KeyVersionValueRef ( Arena & a , const KeyVersionValueRef & toCopy ) {
key = KeyRef ( a , toCopy . key ) ;
version = toCopy . version ;
if ( toCopy . value . present ( ) ) {
value = ValueRef ( a , toCopy . value . get ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2018-07-23 18:09:13 +08:00
valueTotalSize = toCopy . valueTotalSize ;
valueIndex = toCopy . valueIndex ;
}
static inline Key searchKey ( StringRef key , Version ver ) {
Tuple t ;
t . append ( key ) ;
t . append ( ver ) ;
Standalone < VectorRef < uint8_t > > packed = t . getData ( ) ;
packed . append ( packed . arena ( ) , ( const uint8_t * ) " \xff " , 1 ) ;
return Key ( KeyRef ( packed . begin ( ) , packed . size ( ) ) , packed . arena ( ) ) ;
}
KeyRef key ;
2017-08-25 08:25:53 +08:00
Version version ;
2018-07-23 18:09:13 +08:00
int64_t valueTotalSize ; // Total size of value, including all other KVV parts if multipart
int64_t valueIndex ; // Index within reconstituted value of this part
Optional < ValueRef > value ;
2017-08-25 08:25:53 +08:00
2018-07-23 18:09:13 +08:00
// Result undefined if value is not present
bool isMultiPart ( ) const { return value . get ( ) . size ( ) ! = valueTotalSize ; }
2017-08-28 16:57:01 +08:00
bool valid ( ) const { return version ! = invalidVersion ; }
2017-08-25 08:25:53 +08:00
2018-07-23 18:09:13 +08:00
// Generate a kv shard from a complete kv
KeyVersionValueRef split ( int start , int len ) {
ASSERT ( value . present ( ) ) ;
2018-07-23 18:49:40 +08:00
KeyVersionValueRef r ( key , version ) ;
r . value = value . get ( ) . substr ( start , len ) ;
r . valueIndex = start ;
r . valueTotalSize = valueTotalSize ;
return r ;
2018-07-23 18:09:13 +08:00
}
// Encode the record for writing to a btree page.
// If copyValue is false, the value is not copied into the returned arena.
//
// Encoded forms:
// userKey, version - the value is present and complete (which includes an empty value)
// userKey, version, valueSize=0 - the key was deleted as of this version
// userKey, version, valueSize>=0, valuePart - the value is present and spans multiple records
inline PrefixTree : : Entry pack ( bool copyValue = true ) const {
Tuple t ;
t . append ( key ) ;
t . append ( version ) ;
if ( ! value . present ( ) ) {
t . append ( 0 ) ;
}
else {
if ( isMultiPart ( ) ) {
t . append ( valueTotalSize ) ;
t . append ( valueIndex ) ;
}
}
Key k = t . getDataAsStandalone ( ) ;
ValueRef v ;
if ( value . present ( ) ) {
v = copyValue ? StringRef ( k . arena ( ) , value . get ( ) ) : value . get ( ) ;
}
return PrefixTree : : Entry ( { k , v } , k . arena ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2017-09-06 07:59:31 +08:00
// Supports partial/incomplete encoded sequences.
2018-07-23 18:09:13 +08:00
// Unpack an encoded key/value pair.
// Both key and value will be in the returned arena unless copyValue is false in which case
// the value will not be copied to the arena.
static Standalone < KeyVersionValueRef > unpack ( KeyValueRef kv , bool copyValue = true ) {
2018-07-25 17:29:17 +08:00
//debug_printf("Unpacking: '%s' -> '%s' \n", kv.key.toHexString(15).c_str(), kv.value.toHexString(15).c_str());
2018-07-23 18:09:13 +08:00
Standalone < KeyVersionValueRef > result ;
2017-09-06 07:59:31 +08:00
if ( kv . key . size ( ) ! = 0 ) {
2018-07-25 17:29:17 +08:00
# if REDWOOD_DEBUG
2018-09-24 17:42:23 +08:00
try { Tuple t = Tuple : : unpack ( kv . key ) ; } catch ( Error & e ) { debug_printf ( " UNPACK FAIL %s %s \n " , kv . key . toHexString ( 20 ) . c_str ( ) , platform : : get_backtrace ( ) . c_str ( ) ) ; }
2018-07-25 17:29:17 +08:00
# endif
2018-07-18 18:19:35 +08:00
Tuple k = Tuple : : unpack ( kv . key ) ;
2018-07-23 18:09:13 +08:00
int s = k . size ( ) ;
switch ( s ) {
case 4 :
// Value shard
result . valueTotalSize = k . getInt ( 2 ) ;
2018-07-25 17:29:17 +08:00
result . valueIndex = k . getInt ( 3 , true ) ;
2018-07-23 18:09:13 +08:00
result . value = kv . value ;
break ;
case 3 :
// Deleted or Complete value
result . valueIndex = 0 ;
2018-07-25 17:29:17 +08:00
result . valueTotalSize = k . getInt ( 2 , true ) ;
2018-07-23 18:09:13 +08:00
// If not a clear, set the value, otherwise it remains non-present
if ( result . valueTotalSize ! = 0 )
result . value = kv . value ;
break ;
default :
result . valueIndex = 0 ;
result . valueTotalSize = kv . value . size ( ) ;
result . value = kv . value ;
break ;
} ;
if ( s > 0 ) {
Key sk = k . getString ( 0 ) ;
result . arena ( ) . dependsOn ( sk . arena ( ) ) ;
result . key = sk ;
if ( s > 1 ) {
2018-07-25 17:29:17 +08:00
result . version = k . getInt ( 1 , true ) ;
2017-09-06 07:59:31 +08:00
}
}
}
2018-07-23 18:09:13 +08:00
if ( copyValue & & result . value . present ( ) ) {
result . value = StringRef ( result . arena ( ) , result . value . get ( ) ) ;
2017-09-06 07:59:31 +08:00
}
return result ;
2017-08-25 08:25:53 +08:00
}
2018-07-23 18:09:13 +08:00
static Standalone < KeyVersionValueRef > unpack ( KeyRef k ) {
return unpack ( KeyValueRef ( k , StringRef ( ) ) ) ;
2017-08-25 08:25:53 +08:00
}
std : : string toString ( ) const {
2018-07-23 18:09:13 +08:00
std : : string r ;
2018-08-29 04:46:14 +08:00
r + = format ( " '%s' @%lld -> " , key . toHexString ( 15 ) . c_str ( ) , version ) ;
r + = value . present ( ) ? format ( " '%s' %d/%d " , value . get ( ) . toHexString ( 15 ) . c_str ( ) , valueIndex , valueTotalSize ) . c_str ( ) : " <cleared> " ;
2018-07-23 18:09:13 +08:00
return r ;
2017-08-25 08:25:53 +08:00
}
} ;
2018-07-23 18:09:13 +08:00
typedef Standalone < KeyVersionValueRef > KeyVersionValue ;
2017-06-10 05:56:41 +08:00
# define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); }
class VersionedBTree : public IVersionedStore {
public :
2018-07-15 04:37:52 +08:00
// The first possible internal record possible in the tree
2018-07-23 18:09:13 +08:00
static KeyVersionValueRef beginKVV ;
2018-07-15 04:37:52 +08:00
// A record which is greater than the last possible record in the tree
2018-07-23 18:09:13 +08:00
static KeyVersionValueRef endKVV ;
2018-07-15 04:37:52 +08:00
// The encoded key form of the above two things.
static Key beginKey ;
static Key endKey ;
2017-06-10 05:56:41 +08:00
2018-10-25 06:57:06 +08:00
// All async opts on the btree are based on pager reads, writes, and commits, so
// we can mostly forward these next few functions to the pager
2018-10-15 18:43:43 +08:00
// Forwards to the pager's error future.
virtual Future<Void> getError() { return m_pager->getError(); }
// Forwards to the pager's onClosed future.
virtual Future<Void> onClosed() { return m_pager->onClosed(); }
2018-10-25 06:57:06 +08:00
void close_impl ( bool dispose ) {
IPager * pager = m_pager ;
delete this ;
if ( dispose )
pager - > dispose ( ) ;
else
pager - > close ( ) ;
2018-10-15 18:43:43 +08:00
}
2018-10-25 06:57:06 +08:00
// Deletes this btree and disposes the pager (see close_impl).
virtual void dispose() { close_impl(true); }
virtual void close ( ) {
2018-10-25 06:57:06 +08:00
return close_impl ( false ) ;
2018-10-15 18:43:43 +08:00
}
2017-06-10 05:56:41 +08:00
// Not implemented: NOT_IMPLEMENTED supplies a body that fails an UNSTOPPABLE_ASSERT if called.
virtual KeyValueStoreType getType ( ) NOT_IMPLEMENTED
// Not implemented: NOT_IMPLEMENTED supplies a body that fails an UNSTOPPABLE_ASSERT if called.
virtual bool supportsMutation ( int op ) NOT_IMPLEMENTED
2018-10-25 06:57:06 +08:00
// Forwards to the pager's storage byte accounting.
virtual StorageBytes getStorageBytes() { return m_pager->getStorageBytes(); }
2017-06-10 05:56:41 +08:00
// Writes are provided in an ordered stream.
// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
// Buffers a set of keyValue.key to keyValue.value at the current write version.
virtual void set(KeyValueRef keyValue) {
	RangeMutation &boundary = insertMutationBoundary(keyValue.key)->second;
	SingleKeyMutationsByVersion &history = boundary.startKeyMutations;

	// Nothing to record if the most recent mutation already sets exactly this value
	const bool redundant = !history.empty() && history.rbegin()->second.equalToSet(keyValue.value);
	if(redundant)
		return;

	history[m_writeVersion] = SingleKeyMutation(keyValue.value);
}
// Buffers a clear of `range` at the current write version.  Boundaries are inserted at both ends
// of the range, then every boundary in [range.begin, range.end) is marked.
virtual void clear(KeyRangeRef range) {
	const MutationBufferT::iterator first = insertMutationBoundary(range.begin);
	const MutationBufferT::iterator last = insertMutationBoundary(range.end);

	for(MutationBufferT::iterator it = first; it != last; ++it) {
		RangeMutation &boundary = it->second;

		// Only the first clear of the range following this boundary is recorded
		if(!boundary.rangeClearVersion.present())
			boundary.rangeClearVersion = m_writeVersion;

		// Record a clear of the boundary key itself unless its latest mutation is already a clear
		SingleKeyMutationsByVersion &history = boundary.startKeyMutations;
		if(history.empty() || !history.rbegin()->second.isClear())
			history[m_writeVersion] = SingleKeyMutation();
	}
}
2017-08-22 13:29:57 +08:00
2017-06-10 05:56:41 +08:00
// Not implemented: NOT_IMPLEMENTED supplies a body that fails an UNSTOPPABLE_ASSERT if called.
virtual void mutate ( int op , StringRef param1 , StringRef param2 ) NOT_IMPLEMENTED
// Versions [begin, end) no longer readable
// Not implemented: NOT_IMPLEMENTED supplies a body that fails an UNSTOPPABLE_ASSERT if called.
virtual void forgetVersions ( Version begin , Version end ) NOT_IMPLEMENTED
// Returns the current write version if one has been set, otherwise asks the pager.
virtual Future<Version> getLatestVersion() {
	if(m_writeVersion == invalidVersion)
		return m_pager->getLatestVersion();
	return m_writeVersion;
}
2017-09-23 08:18:28 +08:00
// Current write version (invalidVersion until setWriteVersion() has been called).
Version getWriteVersion() { return m_writeVersion; }
2017-09-21 19:43:49 +08:00
// Version recorded in m_lastCommittedVersion (invalidVersion until init_impl() sets it).
Version getLastCommittedVersion() { return m_lastCommittedVersion; }
2017-10-10 04:24:16 +08:00
// Creates a btree on top of `pager` and starts its asynchronous initialization.
//   name             - stored in m_name
//   target_page_size - if positive and smaller than the pager's usable page size, used as the
//                      usable page size instead
VersionedBTree(IPager *pager, std::string name, int target_page_size = -1)
  : m_pager(pager),
    m_writeVersion(invalidVersion),
    m_usablePageSizeOverride(pager->getUsablePageSize()),
    m_lastCommittedVersion(invalidVersion),
    m_pBuffer(nullptr),
    m_name(name)
{
	const bool shrinkPage = target_page_size > 0 && target_page_size < m_usablePageSizeOverride;
	if(shrinkPage) {
		m_usablePageSizeOverride = target_page_size;
	}

	// Until a real commit happens, the "latest commit" is the initialization itself
	m_init = init_impl(this);
	m_latestCommit = m_init;
}
2017-09-21 19:43:49 +08:00
// Initializes the tree: the root is logical page 0.  If the pager reports latest version 0,
// an empty leaf root page is written and committed at version 1.  Finally the last committed
// version is recorded.
ACTOR static Future<Void> init_impl(VersionedBTree *self) {
	self->m_root = 0;

	state Version latest = wait(self->m_pager->getLatestVersion());
	if(latest == 0) {
		latest = 1;

		Reference<IPage> rootPage = self->m_pager->newPageBuffer();
		writeEmptyPage(rootPage, BTreePage::IS_LEAF, self->m_usablePageSizeOverride);
		self->writePage(self->m_root, rootPage, latest, StringRef(), StringRef());

		self->m_pager->setLatestVersion(latest);
		wait(self->m_pager->commit());
	}

	self->m_lastCommittedVersion = latest;
	return Void();
}
2017-09-23 08:18:28 +08:00
// Future for the initialization started by the constructor.
Future<Void> init() {
	return m_init;
}
2017-06-10 05:56:41 +08:00
2017-08-22 13:29:57 +08:00
// This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe,
// it will cancel init and commit and leave the pager alive but with potentially an incomplete set of
// uncommitted writes so it should not be committed.
virtual ~VersionedBTree() {
	m_init.cancel();
	m_latestCommit.cancel();
}
2017-06-10 05:56:41 +08:00
// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
// to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
// than or equal to the given version.
// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same
// write version, OR it may represent a snapshot as of the call to readAtVersion().
// Returns a new cursor over the tree as of version `v`, which must not exceed the last
// committed version.
virtual Reference<IStoreCursor> readAtVersion(Version v) {
	// TODO: Use the buffer to return uncommitted data
	// For now, only committed versions can be read.
	ASSERT(v <= m_lastCommittedVersion);

	Cursor *cursor = new Cursor(v, m_pager, m_root, m_usablePageSizeOverride);
	return Reference<IStoreCursor>(cursor);
}
// Must be nondecreasing
// Sets the version that subsequent writes belong to.  `v` must be greater than the last
// committed version and must not move backwards.
virtual void setWriteVersion(Version v) {
	ASSERT(v > m_lastCommittedVersion);

	if(m_pBuffer != nullptr) {
		// It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null
		ASSERT(v >= m_writeVersion);
	}
	else {
		// There is no current mutation buffer, so create one in the buffer map and point m_pBuffer at it.
		// When starting a new mutation buffer its start version must be greater than the last write version
		ASSERT(v > m_writeVersion);
		m_pBuffer = &m_mutationBuffers[v];

		// Create range representing the entire keyspace.  This reduces edge cases to applying mutations
		// because now all existing keys are within some range in the mutation map.
		(*m_pBuffer)[beginKVV.key];
		(*m_pBuffer)[endKVV.key];
	}

	m_writeVersion = v;
}
// Starts a commit when there is a current mutation buffer; otherwise returns the future of
// the latest commit.
virtual Future<Void> commit() {
	if(m_pBuffer != nullptr)
		return commit_impl(this);
	return m_latestCommit;
}
private :
2018-07-15 04:37:52 +08:00
// Logs the page contents via debug_printf, then hands the page to the pager to be written as
// logical page `id` at version `ver`.  The boundary keys are used only for the debug output.
void writePage(LogicalPageID id, Reference<IPage> page, Version ver, StringRef pageLowerBound, StringRef pageUpperBound) {
	debug_printf(" writePage(): %s \n ",
	             ((const BTreePage *)page->begin())->toString(true, id, ver, pageLowerBound, pageUpperBound).c_str());
	m_pager->writePage(id, page, ver);
}
LogicalPageID m_root ;
2017-08-22 13:29:57 +08:00
typedef std : : pair < Key , LogicalPageID > KeyPagePairT ;
2017-07-05 14:41:48 +08:00
typedef std : : pair < Version , std : : vector < KeyPagePairT > > VersionedKeyToPageSetT ;
typedef std : : vector < VersionedKeyToPageSetT > VersionedChildrenT ;
2017-08-04 15:01:25 +08:00
2017-08-25 08:25:53 +08:00
// Represents a change to a single key - set, clear, or atomic op
struct SingleKeyMutation {
// Clear
SingleKeyMutation ( ) : op ( MutationRef : : ClearRange ) { }
// Set
SingleKeyMutation ( Value val ) : op ( MutationRef : : SetValue ) , value ( val ) { }
// Atomic Op
SingleKeyMutation ( MutationRef : : Type op , Value val ) : op ( op ) , value ( val ) { }
2017-08-22 13:29:57 +08:00
MutationRef : : Type op ;
2017-08-25 08:25:53 +08:00
Value value ;
2017-08-04 15:01:25 +08:00
2017-08-22 13:29:57 +08:00
inline bool isClear ( ) const { return op = = MutationRef : : ClearRange ; }
inline bool isSet ( ) const { return op = = MutationRef : : SetValue ; }
2017-08-25 08:25:53 +08:00
inline bool isAtomicOp ( ) const { return ! isSet ( ) & & ! isClear ( ) ; }
inline bool equalToSet ( ValueRef val ) { return isSet ( ) & & value = = val ; }
2018-07-23 18:09:13 +08:00
// The returned packed key will be added to arena, the value will just point to the SingleKeyMutation's memory
inline KeyVersionValueRef toKVV ( KeyRef userKey , Version version ) const {
2017-09-06 07:59:31 +08:00
// No point in serializing an atomic op, it needs to be coalesced to a real value.
2017-08-25 08:25:53 +08:00
ASSERT ( ! isAtomicOp ( ) ) ;
if ( isClear ( ) )
2018-07-23 18:09:13 +08:00
return KeyVersionValueRef ( userKey , version ) ;
2017-08-22 13:29:57 +08:00
2018-07-23 18:09:13 +08:00
return KeyVersionValueRef ( userKey , version , value ) ;
2017-08-25 08:25:53 +08:00
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
std : : string toString ( ) const {
2017-08-26 06:48:32 +08:00
return format ( " op=%d val='%s' " , op , printable ( value ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-04 15:01:25 +08:00
} ;
2017-08-25 08:25:53 +08:00
// Represents mutations on a single key and a possible clear to a range that begins
// immediately after that key
typedef std : : map < Version , SingleKeyMutation > SingleKeyMutationsByVersion ;
struct RangeMutation {
// Mutations for exactly the start key
SingleKeyMutationsByVersion startKeyMutations ;
// A clear range version, if cleared, for the range starting immediately AFTER the start key
Optional < Version > rangeClearVersion ;
// Returns true if this RangeMutation doesn't actually mutate anything
bool noChanges ( ) const {
return ! rangeClearVersion . present ( ) & & startKeyMutations . empty ( ) ;
}
std : : string toString ( ) const {
std : : string result ;
result . append ( " rangeClearVersion: " ) ;
if ( rangeClearVersion . present ( ) )
result . append ( format ( " %lld " , rangeClearVersion . get ( ) ) ) ;
else
result . append ( " <not present> " ) ;
result . append ( " startKeyMutations: " ) ;
for ( SingleKeyMutationsByVersion : : value_type const & m : startKeyMutations )
result . append ( format ( " [%lld => %s] " , m . first , m . second . toString ( ) . c_str ( ) ) ) ;
return result ;
}
} ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Maps the start key of each range to the RangeMutation describing that range; a range ends
// at the next key present in the map.
typedef std : : map < Key , RangeMutation > MutationBufferT ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
/* Mutation Buffer Overview
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* MutationBuffer maps the start of a range to a RangeMutation . The end of the range is
* the next range start in the map .
2017-08-22 13:29:57 +08:00
*
2018-07-15 04:37:52 +08:00
* - The buffer starts out with keys ' ' and endKVV . key already populated .
2017-08-25 08:25:53 +08:00
*
* - When a new key is inserted into the buffer map , it is by definition
* splitting an existing range so it should take on the rangeClearVersion of
* the immediately preceding key which is the start of that range
2017-08-22 13:29:57 +08:00
*
* - Keys are inserted into the buffer map for every individual operation ( set / clear / atomic )
* key and for both the start and end of a range clear .
2017-08-25 08:25:53 +08:00
*
2017-08-22 13:29:57 +08:00
* - To apply a single clear , add it to the individual ops only if the last entry is not also a clear .
*
2017-08-25 08:25:53 +08:00
* - To apply a range clear , after inserting the new range boundaries do the following to the start
* boundary and all successive boundaries < end
* - set the range clear version if not already set
* - add a clear to the startKeyMutations if the final entry is not a clear .
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* - Note that there are actually TWO valid ways to represent
* set c = val1 at version 1
* clear c \ x00 to z at version 2
* with this model . Either
* c = { rangeClearVersion = 2 , startKeyMutations = { 1 = > val1 } }
* z = { rangeClearVersion = < not present > , startKeyMutations = { } }
* OR
* c = { rangeClearVersion = < not present > , startKeyMutations = { 1 = > val1 } }
* c \ x00 = { rangeClearVersion = 2 , startKeyMutations = { 2 = > < not present > } }
* z = { rangeClearVersion = < not present > , startKeyMutations = { } }
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* This is because the rangeClearVersion applies to a range beginning with the first
* key AFTER the start key , so that the logic for reading the start key is more simple
* as it only involves consulting startKeyMutations . When adding a clear range , the
* boundary key insert / split described above is valid , and is what is currently done ,
* but it would also be valid to see if the last key before startKey is equal to
* keyBefore ( startKey ) , and if so that mutation buffer boundary key can be used instead
* without adding an additional key to the buffer .
2017-08-22 13:29:57 +08:00
*/
2018-10-15 18:43:43 +08:00
// Pager used to allocate logical page IDs, create new page buffers and write pages.
IPager * m_pager ;
// The mutation buffer that insertMutationBoundary() searches and inserts boundaries into.
MutationBufferT * m_pBuffer ;
// Mutation buffers keyed by Version.
// NOTE(review): presumably m_pBuffer points at one of these entries - confirm.
std : : map < Version , MutationBufferT > m_mutationBuffers ;
// NOTE(review): presumably the version at which new mutations are being buffered - confirm.
Version m_writeVersion ;
// NOTE(review): presumably the most recent version that has been committed - confirm.
Version m_lastCommittedVersion ;
// NOTE(review): presumably the future of the most recently started commit - confirm.
Future < Void > m_latestCommit ;
// Passed as the usable page size when building pages and when reading pages (see readPage()).
int m_usablePageSizeOverride ;
// NOTE(review): presumably completes once initialization has finished - confirm.
Future < Void > m_init ;
// NOTE(review): presumably a name identifying this tree instance - confirm.
std : : string m_name ;
2017-08-28 16:57:01 +08:00
void printMutationBuffer ( MutationBufferT : : const_iterator begin , MutationBufferT : : const_iterator end ) const {
2017-08-25 08:25:53 +08:00
# if REDWOOD_DEBUG
debug_printf ( " ------------------------------------- \n " ) ;
debug_printf ( " BUFFER \n " ) ;
while ( begin ! = end ) {
debug_printf ( " '%s': %s \n " , printable ( begin - > first ) . c_str ( ) , begin - > second . toString ( ) . c_str ( ) ) ;
+ + begin ;
}
debug_printf ( " ------------------------------------- \n " ) ;
# endif
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Convenience overload: prints every entry of the mutation buffer pointed to by buf.
void printMutationBuffer(MutationBufferT* buf) const {
	printMutationBuffer(buf->begin(), buf->end());
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Find or create a mutation buffer boundary for boundary and return an iterator to it
2017-08-25 08:25:53 +08:00
// Returns an iterator to the entry for `boundary` in the current mutation buffer (m_pBuffer),
// inserting a new entry when the key is not already a boundary. A newly inserted boundary
// splits an existing range, so it inherits the range clear version of the boundary that
// precedes it and records a matching mutation for its own key at that version.
MutationBufferT::iterator insertMutationBoundary(Key boundary) {
	ASSERT(m_pBuffer != nullptr);

	// First existing boundary whose key is >= the requested key
	MutationBufferT::iterator pos = m_pBuffer->lower_bound(boundary);

	// The buffer initially spans '' through the maximum possible key, so the search
	// must always land on an existing entry.
	ASSERT(pos != m_pBuffer->end());

	// Exact match: the boundary already exists, nothing to create
	const bool alreadyPresent = (pos->first == boundary);
	if (alreadyPresent) {
		return pos;
	}

	// pos doubles as the insertion hint for the new boundary
	MutationBufferT::iterator inserted = m_pBuffer->insert(pos, { boundary, RangeMutation() });

	// The new entry cannot be the first one: the empty string boundary always exists and
	// could only have been found by looking for it explicitly, which returns above.
	MutationBufferT::iterator preceding = inserted;
	--preceding;

	const Optional<Version>& inheritedClear = preceding->second.rangeClearVersion;
	if (inheritedClear.present()) {
		RangeMutation& created = inserted->second;
		created.rangeClearVersion = inheritedClear;
		created.startKeyMutations[inheritedClear.get()] = SingleKeyMutation();
	}

	return inserted;
}
2017-07-26 07:10:19 +08:00
2018-09-19 15:32:39 +08:00
void buildNewRoot ( Version version , std : : vector < BoundaryAndPage > & pages , std : : vector < LogicalPageID > & logicalPageIDs , const BTreePage * pPage ) {
2018-08-29 04:46:14 +08:00
//debug_printf("buildNewRoot start %lu\n", pages.size());
2017-07-14 02:32:14 +08:00
// While there are multiple child pages for this version we must write new tree levels.
while ( pages . size ( ) > 1 ) {
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > childEntries ;
2017-07-14 02:32:14 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-08-29 04:46:14 +08:00
childEntries . emplace_back ( pages [ i ] . lowerBound , StringRef ( ( unsigned char * ) & logicalPageIDs [ i ] , sizeof ( uint32_t ) ) ) ;
2017-07-14 02:32:14 +08:00
2018-09-19 15:32:39 +08:00
pages = buildPages ( false , beginKey , endKey , childEntries , 0 , [ = ] ( ) { return m_pager - > newPageBuffer ( ) ; } , m_usablePageSizeOverride ) ;
2017-07-14 02:32:14 +08:00
2017-07-14 13:11:48 +08:00
debug_printf ( " Writing a new root level at version %lld with %lu children across %lu pages \n " , version , childEntries . size ( ) , pages . size ( ) ) ;
2017-07-14 02:32:14 +08:00
2018-09-19 15:32:39 +08:00
logicalPageIDs = writePages ( pages , version , m_root , pPage , endKey , nullptr ) ;
}
}
2018-10-02 07:51:57 +08:00
// Writes the replacement pages in `pages` at `version` and returns the logical page ID used for
// the primary page of each entry, in the same order as `pages`.
//
//   pages        - replacement pages; each has a first (primary) page, a lower bound and
//                  possibly extension pages
//   version      - version to write at; 0 is only allowed when exactly one page is written
//   originalID   - logical ID of the page being replaced; it is reused for the first
//                  replacement page unless it is the root and more than one page is written
//   originalPage - the page being replaced; only its extensionPageCount is read here
//   upperBound   - upper bound passed to writePage() for the last page; every other page uses
//                  the next page's lower bound
//   actor_debug  - opaque pointer that is only printed in debug output
std : : vector < LogicalPageID > writePages ( std : : vector < BoundaryAndPage > pages , Version version , LogicalPageID originalID , const BTreePage * originalPage , StringRef upperBound , void * actor_debug ) {
debug_printf ( " %p: writePages(): %u @%lld -> %lu replacement pages \n " , actor_debug , originalID , version , pages . size ( ) ) ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
ASSERT ( version ! = 0 | | pages . size ( ) = = 1 ) ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
std : : vector < LogicalPageID > primaryLogicalPageIDs ;
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
// Reuse original primary page ID if it's not the root or if only one page is being written.
if ( originalID ! = m_root | | pages . size ( ) = = 1 )
primaryLogicalPageIDs . push_back ( originalID ) ;
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
// Allocate a primary page ID for each page to be written
while ( primaryLogicalPageIDs . size ( ) < pages . size ( ) ) {
primaryLogicalPageIDs . push_back ( m_pager - > allocateLogicalPage ( ) ) ;
}
2018-09-19 15:32:39 +08:00
2018-10-02 07:51:57 +08:00
debug_printf ( " %p: writePages(): Writing %lu replacement pages for %d at version %lld \n " , actor_debug , pages . size ( ) , originalID , version ) ;
2018-09-19 15:32:39 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
// Allocate page number for main page first
2018-10-02 07:51:57 +08:00
LogicalPageID id = primaryLogicalPageIDs [ i ] ;
2018-09-19 15:32:39 +08:00
// Check for extension pages, if they exist assign IDs for them and write them at version
2018-10-02 07:51:57 +08:00
auto const & extPages = pages [ i ] . extPages ;
// If there are extension pages, write all pages using pager directly because this->writePage() is for whole primary pages
if ( extPages . size ( ) ! = 0 ) {
BTreePage * newPage = ( BTreePage * ) pages [ i ] . firstPage - > mutate ( ) ;
2018-09-19 15:32:39 +08:00
ASSERT ( newPage - > extensionPageCount = = extPages . size ( ) ) ;
for ( int e = 0 , eEnd = extPages . size ( ) ; e < eEnd ; + + e ) {
2018-10-02 07:51:57 +08:00
LogicalPageID eid = m_pager - > allocateLogicalPage ( ) ;
2018-09-19 16:34:19 +08:00
debug_printf ( " %p: writePages(): Writing extension page op=write id=%u @%lld (%d of %lu) referencePage=%u \n " , actor_debug , eid , version , e + 1 , extPages . size ( ) , id ) ;
2018-09-19 15:32:39 +08:00
// Record the extension page's ID in the primary page before the primary page is written below
newPage - > extensionPages [ e ] = eid ;
// If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID
m_pager - > writePage ( eid , extPages [ e ] , version , ( version = = 0 ) ? id : invalidLogicalPageID ) ;
}
debug_printf ( " %p: writePages(): Writing primary page op=write id=%u @%lld (+%lu extension pages) \n " , actor_debug , id , version , extPages . size ( ) ) ;
m_pager - > writePage ( id , pages [ i ] . firstPage , version ) ;
}
else {
debug_printf ( " %p: writePages(): Writing normal page op=write id=%u @%lld \n " , actor_debug , id , version ) ;
// The upper bound of a page is the next page's lower bound, or upperBound for the last page
writePage ( id , pages [ i ] . firstPage , version , pages [ i ] . lowerBound , ( i = = pages . size ( ) - 1 ) ? upperBound : pages [ i + 1 ] . lowerBound ) ;
}
}
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
// Free the old extension pages now that all replacement pages have been written
// NOTE: the freeing call below is commented out, so this loop currently does nothing.
for ( int i = 0 ; i < originalPage - > extensionPageCount ; + + i ) {
//debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, originalPage->extensionPages[i]);
//m_pager->freeLogicalPage(originalPage->extensionPages[i], version);
2018-09-19 15:32:39 +08:00
}
2017-07-14 02:32:14 +08:00
2018-10-02 07:51:57 +08:00
return primaryLogicalPageIDs ;
2018-09-19 15:32:39 +08:00
}
// An IPage backed by one contiguous heap buffer that holds the contents of several pages laid
// out back to back. readPage() uses it to present a primary page followed by its extension
// pages as a single page.
class SuperPage : public IPage, ReferenceCounted<SuperPage> {
public:
	// Copies the first usablePageSize bytes of every page in `pages`, in order, into a newly
	// allocated buffer of pages.size() * usablePageSize bytes.
	SuperPage(std::vector<Reference<const IPage>> pages, int usablePageSize)
	  : m_size(pages.size() * usablePageSize) {
		m_data = new uint8_t[m_size];
		uint8_t* wptr = m_data;
		for (auto& p : pages) {
			memcpy(wptr, p->begin(), usablePageSize);
			wptr += usablePageSize;
		}
	}

	virtual ~SuperPage() {
		// m_data comes from new[], so it must be released with delete[]; scalar delete on an
		// array allocation is undefined behavior.
		delete[] m_data;
	}

	virtual void addref() const {
		ReferenceCounted<SuperPage>::addref();
	}

	virtual void delref() const {
		ReferenceCounted<SuperPage>::delref();
	}

	// Total size in bytes of the combined buffer
	virtual int size() const {
		return m_size;
	}

	// Read-only access to the start of the combined buffer
	virtual uint8_t const* begin() const {
		return m_data;
	}

	// Writable access to the start of the combined buffer
	virtual uint8_t* mutate() {
		return m_data;
	}

private:
	uint8_t* m_data;
	const int m_size;
};
// Reads the page with logical ID `id` from `snapshot`.
// If the page has no extension pages it is returned as read. Otherwise every extension page
// listed in the page is requested as well and, once all of them have arrived, a SuperPage is
// returned that holds usablePageSize bytes of the primary page followed by usablePageSize
// bytes of each extension page, in order.
ACTOR static Future < Reference < const IPage > > readPage ( Reference < IPagerSnapshot > snapshot , LogicalPageID id , int usablePageSize ) {
2018-10-25 06:57:06 +08:00
debug_printf ( " readPage() op=read id=%u @%lld \n " , id , snapshot - > getVersion ( ) ) ;
2018-09-19 15:32:39 +08:00
Reference < const IPage > raw = wait ( snapshot - > getPhysicalPage ( id ) ) ;
const BTreePage * pTreePage = ( const BTreePage * ) raw - > begin ( ) ;
// Common case: no extension pages, so the page that was read is the whole page
if ( pTreePage - > extensionPageCount = = 0 ) {
2018-10-25 06:57:06 +08:00
debug_printf ( " readPage() Found normal page for op=read id=%u @%lld \n " , id , snapshot - > getVersion ( ) ) ;
2018-09-19 15:32:39 +08:00
return raw ;
}
// The already-read primary page goes first so it ends up at the start of the SuperPage
std : : vector < Future < Reference < const IPage > > > pageGets ;
pageGets . push_back ( std : : move ( raw ) ) ;
// Request every extension page before waiting on any of them
for ( int i = 0 ; i < pTreePage - > extensionPageCount ; + + i ) {
2018-09-19 16:34:19 +08:00
debug_printf ( " readPage() Reading extension page op=read id=%u @%lld ext=%d/%d \n " , pTreePage - > extensionPages [ i ] , snapshot - > getVersion ( ) , i + 1 , ( int ) pTreePage - > extensionPageCount ) ;
2018-09-19 15:32:39 +08:00
pageGets . push_back ( snapshot - > getPhysicalPage ( pTreePage - > extensionPages [ i ] ) ) ;
2017-07-14 02:32:14 +08:00
}
2018-09-19 15:32:39 +08:00
std : : vector < Reference < const IPage > > pages = wait ( getAll ( pageGets ) ) ;
return Reference < const IPage > ( new SuperPage ( pages , usablePageSize ) ) ;
2017-07-14 02:32:14 +08:00
}
2017-07-05 14:41:48 +08:00
// Returns list of (version, list of (lower_bound, list of children) )
2018-09-19 15:32:39 +08:00
ACTOR static Future < VersionedChildrenT > commitSubtree ( VersionedBTree * self , MutationBufferT * mutationBuffer , Reference < IPagerSnapshot > snapshot , LogicalPageID root , Key lowerBoundKey , Key upperBoundKey ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p commitSubtree: root=%d lower='%s' upper='%s' \n " , THIS , root , lowerBoundKey . toHexString ( 20 ) . c_str ( ) , upperBoundKey . toHexString ( 20 ) . c_str ( ) ) ;
2017-07-05 14:41:48 +08:00
2018-07-23 18:09:13 +08:00
// Decode the (likely truncated) upper and lower bound keys for this subtree.
2017-08-25 08:25:53 +08:00
state KeyVersionValue lowerBoundKVV = KeyVersionValue : : unpack ( lowerBoundKey ) ;
state KeyVersionValue upperBoundKVV = KeyVersionValue : : unpack ( upperBoundKey ) ;
// Find the slice of the mutation buffer that is relevant to this subtree
2018-08-29 04:46:14 +08:00
// TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key
2017-09-23 08:18:28 +08:00
state MutationBufferT : : const_iterator iMutationBoundary = mutationBuffer - > lower_bound ( lowerBoundKVV . key ) ;
state MutationBufferT : : const_iterator iMutationBoundaryEnd = mutationBuffer - > lower_bound ( upperBoundKVV . key ) ;
2017-08-25 08:25:53 +08:00
// If the lower bound key and the upper bound key are the same then there can't be any changes to
// this subtree since changes would happen after the upper bound key as the mutated versions would
// necessarily be higher.
if ( lowerBoundKVV . key = = upperBoundKVV . key ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p no changes, lower and upper bound keys are the same. \n " , THIS ) ;
2017-06-10 05:56:41 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
2017-08-25 08:25:53 +08:00
// If the mutation buffer key found is greater than the lower bound key then go to the previous mutation
// buffer key because it may cover deletion of some keys at the start of this subtree.
2017-09-23 08:18:28 +08:00
if ( iMutationBoundary ! = mutationBuffer - > begin ( ) & & iMutationBoundary - > first > lowerBoundKVV . key ) {
2017-08-25 08:25:53 +08:00
- - iMutationBoundary ;
2017-08-28 21:28:49 +08:00
}
2017-08-25 08:25:53 +08:00
else {
// If there are no mutations, we're done
if ( iMutationBoundary = = iMutationBoundaryEnd ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p no changes, mutation buffer start/end are the same \n " , THIS ) ;
2017-08-25 08:25:53 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
}
2017-08-29 08:26:53 +08:00
// TODO: Check if entire subtree is erased and return no pages, also have the previous pages deleted as of
// the cleared version.
2017-08-25 08:25:53 +08:00
// Another way to have no mutations is to have a single mutation range cover this
// subtree but have no changes in it
MutationBufferT : : const_iterator iMutationBoundaryNext = iMutationBoundary ;
+ + iMutationBoundaryNext ;
2017-08-31 16:23:12 +08:00
if ( iMutationBoundaryNext = = iMutationBoundaryEnd & & iMutationBoundary - > second . noChanges ( ) ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p no changes because sole mutation range was empty \n " , THIS ) ;
2017-08-25 08:25:53 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
2018-09-19 15:32:39 +08:00
state Reference < const IPage > rawPage = wait ( readPage ( snapshot , root , self - > m_usablePageSizeOverride ) ) ;
state BTreePage * page = ( BTreePage * ) rawPage - > begin ( ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p commitSubtree(): %s \n " , THIS , page - > toString ( false , root , snapshot - > getVersion ( ) , lowerBoundKey , upperBoundKey ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
2018-09-19 15:32:39 +08:00
PrefixTree : : Cursor existingCursor = page - > tree ( ) . getCursor ( lowerBoundKey , upperBoundKey ) ;
2018-06-08 18:32:34 +08:00
bool existingCursorValid = existingCursor . moveFirst ( ) ;
2017-06-10 05:56:41 +08:00
2018-08-29 04:46:14 +08:00
// Leaf Page
2018-06-08 18:32:34 +08:00
if ( page - > flags & BTreePage : : IS_LEAF ) {
2017-06-10 05:56:41 +08:00
VersionedChildrenT results ;
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > merged ;
Arena mergedArena ;
2017-06-10 05:56:41 +08:00
2019-02-20 05:22:14 +08:00
debug_printf ( " %p MERGING EXISTING DATA WITH MUTATIONS: \n " , THIS ) ;
2017-08-26 06:48:32 +08:00
self - > printMutationBuffer ( iMutationBoundary , iMutationBoundaryEnd ) ;
2017-08-22 13:29:57 +08:00
// It's a given that the mutation map is not empty so it's safe to do this
2017-08-25 08:25:53 +08:00
Key mutationRangeStart = iMutationBoundary - > first ;
2017-08-22 13:29:57 +08:00
2018-06-08 18:32:34 +08:00
// There will be multiple loops advancing existing cursor, existing KVV will track its current value
2017-08-22 13:29:57 +08:00
KeyVersionValue existing ;
2018-07-23 18:09:13 +08:00
if ( existingCursorValid ) {
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
}
2017-08-22 13:29:57 +08:00
// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
2017-08-28 16:57:01 +08:00
Version minVersion = invalidVersion ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Now, process each mutation range and merge changes with existing data.
while ( iMutationBoundary ! = iMutationBoundaryEnd ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p New mutation boundary: '%s': %s \n " , THIS , printable ( iMutationBoundary - > first ) . c_str ( ) , iMutationBoundary - > second . toString ( ) . c_str ( ) ) ;
2017-08-23 02:30:44 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutations ;
2017-08-22 13:29:57 +08:00
2017-08-28 21:28:49 +08:00
// If the mutation boundary key is less than the lower bound key then skip startKeyMutations for
// this boundary, we're only processing this mutation range here to apply any clears to existing data.
if ( iMutationBoundary - > first < lowerBoundKVV . key )
iMutations = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2017-08-25 08:25:53 +08:00
// If the mutation boundary key is the same as the page lowerBound key then start reading single
// key mutations at the first version greater than the lowerBoundKey version.
2017-08-28 21:28:49 +08:00
else if ( iMutationBoundary - > first = = lowerBoundKVV . key )
2017-08-25 08:25:53 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . upper_bound ( lowerBoundKVV . version ) ;
else
iMutations = iMutationBoundary - > second . startKeyMutations . begin ( ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutationsEnd = iMutationBoundary - > second . startKeyMutations . end ( ) ;
// Output old versions of the mutation boundary key
2018-06-08 18:32:34 +08:00
while ( existingCursorValid & & existing . key = = iMutationBoundary - > first ) {
2018-07-23 18:09:13 +08:00
// Don't copy the value because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , existingCursor . getKV ( false ) ) ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [existing, boundary start] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
// TODO: If a mutation set is equal to the previous existing value of the key, maybe don't write it.
2017-08-25 08:25:53 +08:00
// Output mutations for the mutation boundary start key
2017-08-22 13:29:57 +08:00
while ( iMutations ! = iMutationsEnd ) {
2017-09-06 07:59:31 +08:00
const SingleKeyMutation & m = iMutations - > second ;
2018-09-19 15:32:39 +08:00
int maxPartSize = std : : min ( 255 , self - > m_usablePageSizeOverride / 5 ) ;
2017-09-20 04:03:30 +08:00
if ( m . isClear ( ) | | m . value . size ( ) < = maxPartSize ) {
2017-09-16 08:27:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2018-07-23 18:09:13 +08:00
// Don't copy the value because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , iMutations - > second . toKVV ( iMutationBoundary - > first , iMutations - > first ) . pack ( false ) ) ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [mutation, boundary start] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
else {
2018-07-23 18:09:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2017-09-06 07:59:31 +08:00
int bytesLeft = m . value . size ( ) ;
2018-07-23 18:09:13 +08:00
int start = 0 ;
KeyVersionValueRef whole ( iMutationBoundary - > first , iMutations - > first , m . value ) ;
2017-09-06 07:59:31 +08:00
while ( bytesLeft > 0 ) {
int partSize = std : : min ( bytesLeft , maxPartSize ) ;
2018-07-23 18:09:13 +08:00
// Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , whole . split ( start , partSize ) . pack ( false ) ) ) ;
2018-07-25 17:29:17 +08:00
bytesLeft - = partSize ;
start + = partSize ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [mutation, boundary start] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
}
2017-08-22 13:29:57 +08:00
+ + iMutations ;
}
2017-06-10 05:56:41 +08:00
2017-08-25 08:25:53 +08:00
// Get the clear version for this range, which is the last thing that we need from it,
Optional < Version > clearRangeVersion = iMutationBoundary - > second . rangeClearVersion ;
// Advance to the next boundary because we need to know the end key for the current range.
+ + iMutationBoundary ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Mutation range end: '%s' \n " , THIS , printable ( iMutationBoundary - > first ) . c_str ( ) ) ;
2017-08-29 08:26:53 +08:00
2017-08-25 08:25:53 +08:00
// Write existing keys which are less than the next mutation boundary key, clearing if needed.
2018-06-08 18:32:34 +08:00
while ( existingCursorValid & & existing . key < iMutationBoundary - > first ) {
2018-07-25 17:29:17 +08:00
merged . push_back ( dependsOn ( mergedArena , existingCursor . getKV ( false ) ) ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [existing, middle] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
// Write a clear of this key if needed. A clear is required if clearRangeVersion is set and the next key is different
// than this one. Note that the next key might be the in our right sibling, we can use the page upperBound to get that.
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
2017-08-25 08:25:53 +08:00
KeyVersionValue nextEntry ;
2018-06-08 18:32:34 +08:00
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
nextEntry = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-08-25 08:25:53 +08:00
else
nextEntry = upperBoundKVV ;
if ( clearRangeVersion . present ( ) & & existing . key ! = nextEntry . key ) {
2017-08-28 16:57:01 +08:00
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
2018-07-23 18:09:13 +08:00
merged . push_back ( dependsOn ( mergedArena , KeyVersionValueRef ( existing . key , clearVersion ) . pack ( false ) ) ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [existing, middle clear] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
if ( existingCursorValid )
2017-08-25 08:25:53 +08:00
existing = nextEntry ;
2017-08-22 13:29:57 +08:00
}
2017-08-26 06:48:32 +08:00
}
2017-06-10 05:56:41 +08:00
2017-08-26 06:48:32 +08:00
// Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range.
2018-06-08 18:32:34 +08:00
while ( existingCursorValid ) {
2018-07-25 17:29:17 +08:00
merged . push_back ( dependsOn ( mergedArena , existingCursor . getKV ( false ) ) ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p: Added %s [existing, tail] \n " , THIS , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-06-10 05:56:41 +08:00
}
2017-08-25 08:25:53 +08:00
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Done merging mutations into existing leaf contents \n " , THIS ) ;
2017-08-28 18:53:29 +08:00
2017-08-29 08:26:53 +08:00
// No changes were actually made. This could happen if there is a clear which does not cover an entire leaf and which
// also turns out to not match any existing data in the leaf.
2017-08-28 18:53:29 +08:00
if ( minVersion = = invalidVersion ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p No changes were made during mutation merge \n " , THIS ) ;
2017-08-28 16:57:01 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
2017-08-28 18:53:29 +08:00
}
2017-08-28 16:57:01 +08:00
2018-09-19 15:32:39 +08:00
// TODO: Make version and key splits based on contents of merged list, if keeping history
2017-06-10 05:56:41 +08:00
IPager * pager = self - > m_pager ;
2018-09-19 15:32:39 +08:00
std : : vector < BoundaryAndPage > pages = buildPages ( true , lowerBoundKey , upperBoundKey , merged , BTreePage : : IS_LEAF , [ pager ] ( ) { return pager - > newPageBuffer ( ) ; } , self - > m_usablePageSizeOverride ) ;
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// If there isn't still just a single page of data then this page became too large and was split.
// The new split pages will be valid as of minVersion, but the old page remains valid at the old version
// (TODO: unless history isn't being kept at all)
2017-07-05 14:41:48 +08:00
if ( pages . size ( ) ! = 1 ) {
2017-06-10 05:56:41 +08:00
results . push_back ( { 0 , { { lowerBoundKey , root } } } ) ;
2017-07-05 14:41:48 +08:00
}
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
if ( pages . size ( ) = = 1 )
minVersion = 0 ;
2018-09-19 15:32:39 +08:00
// Write page(s), get new page IDs
2019-02-20 05:22:14 +08:00
std : : vector < LogicalPageID > newPageIDs = self - > writePages ( pages , minVersion , root , page , upperBoundKey , THIS ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until we're returning a single page
2018-09-19 15:32:39 +08:00
if ( root = = self - > m_root & & pages . size ( ) > 1 ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Building new root \n " , THIS ) ;
2018-09-19 15:32:39 +08:00
self - > buildNewRoot ( minVersion , pages , newPageIDs , page ) ;
2017-08-28 21:28:49 +08:00
}
2017-07-14 02:32:14 +08:00
2017-06-10 05:56:41 +08:00
results . push_back ( { minVersion , { } } ) ;
2018-09-19 15:32:39 +08:00
// TODO: Can this be moved into writePages?
// TODO: This can probably be skipped for root
2017-06-10 05:56:41 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
2017-08-28 16:57:01 +08:00
// The lower bound of the first page is the lower bound of the subtree, not the first entry in the page
2018-08-29 04:46:14 +08:00
Key lowerBound = ( i = = 0 ) ? lowerBoundKey : pages [ i ] . lowerBound ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Adding page to results: %s => %d \n " , THIS , lowerBound . toHexString ( 20 ) . c_str ( ) , newPageIDs [ i ] ) ;
2018-09-19 15:32:39 +08:00
results . back ( ) . second . push_back ( { lowerBound , newPageIDs [ i ] } ) ;
2017-06-10 05:56:41 +08:00
}
2019-02-20 05:22:14 +08:00
debug_printf ( " %p DONE. \n " , THIS ) ;
2017-06-10 05:56:41 +08:00
return results ;
}
else {
2018-08-29 04:46:14 +08:00
// Internal Page
2018-06-08 18:32:34 +08:00
state std : : vector < Future < VersionedChildrenT > > futureChildren ;
state std : : vector < LogicalPageID > childPageIDs ;
2018-09-19 15:32:39 +08:00
// TODO: Make this much more efficient with a skip-merge through the two sorted sets (mutations, existing cursor)
2018-06-15 08:52:25 +08:00
bool first = true ;
2018-06-08 18:32:34 +08:00
while ( existingCursorValid ) {
// The lower bound for the first child is lowerBoundKey
Key childLowerBound = first ? lowerBoundKey : existingCursor . getKey ( ) ;
if ( first )
first = false ;
2018-07-23 18:09:13 +08:00
uint32_t pageID = * ( uint32_t * ) existingCursor . getValueRef ( ) . begin ( ) ;
2018-08-29 04:46:14 +08:00
ASSERT ( pageID ! = 0 ) ;
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
2018-09-19 15:32:39 +08:00
Key childUpperBound = existingCursorValid ? existingCursor . getKey ( ) : upperBoundKey ;
2017-08-28 16:57:01 +08:00
2018-09-24 17:42:23 +08:00
debug_printf ( " lower '%s' \n " , childLowerBound . toHexString ( 20 ) . c_str ( ) ) ;
debug_printf ( " upper '%s' \n " , childUpperBound . toHexString ( 20 ) . c_str ( ) ) ;
2017-08-28 18:53:29 +08:00
ASSERT ( childLowerBound < = childUpperBound ) ;
2018-09-19 15:32:39 +08:00
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , childLowerBound , childUpperBound ) ) ;
2018-06-08 18:32:34 +08:00
childPageIDs . push_back ( pageID ) ;
2017-06-10 05:56:41 +08:00
}
2018-09-20 18:39:55 +08:00
wait ( waitForAll ( futureChildren ) ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
bool modified = false ;
2018-06-08 18:32:34 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
const VersionedChildrenT & children = futureChildren [ i ] . get ( ) ;
2018-09-19 15:32:39 +08:00
// Handle multipages
2018-06-08 18:32:34 +08:00
if ( children . size ( ) ! = 1 | | children [ 0 ] . second . size ( ) ! = 1 ) {
2017-07-05 14:41:48 +08:00
modified = true ;
2017-06-10 05:56:41 +08:00
break ;
}
}
2017-07-14 02:32:14 +08:00
if ( ! modified ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p not modified. \n " , THIS ) ;
2017-06-10 05:56:41 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
2017-07-14 02:32:14 +08:00
}
2017-06-10 05:56:41 +08:00
Version version = 0 ;
VersionedChildrenT result ;
loop { // over version splits of this page
Version nextVersion = std : : numeric_limits < Version > : : max ( ) ;
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > childEntries ; // Logically std::vector<std::pair<std::string, LogicalPageID>> childEntries;
2017-06-10 05:56:41 +08:00
// For each Future<VersionedChildrenT>
2019-02-20 05:22:14 +08:00
debug_printf ( " %p creating replacement pages for id=%d at Version %lld \n " , THIS , root , version ) ;
2017-07-05 14:41:48 +08:00
// If we're writing version 0, there is a chance that we don't have to write ourselves, if there are no changes
bool modified = version ! = 0 ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
const VersionedChildrenT & children = futureChildren [ i ] . get ( ) ;
2019-04-10 02:16:45 +08:00
debug_printf ( " %p Versioned page set that replaced Page id=%d: %lu versions \n " , THIS ,
childPageIDs [ i ] , children . size ( ) ) ;
2017-06-10 05:56:41 +08:00
for ( auto & versionedPageSet : children ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p version: Page id=%lld \n " , THIS , versionedPageSet . first ) ;
2017-06-10 05:56:41 +08:00
for ( auto & boundaryPage : versionedPageSet . second ) {
2019-04-10 02:16:45 +08:00
( void ) boundaryPage ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p '%s' -> Page id=%u \n " , THIS , printable ( boundaryPage . first ) . c_str ( ) , boundaryPage . second ) ;
2017-06-10 05:56:41 +08:00
}
}
// Find the first version greater than the current version we are writing
auto cv = std : : upper_bound ( children . begin ( ) , children . end ( ) , version , [ ] ( Version a , VersionedChildrenT : : value_type const & b ) { return a < b . first ; } ) ;
// If there are no versions before the one we found, just update nextVersion and continue.
if ( cv = = children . begin ( ) ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p First version (%lld) in set is greater than current, setting nextVersion and continuing \n " , THIS , cv - > first ) ;
2017-06-10 05:56:41 +08:00
nextVersion = std : : min ( nextVersion , cv - > first ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p curr %lld next %lld \n " , THIS , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
continue ;
}
// If a version greater than the current version being written was found, update nextVersion
if ( cv ! = children . end ( ) ) {
nextVersion = std : : min ( nextVersion , cv - > first ) ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p curr %lld next %lld \n " , THIS , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
}
// Go back one to the last version that was valid prior to or at the current version we are writing
- - cv ;
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Using children for version %lld from this set, building version %lld \n " , THIS , cv - > first , version ) ;
2017-07-05 14:41:48 +08:00
// If page count isn't 1 then the root is definitely modified
modified = modified | | cv - > second . size ( ) ! = 1 ;
2017-06-10 05:56:41 +08:00
// Add the children at this version to the child entries list for the current version being built.
for ( auto & childPage : cv - > second ) {
2019-04-18 07:04:10 +08:00
debug_printf ( " %p Adding child page '%s' \n " , THIS , printable ( childPage . first ) . c_str ( ) ) ;
2018-07-23 18:09:13 +08:00
childEntries . emplace_back ( childPage . first , StringRef ( ( unsigned char * ) & childPage . second , sizeof ( uint32_t ) ) ) ;
2017-06-10 05:56:41 +08:00
}
}
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Finished pass through futurechildren. childEntries=%lu version=%lld nextVersion=%lld \n " , THIS , childEntries . size ( ) , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
if ( modified ) {
// TODO: Track split points across iterations of this loop, so that they don't shift unnecessarily and
// cause unnecessary path copying
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
IPager * pager = self - > m_pager ;
2018-09-19 15:32:39 +08:00
std : : vector < BoundaryAndPage > pages = buildPages ( false , lowerBoundKey , upperBoundKey , childEntries , 0 , [ pager ] ( ) { return pager - > newPageBuffer ( ) ; } , self - > m_usablePageSizeOverride ) ;
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// Write page(s), use version 0 to replace latest version if only writing one page
2019-02-20 05:22:14 +08:00
std : : vector < LogicalPageID > newPageIDs = self - > writePages ( pages , version , root , page , upperBoundKey , THIS ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until we're returning a single page
if ( root = = self - > m_root )
2018-09-19 15:32:39 +08:00
self - > buildNewRoot ( version , pages , newPageIDs , page ) ;
2017-07-14 02:32:14 +08:00
2017-07-05 14:41:48 +08:00
result . resize ( result . size ( ) + 1 ) ;
result . back ( ) . first = version ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-09-19 15:32:39 +08:00
result . back ( ) . second . push_back ( { pages [ i ] . lowerBound , newPageIDs [ i ] } ) ;
2017-06-10 05:56:41 +08:00
2018-09-19 15:32:39 +08:00
// TODO: figure this out earlier instead of writing replacement page more than once
2017-07-05 14:41:48 +08:00
if ( result . size ( ) > 1 & & result . back ( ) . second = = result . end ( ) [ - 2 ] . second ) {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Output same as last version, popping it. \n " , THIS ) ;
2017-07-05 14:41:48 +08:00
result . pop_back ( ) ;
}
}
else {
2019-02-20 05:22:14 +08:00
debug_printf ( " %p Version 0 has no changes \n " , THIS ) ;
2017-07-05 14:41:48 +08:00
result . push_back ( { 0 , { { lowerBoundKey , root } } } ) ;
}
2017-06-10 05:56:41 +08:00
if ( nextVersion = = std : : numeric_limits < Version > : : max ( ) )
break ;
version = nextVersion ;
}
2019-02-20 05:22:14 +08:00
debug_printf ( " %p DONE. \n " , THIS ) ;
2017-06-10 05:56:41 +08:00
return result ;
}
}
// Commits the mutation buffer that self->m_pBuffer points to at the time of the call.
// Each call installs its own future as m_latestCommit and then waits on the future that was
// there before, so this commit does not begin until the previously started commit has finished.
ACTOR static Future<Void> commit_impl(VersionedBTree *self) {
	// Save the version we are committing at locally because m_writeVersion could change during commit.
	state Version writeVersion = self->m_writeVersion;

	// Take the current mutation buffer. Clearing m_pBuffer means no more mutations can be
	// written to the buffer that is about to be committed.
	state MutationBufferT *bufferToCommit = self->m_pBuffer;
	self->m_pBuffer = nullptr;

	// The latest mutation buffer start version is the one we will now (or eventually) commit.
	state Version bufferStartVersion = self->m_mutationBuffers.rbegin()->first;

	// Swap in a new "latest commit" future, then wait for the commit that was latest before us.
	state Promise<Void> commitDone;
	Future<Void> priorCommit = self->m_latestCommit;
	self->m_latestCommit = commitDone.getFuture();
	wait(priorCommit);

	debug_printf(" %s: Beginning commit of version %lld \n ", self->m_name.c_str(), writeVersion);

	// The pager's latest version is the version the existing tree will be read at.
	Version latestVersion = wait(self->m_pager->getLatestVersion());
	debug_printf(" %s: pager latestVersion %lld \n ", self->m_name.c_str(), latestVersion);

	self->printMutationBuffer(bufferToCommit);

	// Apply the buffered mutations to the whole tree, starting from the root and the full key range.
	wait(success(commitSubtree(self, bufferToCommit, self->m_pager->getReadSnapshot(latestVersion), self->m_root, beginKey, endKey)));

	self->m_pager->setLatestVersion(writeVersion);
	debug_printf(" %s: Committing pager %lld \n ", self->m_name.c_str(), writeVersion);
	wait(self->m_pager->commit());
	debug_printf(" %s: Committed version %lld \n ", self->m_name.c_str(), writeVersion);

	// Now that everything is committed the mutation buffer must be deleted.
	// Its start version should be the oldest mutation buffer version in the map.
	ASSERT(bufferStartVersion == self->m_mutationBuffers.begin()->first);
	self->m_mutationBuffers.erase(self->m_mutationBuffers.begin());

	self->m_lastCommittedVersion = writeVersion;

	// Release whichever commit is waiting on this one.
	commitDone.send(Void());

	return Void();
}
2018-06-08 18:32:34 +08:00
// InternalCursor is for seeking to and iterating over the internal / low level records in the Btree.
// These records are versioned and they can represent deletions or partial values so they must be
// post processed to obtain keys returnable to the user.
2017-09-16 17:09:09 +08:00
class InternalCursor {
2017-09-15 20:19:39 +08:00
public :
InternalCursor ( ) { }
2018-09-19 15:32:39 +08:00
InternalCursor ( Reference < IPagerSnapshot > pages , LogicalPageID root , int usablePageSizeOverride ) : m_pages ( pages ) , m_root ( root ) , outOfBound ( 0 ) , m_usablePageSizeOverride ( usablePageSizeOverride ) {
2017-09-15 20:19:39 +08:00
m_path . reserve ( 6 ) ;
}
bool valid ( ) const {
2018-06-08 18:32:34 +08:00
return ( outOfBound = = 0 ) & & kvv . valid ( ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-23 08:18:28 +08:00
Future < Void > seekLessThanOrEqual ( KeyRef key ) {
2017-09-16 17:09:09 +08:00
return seekLessThanOrEqual_impl ( this , key ) ;
2017-09-15 20:19:39 +08:00
}
Future < Void > move ( bool fwd ) {
2017-09-16 17:09:09 +08:00
return move_impl ( this , fwd ) ;
2017-09-15 20:19:39 +08:00
}
2018-07-23 18:09:13 +08:00
Standalone < KeyVersionValueRef > kvv ; // The decoded current internal record in the tree
2017-09-15 20:19:39 +08:00
2018-06-12 16:43:19 +08:00
std : : string toString ( const char * wrapPrefix = " " ) const {
std : : string r ;
r + = format ( " InternalCursor(%p) ver=%lld oob=%d valid=%d " , this , m_pages - > getVersion ( ) , outOfBound , valid ( ) ) ;
r + = format ( " \n %s KVV: %s " , wrapPrefix , kvv . toString ( ) . c_str ( ) ) ;
for ( const PageEntryLocation & p : m_path ) {
2018-09-24 17:42:23 +08:00
std : : string cur = p . cursor . valid ( ) ? format ( " '%s' -> '%s' " , p . cursor . getKey ( ) . toHexString ( 20 ) . c_str ( ) , p . cursor . getValueRef ( ) . toHexString ( 20 ) . c_str ( ) ) : " invalid " ;
2018-09-19 15:32:39 +08:00
r + = format ( " \n %s Page id=%d (%d records, %d bytes) Cursor %s " , wrapPrefix , p . pageNumber , p . btPage - > count , p . btPage - > kvBytes , cur . c_str ( ) ) ;
2018-06-12 16:43:19 +08:00
}
return r ;
2017-09-15 20:19:39 +08:00
}
private :
Reference < IPagerSnapshot > m_pages ;
LogicalPageID m_root ;
2018-09-19 15:32:39 +08:00
int m_usablePageSizeOverride ;
2017-09-15 20:19:39 +08:00
struct PageEntryLocation {
2018-06-08 18:32:34 +08:00
PageEntryLocation ( ) { }
2018-07-15 04:37:52 +08:00
PageEntryLocation ( Key lowerBound , Key upperBound , Reference < const IPage > page , LogicalPageID id )
2018-09-19 15:32:39 +08:00
: pageLowerBound ( lowerBound ) , pageUpperBound ( upperBound ) , page ( page ) , pageNumber ( id ) , btPage ( ( BTreePage * ) page - > begin ( ) ) , cursor ( btPage - > tree ( ) . getCursor ( pageLowerBound , pageUpperBound ) )
2018-07-10 17:24:01 +08:00
{
}
2017-09-15 20:19:39 +08:00
2018-07-15 04:37:52 +08:00
Key getNextOrUpperBound ( ) {
if ( cursor . moveNext ( ) ) {
Key r = cursor . getKey ( ) ;
cursor . movePrev ( ) ;
return r ;
}
return pageUpperBound ;
}
2018-07-10 17:24:01 +08:00
Key pageLowerBound ;
2018-07-15 04:37:52 +08:00
Key pageUpperBound ;
2017-09-15 20:19:39 +08:00
Reference < const IPage > page ;
2018-06-08 18:32:34 +08:00
BTreePage * btPage ;
PrefixTree : : Cursor cursor ;
2018-07-17 15:41:42 +08:00
// For easier debugging
LogicalPageID pageNumber ;
2017-09-15 20:19:39 +08:00
} ;
typedef std : : vector < PageEntryLocation > TraversalPathT ;
TraversalPathT m_path ;
2018-06-08 18:32:34 +08:00
int outOfBound ;
2017-09-15 20:19:39 +08:00
2018-07-15 04:37:52 +08:00
ACTOR static Future < Void > pushPage ( InternalCursor * self , Key lowerBound , Key upperBound , LogicalPageID id ) {
2018-09-19 15:32:39 +08:00
Reference < const IPage > rawPage = wait ( readPage ( self - > m_pages , id , self - > m_usablePageSizeOverride ) ) ;
2018-08-29 04:46:14 +08:00
debug_printf ( " InternalCursor::pushPage() %s \n " , ( ( const BTreePage * ) rawPage - > begin ( ) ) - > toString ( false , id , self - > m_pages - > getVersion ( ) , lowerBound , upperBound ) . c_str ( ) ) ;
2018-07-15 04:37:52 +08:00
self - > m_path . emplace_back ( lowerBound , upperBound , rawPage , id ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2017-09-16 17:09:09 +08:00
ACTOR static Future < Void > reset ( InternalCursor * self ) {
2017-09-15 20:19:39 +08:00
if ( self - > m_path . empty ( ) ) {
2018-09-20 18:39:55 +08:00
wait ( pushPage ( self , beginKey , endKey , self - > m_root ) ) ;
2017-09-15 20:19:39 +08:00
}
else {
self - > m_path . resize ( 1 ) ;
}
2018-06-12 16:43:19 +08:00
self - > outOfBound = 0 ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2017-09-23 08:18:28 +08:00
ACTOR static Future < Void > seekLessThanOrEqual_impl ( InternalCursor * self , KeyRef key ) {
2017-09-15 20:19:39 +08:00
state TraversalPathT & path = self - > m_path ;
2018-09-20 18:39:55 +08:00
wait ( reset ( self ) ) ;
2017-09-15 20:19:39 +08:00
2018-09-24 17:42:23 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): start %s \n " , key . toHexString ( 20 ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2018-06-12 16:43:19 +08:00
2017-09-15 20:19:39 +08:00
loop {
state PageEntryLocation * p = & path . back ( ) ;
2018-06-08 18:32:34 +08:00
if ( p - > btPage - > count = = 0 ) {
2017-09-15 20:19:39 +08:00
ASSERT ( path . size ( ) = = 1 ) ; // This must be the root page.
2018-06-12 16:43:19 +08:00
self - > outOfBound = - 1 ;
2017-09-15 20:19:39 +08:00
self - > kvv . version = invalidVersion ;
2018-09-24 17:42:23 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Exit, root page empty. %s \n " , key . toHexString ( 20 ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2018-06-08 18:32:34 +08:00
state bool foundLTE = p - > cursor . seekLessThanOrEqual ( key ) ;
2018-09-24 17:42:23 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Seek on path tail, result %d. %s \n " , key . toHexString ( 20 ) . c_str ( ) , foundLTE , self - > toString ( " " ) . c_str ( ) ) ;
2018-06-12 16:43:19 +08:00
2018-06-08 18:32:34 +08:00
if ( p - > btPage - > flags & BTreePage : : IS_LEAF ) {
// It is possible for the current leaf key to be between the page's lower bound (in the parent page) and the
// first record in the leaf page, which means we must move backwards 1 step in the database to find the
// record < key, if such a record exists.
if ( ! foundLTE ) {
2018-09-20 18:39:55 +08:00
wait ( self - > move ( false ) ) ;
2017-09-15 20:19:39 +08:00
}
else {
2018-06-08 18:32:34 +08:00
// Found the target record
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( p - > cursor . getKVRef ( ) ) ;
2017-09-15 20:19:39 +08:00
}
2018-09-24 17:42:23 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Exit, Found leaf page. %s \n " , key . toHexString ( 20 ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
else {
2018-06-08 18:32:34 +08:00
// We don't have to check foundLTE here because if it's false then cursor will be at the first record in the page.
// TODO: It would, however, be more efficient to check foundLTE and if false move to the previous sibling page.
// But the page should NOT be empty so let's assert that the cursor is valid.
ASSERT ( p - > cursor . valid ( ) ) ;
2018-08-29 04:46:14 +08:00
state LogicalPageID newPage = ( LogicalPageID ) * ( uint32_t * ) p - > cursor . getValueRef ( ) . begin ( ) ;
2018-09-19 15:32:39 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Found internal page, going to Page id=%d. %s \n " ,
2018-09-24 17:42:23 +08:00
key . toHexString ( 20 ) . c_str ( ) , newPage , self - > toString ( " " ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( pushPage ( self , p - > cursor . getKey ( ) , p - > getNextOrUpperBound ( ) , newPage ) ) ;
2017-09-15 20:19:39 +08:00
}
}
}
// Move one 'internal' key/value/version/valueindex/value record.
2018-07-23 18:09:13 +08:00
// Iterating with this function will "see" all parts of all values and clears at all versions (that is, within the cursor's version of btree pages)
2017-09-16 17:09:09 +08:00
ACTOR static Future < Void > move_impl ( InternalCursor * self , bool fwd ) {
2017-09-15 20:19:39 +08:00
state TraversalPathT & path = self - > m_path ;
state const char * dir = fwd ? " forward " : " backward " ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) start %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// If cursor was out of bound, adjust out of boundness by 1 in the correct direction
if ( self - > outOfBound ! = 0 ) {
self - > outOfBound + = fwd ? 1 : - 1 ;
2018-06-12 16:43:19 +08:00
// If we appear to be inbounds, see if we're off the other end of the db or if the page cursor is valid.
if ( self - > outOfBound = = 0 ) {
if ( ! path . empty ( ) & & path . back ( ) . cursor . valid ( ) ) {
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( path . back ( ) . cursor . getKVRef ( ) ) ;
2018-06-12 16:43:19 +08:00
}
else {
self - > outOfBound = fwd ? 1 : - 1 ;
}
}
debug_printf ( " InternalCursor::move(%s) was out of bound, exiting %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
return Void ( ) ;
}
2017-09-15 20:19:39 +08:00
int i = path . size ( ) ;
// Find the closest path part to the end where the index can be moved in the correct direction.
while ( - - i > = 0 ) {
2018-06-08 18:32:34 +08:00
PrefixTree : : Cursor & c = path [ i ] . cursor ;
bool success = fwd ? c . moveNext ( ) : c . movePrev ( ) ;
if ( success ) {
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Move successful on path index %d \n " , dir , i ) ;
2018-06-17 21:48:41 +08:00
path . resize ( i + 1 ) ;
2017-09-15 20:19:39 +08:00
break ;
2018-06-12 16:43:19 +08:00
} else {
debug_printf ( " InternalCursor::move(%s) Move failed on path index %d \n " , dir , i ) ;
2018-06-08 18:32:34 +08:00
}
2017-09-15 20:19:39 +08:00
}
// If no path part could be moved without going out of range then the
// new cursor position is either before the first record or after the last.
2018-06-08 18:32:34 +08:00
// Leave the path steps in place and set outOfBound to 1 or -1 based on fwd.
// This makes the cursor not valid() but a move in the opposite direction
// will make it valid again, pointing to the previous target record.
2017-09-15 20:19:39 +08:00
if ( i < 0 ) {
2018-06-08 18:32:34 +08:00
self - > outOfBound = fwd ? 1 : - 1 ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Passed an end of the database %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2018-06-17 21:48:41 +08:00
// We were able to advance the cursor on one of the pages in the page traversal path, so now traverse down to leaf level
2017-09-15 20:19:39 +08:00
state PageEntryLocation * p = & ( path . back ( ) ) ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s): Descending if needed to find a leaf \n " , dir ) ;
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// Now we must traverse downward if needed until we are at a leaf level.
2017-09-15 20:19:39 +08:00
// Each movement down will start on the far left or far right depending on fwd
2018-06-08 18:32:34 +08:00
while ( ! ( p - > btPage - > flags & BTreePage : : IS_LEAF ) ) {
// Get the page that the path's last entry points to
2018-08-29 04:46:14 +08:00
LogicalPageID childPageID = ( LogicalPageID ) * ( uint32_t * ) p - > cursor . getValueRef ( ) . begin ( ) ;
2018-09-20 18:39:55 +08:00
wait ( pushPage ( self , p - > cursor . getKey ( ) , p - > getNextOrUpperBound ( ) , childPageID ) ) ;
2017-09-15 20:19:39 +08:00
p = & ( path . back ( ) ) ;
2018-06-08 18:32:34 +08:00
// No page traversed to in this manner should be empty.
ASSERT ( p - > btPage - > count ! = 0 ) ;
// Go to the first or last entry in the page depending on traversal direction
if ( fwd )
p - > cursor . moveFirst ( ) ;
else
p - > cursor . moveLast ( ) ;
2017-09-15 20:19:39 +08:00
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Descended one level %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
}
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// Found the target record, unpack it
ASSERT ( p - > cursor . valid ( ) ) ;
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( p - > cursor . getKVRef ( ) ) ;
2018-06-08 18:32:34 +08:00
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Exiting %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
} ;
// Cursor is for reading and iterating over user visible KV pairs at a specific version
2018-09-19 15:32:39 +08:00
// Keys and values returned are only valid until one of the move methods is called (find*, next, prev)
// TODO: Make an option to copy all returned strings into an arena?
2018-06-17 21:48:41 +08:00
class Cursor : public IStoreCursor , public ReferenceCounted < Cursor > , public NonCopyable {
2017-06-10 05:56:41 +08:00
public :
2018-09-19 15:32:39 +08:00
Cursor ( Version version , IPager * pager , LogicalPageID root , int usablePageSizeOverride )
: m_version ( version ) , m_pagerSnapshot ( pager - > getReadSnapshot ( version ) ) , m_icursor ( m_pagerSnapshot , root , usablePageSizeOverride ) {
2017-06-10 05:56:41 +08:00
}
virtual ~ Cursor ( ) { }
2017-09-17 19:38:01 +08:00
virtual Future < Void > findEqual ( KeyRef key ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , true , 0 ) ; }
virtual Future < Void > findFirstEqualOrGreater ( KeyRef key , bool needValue , int prefetchNextBytes ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , needValue , 1 ) ; }
virtual Future < Void > findLastLessOrEqual ( KeyRef key , bool needValue , int prefetchPriorBytes ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , needValue , - 1 ) ; }
2017-09-09 16:29:25 +08:00
2017-09-15 20:19:39 +08:00
virtual Future < Void > next ( bool needValue ) { return next_impl ( Reference < Cursor > : : addRef ( this ) , needValue ) ; }
2017-09-17 19:38:01 +08:00
virtual Future < Void > prev ( bool needValue ) { return prev_impl ( Reference < Cursor > : : addRef ( this ) , needValue ) ; }
2017-06-10 05:56:41 +08:00
virtual bool isValid ( ) {
return m_kv . present ( ) ;
}
virtual KeyRef getKey ( ) {
return m_kv . get ( ) . key ;
}
//virtual StringRef getCompressedKey() = 0;
virtual ValueRef getValue ( ) {
return m_kv . get ( ) . value ;
}
2018-09-19 15:32:39 +08:00
// TODO: Either remove this method or change the contract so that key and value strings returned are still valid after the cursor is
// moved and allocate them in some arena that this method resets.
2017-06-10 05:56:41 +08:00
virtual void invalidateReturnedStrings ( ) {
}
2017-09-15 20:19:39 +08:00
void addref ( ) { ReferenceCounted < Cursor > : : addref ( ) ; }
void delref ( ) { ReferenceCounted < Cursor > : : delref ( ) ; }
2018-06-12 16:43:19 +08:00
std : : string toString ( const char * wrapPrefix = " " ) const {
std : : string r ;
2018-06-14 19:15:14 +08:00
r + = format ( " Cursor(%p) ver: %lld key: %s value: %s " , this , m_version ,
2018-06-12 16:43:19 +08:00
( m_kv . present ( ) ? m_kv . get ( ) . key . printable ( ) . c_str ( ) : " <np> " ) ,
( m_kv . present ( ) ? m_kv . get ( ) . value . printable ( ) . c_str ( ) : " " ) ) ;
2018-06-14 19:15:14 +08:00
r + = format ( " \n %s InternalCursor: %s " , wrapPrefix , m_icursor . toString ( format ( " %s " , wrapPrefix ) . c_str ( ) ) . c_str ( ) ) ;
2018-06-12 16:43:19 +08:00
return r ;
}
2017-09-15 20:19:39 +08:00
private :
2017-06-10 05:56:41 +08:00
Version m_version ;
2017-09-15 20:19:39 +08:00
Reference < IPagerSnapshot > m_pagerSnapshot ;
2017-09-16 17:09:09 +08:00
InternalCursor m_icursor ;
2017-09-09 16:29:25 +08:00
Optional < KeyValueRef > m_kv ; // The current user-level key/value in the tree
2017-06-10 05:56:41 +08:00
Arena m_arena ;
2017-09-09 16:29:25 +08:00
2018-07-23 18:09:13 +08:00
// find key in tree closest to or equal to key (at this cursor's version)
2017-09-15 20:19:39 +08:00
// for less than or equal use cmp < 0
// for greater than or equal use cmp > 0
// for equal use cmp == 0
2017-09-17 19:38:01 +08:00
ACTOR static Future < Void > find_impl ( Reference < Cursor > self , KeyRef key , bool needValue , int cmp ) {
2017-09-16 17:09:09 +08:00
state InternalCursor & icur = self - > m_icursor ;
2017-09-09 16:29:25 +08:00
2018-07-23 18:09:13 +08:00
// Search for the last key at or before (key, version, \xff)
state Key target = KeyVersionValueRef : : searchKey ( key , self - > m_version ) ;
2017-09-17 19:38:01 +08:00
self - > m_kv = Optional < KeyValueRef > ( ) ;
2017-06-10 05:56:41 +08:00
2018-09-20 18:39:55 +08:00
wait ( icur . seekLessThanOrEqual ( target ) ) ;
2018-08-29 04:46:14 +08:00
debug_printf ( " find%sE('%s'): %s \n " , cmp > 0 ? " GT " : ( cmp = = 0 ? " " : " LT " ) , target . toHexString ( 15 ) . c_str ( ) , icur . toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
// If we found the target key, return it as it is valid for any cmp option
2017-09-16 17:09:09 +08:00
if ( icur . valid ( ) & & icur . kvv . value . present ( ) & & icur . kvv . key = = key ) {
debug_printf ( " Reading full kv pair starting from: %s \n " , icur . kvv . toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > readFullKVPair ( self ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-15 20:19:39 +08:00
// FindEqual, so if we're still here we didn't find it.
if ( cmp = = 0 ) {
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2017-09-16 16:45:39 +08:00
// FindEqualOrGreaterThan, so if we're here we have to go to the next present record at the target version.
2017-09-15 20:19:39 +08:00
if ( cmp > 0 ) {
2017-09-16 16:45:39 +08:00
// icur is at a record < key, possibly before the start of the tree so move forward at least once.
loop {
2018-09-20 18:39:55 +08:00
wait ( icur . move ( true ) ) ;
2017-09-16 17:09:09 +08:00
if ( ! icur . valid ( ) | | icur . kvv . key > key )
2017-09-16 16:45:39 +08:00
break ;
}
// Get the next present key at the target version. Handles invalid cursor too.
2018-09-20 18:39:55 +08:00
wait ( self - > next ( needValue ) ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-16 16:45:39 +08:00
else if ( cmp < 0 ) {
2017-09-17 19:38:01 +08:00
// Move to previous present kv pair at the target version
2018-09-20 18:39:55 +08:00
wait ( self - > prev ( needValue ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2017-09-15 20:19:39 +08:00
ACTOR static Future < Void > next_impl ( Reference < Cursor > self , bool needValue ) {
2017-09-17 19:38:01 +08:00
// TODO: use needValue
2017-09-16 17:09:09 +08:00
state InternalCursor & i = self - > m_icursor ;
2017-09-15 20:19:39 +08:00
2018-06-17 21:48:41 +08:00
debug_printf ( " Cursor::next(): cursor %s \n " , i . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
2017-09-16 16:45:39 +08:00
// Make sure we are one record past the last user key
if ( self - > m_kv . present ( ) ) {
2017-09-16 17:09:09 +08:00
while ( i . valid ( ) & & i . kvv . key < = self - > m_kv . get ( ) . key ) {
2018-06-17 21:48:41 +08:00
debug_printf ( " Cursor::next(): Advancing internal cursor to get passed previous returned user key. cursor %s \n " , i . toString ( ) . c_str ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( i . move ( true ) ) ;
2017-09-16 16:45:39 +08:00
}
}
2017-09-16 08:27:13 +08:00
state Version v = self - > m_pagerSnapshot - > getVersion ( ) ;
2017-09-16 17:09:09 +08:00
state InternalCursor iLast ;
2017-09-16 08:27:13 +08:00
while ( 1 ) {
2017-09-16 17:09:09 +08:00
iLast = i ;
if ( ! i . valid ( ) )
2017-09-16 08:27:13 +08:00
break ;
2018-09-20 18:39:55 +08:00
wait ( i . move ( true ) ) ;
2018-06-17 21:48:41 +08:00
// If the previous cursor position was a set at a version at or before v and the new cursor position
// is not valid or a newer version of the same key or a different key, then get the full record
// for the previous cursor position
2017-09-16 17:09:09 +08:00
if ( iLast . kvv . version < = v
& & iLast . kvv . value . present ( )
2017-09-16 08:27:13 +08:00
& & (
2017-09-16 17:09:09 +08:00
! i . valid ( )
| | i . kvv . key ! = iLast . kvv . key
| | i . kvv . version > v
2017-09-16 08:27:13 +08:00
)
) {
2017-09-16 16:45:39 +08:00
// Assume that next is the most likely next move, so save the one-too-far cursor position.
2017-09-16 08:27:13 +08:00
std : : swap ( i , iLast ) ;
2018-06-17 21:48:41 +08:00
// readFullKVPair will have to go backwards to read the value
2018-09-20 18:39:55 +08:00
wait ( readFullKVPair ( self ) ) ;
2017-09-16 08:27:13 +08:00
std : : swap ( i , iLast ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
}
2017-09-16 08:27:13 +08:00
self - > m_kv = Optional < KeyValueRef > ( ) ;
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2017-09-17 19:38:01 +08:00
// Moves backward to the previous user-visible key/value pair at this cursor's snapshot version
// and loads it into m_kv. If the internal cursor runs out of records first, m_kv is left not present.
ACTOR static Future<Void> prev_impl(Reference<Cursor> self, bool needValue) {
	// TODO: use needValue
	state InternalCursor &cur = self->m_icursor;

	debug_printf(" Cursor::prev(): cursor %s \n ", cur.toString().c_str());

	// If a user key was returned previously, step back before every internal record at or after it.
	if(self->m_kv.present()) {
		while(cur.valid() && cur.kvv.key >= self->m_kv.get().key) {
			wait(cur.move(false));
		}
	}

	state Version readVersion = self->m_pagerSnapshot->getVersion();

	while(cur.valid()) {
		if(cur.kvv.version > readVersion) {
			// Record is newer than the read version, keep moving backward.
			wait(cur.move(false));
		}
		else {
			// Reached a record at or before readVersion: return it if present, otherwise skip its key.
			if(cur.kvv.value.present()) {
				wait(readFullKVPair(self));
				return Void();
			}

			// Value wasn't present as of the latest version <= readVersion, so move backward to a new key
			state Key clearedKey = cur.kvv.key;
			loop {
				wait(cur.move(false));
				if(!cur.valid() || cur.kvv.key != clearedKey)
					break;
			}
		}
	}

	// Ran off the front of the tree without finding a present record.
	self->m_kv = Optional<KeyValueRef>();
	return Void();
}
2017-09-09 16:29:25 +08:00
// Loads the record under the internal cursor (m_icursor) into m_kv, reassembling the value
// when it is stored as multiple parts. For a multi-part value the cursor must be on either
// the first part (valueIndex == 0) or the last part (valueIndex + part size == total size);
// the remaining parts are then read by moving forward or backward respectively.
ACTOR static Future<Void> readFullKVPair(Reference<Cursor> self) {
    state KeyVersionValue &kvv = self->m_icursor.kvv;
    state KeyValueRef &kv = (self->m_kv = KeyValueRef()).get();

    ASSERT(kvv.value.present());

    // The cursor's arena becomes the arena holding the key, and the key is taken from it.
    self->m_arena = kvv.arena();
    kv.key = kvv.key;

    // Single-part value: use it directly.
    if (!kvv.isMultiPart()) {
        kv.value = kvv.value.get();
        debug_printf(" readFullKVPair: Unsplit, exit. %s \n ", self->toString(" ").c_str());
        return Void();
    }

    // Multi-part value: starting on part 0 means walking forward, otherwise we must be on
    // the final part and walk backward.
    state bool forward = kvv.valueIndex == 0;
    ASSERT(forward || kvv.valueIndex + kvv.value.get().size() == kvv.valueTotalSize);

    debug_printf(" readFullKVPair: Split, fwd %d totalsize %lld %s \n ", forward, kvv.valueTotalSize, self->toString(" ").c_str());

    // Reserve room for the whole value in the same arena as the key.
    state int remaining = kvv.valueTotalSize;
    kv.value = makeString(remaining, self->m_arena);

    loop {
        debug_printf(" readFullKVPair: Adding chunk start %lld len %d total %lld dir %d \n ", kvv.valueIndex, kvv.value.get().size(), kvv.valueTotalSize, forward);

        // Copy this part to its offset within the assembled value.
        int chunkLen = kvv.value.get().size();
        memcpy(mutateString(kv.value) + kvv.valueIndex, kvv.value.get().begin(), chunkLen);
        remaining -= chunkLen;

        if (remaining == 0)
            break;
        ASSERT(remaining > 0);

        wait(self->m_icursor.move(forward));
        ASSERT(self->m_icursor.valid());
    }

    return Void();
}
} ;
} ;
2018-07-23 18:09:13 +08:00
// Definitions of VersionedBTree's static sentinel records.
// beginKVV is built from an empty key, version 0 and an empty value.
KeyVersionValueRef VersionedBTree : : beginKVV ( StringRef ( ) , 0 , StringRef ( ) ) ;
// endKVV is built from the literal key shown below, the maximum value of int as its version,
// and an empty value.
// NOTE(review): the version bound uses numeric_limits<int>, not the Version type - confirm this is intentional.
KeyVersionValueRef VersionedBTree : : endKVV ( LiteralStringRef ( " \xff \xff \xff \xff " ) , std : : numeric_limits < int > : : max ( ) , StringRef ( ) ) ;
2018-07-15 04:37:52 +08:00
// beginKey / endKey are the key fields of the packed forms of the sentinels above, so they
// must be defined after beginKVV / endKVV.
Key VersionedBTree : : beginKey ( beginKVV . pack ( ) . key ) ;
Key VersionedBTree : : endKey ( endKVV . pack ( ) . key ) ;
2017-08-23 02:30:44 +08:00
2017-10-02 18:32:22 +08:00
// Waits on f and passes its result through unchanged. If f fails, the error is rethrown to
// the caller; in addition, unless the error is actor_cancelled, it is also delivered to the
// `error` promise when that promise can still be set.
ACTOR template <class T>
Future<T> catchError(Promise<Void> error, Future<T> f) {
    try {
        T value = wait(f);
        return value;
    } catch (Error &e) {
        const bool cancelled = (e.code() == error_code_actor_cancelled);
        // canBeSet() is only consulted for non-cancellation errors.
        if (!cancelled && error.canBeSet()) {
            error.sendError(e);
        }
        throw;
    }
}
2017-09-22 14:51:55 +08:00
class KeyValueStoreRedwoodUnversioned : public IKeyValueStore {
2017-09-21 19:43:49 +08:00
public :
2017-09-22 14:51:55 +08:00
KeyValueStoreRedwoodUnversioned ( std : : string filePrefix , UID logID ) : m_filePrefix ( filePrefix ) {
2018-10-25 06:57:06 +08:00
// TODO: This constructor should really just take an IVersionedStore
IPager * pager = new IndirectShadowPager ( filePrefix ) ;
m_tree = new VersionedBTree ( pager , filePrefix , pager - > getUsablePageSize ( ) ) ;
m_init = catchError ( init_impl ( this ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual Future < Void > init ( ) {
return m_init ;
}
ACTOR Future < Void > init_impl ( KeyValueStoreRedwoodUnversioned * self ) {
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInit " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2018-09-20 18:39:55 +08:00
wait ( self - > m_tree - > init ( ) ) ;
2017-09-21 19:43:49 +08:00
Version v = wait ( self - > m_tree - > getLatestVersion ( ) ) ;
self - > m_tree - > setWriteVersion ( v + 1 ) ;
2018-10-25 06:57:06 +08:00
TraceEvent ( SevInfo , " RedwoodInitComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) ;
2017-09-21 19:43:49 +08:00
return Void ( ) ;
}
2017-10-02 18:32:22 +08:00
ACTOR void shutdown ( KeyValueStoreRedwoodUnversioned * self , bool dispose ) {
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdown " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2018-10-25 06:57:06 +08:00
if ( self - > m_error . canBeSet ( ) ) {
self - > m_error . sendError ( actor_cancelled ( ) ) ; // Ideally this should be shutdown_in_progress
}
2017-09-23 08:18:28 +08:00
self - > m_init . cancel ( ) ;
2018-10-25 06:57:06 +08:00
Future < Void > closedFuture = self - > m_tree - > onClosed ( ) ;
2017-10-02 18:32:22 +08:00
if ( dispose )
2018-10-25 06:57:06 +08:00
self - > m_tree - > dispose ( ) ;
2017-10-02 18:32:22 +08:00
else
2018-10-25 06:57:06 +08:00
self - > m_tree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-09-21 19:43:49 +08:00
self - > m_closed . send ( Void ( ) ) ;
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdownComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2017-10-02 18:32:22 +08:00
delete self ;
2017-09-21 19:43:49 +08:00
}
virtual void close ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , false ) ;
2017-09-21 19:43:49 +08:00
}
virtual void dispose ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , true ) ;
2017-09-21 19:43:49 +08:00
}
virtual Future < Void > onClosed ( ) {
return m_closed . getFuture ( ) ;
}
Future < Void > commit ( bool sequential = false ) {
2017-10-10 04:24:16 +08:00
Future < Void > c = m_tree - > commit ( ) ;
m_tree - > setWriteVersion ( m_tree - > getWriteVersion ( ) + 1 ) ;
2018-10-25 06:57:06 +08:00
return catchError ( c ) ;
2017-09-21 19:43:49 +08:00
}
virtual KeyValueStoreType getType ( ) {
2017-09-22 14:51:55 +08:00
return KeyValueStoreType : : SSD_REDWOOD_V1 ;
2017-09-21 19:43:49 +08:00
}
virtual StorageBytes getStorageBytes ( ) {
2018-10-25 06:57:06 +08:00
return m_tree - > getStorageBytes ( ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-25 06:57:06 +08:00
virtual Future < Void > getError ( ) {
return delayed ( m_error . getFuture ( ) ) ;
} ;
2017-09-21 19:43:49 +08:00
void clear ( KeyRangeRef range , const Arena * arena = 0 ) {
m_tree - > clear ( range ) ;
}
virtual void set ( KeyValueRef keyValue , const Arena * arena = NULL ) {
2017-09-23 08:18:28 +08:00
//printf("SET write version %lld %s\n", m_tree->getWriteVersion(), printable(keyValue).c_str());
2017-09-21 19:43:49 +08:00
m_tree - > set ( keyValue ) ;
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Standalone < VectorRef < KeyValueRef > > > readRange_impl ( KeyValueStoreRedwoodUnversioned * self , KeyRange keys , int rowLimit , int byteLimit ) {
2017-09-21 19:43:49 +08:00
state Standalone < VectorRef < KeyValueRef > > result ;
state int accumulatedBytes = 0 ;
ASSERT ( byteLimit > 0 ) ;
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2017-10-02 18:32:22 +08:00
2017-09-21 19:43:49 +08:00
if ( rowLimit > = 0 ) {
2018-09-20 18:39:55 +08:00
wait ( cur - > findFirstEqualOrGreater ( keys . begin , true , 0 ) ) ;
2017-09-21 19:43:49 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) < keys . end ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2018-10-25 06:57:06 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit ) {
2017-09-21 19:43:49 +08:00
break ;
2018-10-25 06:57:06 +08:00
}
2018-09-20 18:39:55 +08:00
wait ( cur - > next ( true ) ) ;
2017-09-21 19:43:49 +08:00
}
} else {
2018-09-20 18:39:55 +08:00
wait ( cur - > findLastLessOrEqual ( keys . end , true , 0 ) ) ;
2017-09-21 19:43:49 +08:00
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = keys . end )
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-21 19:43:49 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = keys . begin ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2018-10-25 06:57:06 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit ) {
2017-09-21 19:43:49 +08:00
break ;
2018-10-25 06:57:06 +08:00
}
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-21 19:43:49 +08:00
}
}
return result ;
}
virtual Future < Standalone < VectorRef < KeyValueRef > > > readRange ( KeyRangeRef keys , int rowLimit = 1 < < 30 , int byteLimit = 1 < < 30 ) {
2018-10-25 06:57:06 +08:00
return catchError ( readRange_impl ( this , keys , rowLimit , byteLimit ) ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Optional < Value > > readValue_impl ( KeyValueStoreRedwoodUnversioned * self , Key key , Optional < UID > debugID ) {
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findEqual ( key ) ) ;
2017-10-02 18:32:22 +08:00
if ( cur - > isValid ( ) ) {
2017-09-21 19:43:49 +08:00
return cur - > getValue ( ) ;
2017-10-02 18:32:22 +08:00
}
2017-09-21 19:43:49 +08:00
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValue ( KeyRef key , Optional < UID > debugID = Optional < UID > ( ) ) {
2018-10-25 06:57:06 +08:00
return catchError ( readValue_impl ( this , key , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2018-10-26 10:48:31 +08:00
ACTOR static Future < Optional < Value > > readValuePrefix_impl ( KeyValueStoreRedwoodUnversioned * self , Key key , int maxLength , Optional < UID > debugID ) {
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findEqual ( key ) ) ;
2017-09-21 19:43:49 +08:00
if ( cur - > isValid ( ) ) {
Value v = cur - > getValue ( ) ;
int len = std : : min ( v . size ( ) , maxLength ) ;
return Value ( cur - > getValue ( ) . substr ( 0 , len ) ) ;
}
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValuePrefix ( KeyRef key , int maxLength , Optional < UID > debugID = Optional < UID > ( ) ) {
2018-10-25 06:57:06 +08:00
return catchError ( readValuePrefix_impl ( this , key , maxLength , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual ~ KeyValueStoreRedwoodUnversioned ( ) {
2017-09-21 19:43:49 +08:00
} ;
private :
std : : string m_filePrefix ;
VersionedBTree * m_tree ;
Future < Void > m_init ;
Promise < Void > m_closed ;
2017-10-02 18:32:22 +08:00
Promise < Void > m_error ;
2018-10-25 06:57:06 +08:00
template < typename T > inline Future < T > catchError ( Future < T > f ) {
return : : catchError ( m_error , f ) ;
}
2017-09-21 19:43:49 +08:00
} ;
2017-09-22 14:51:55 +08:00
// Factory: allocates a KeyValueStoreRedwoodUnversioned for the given file name and log ID
// and hands it back through the IKeyValueStore interface.
IKeyValueStore* keyValueStoreRedwoodV1(std::string const& filename, UID logID) {
    IKeyValueStore *store = new KeyValueStoreRedwoodUnversioned(filename, logID);
    return store;
}
2018-09-28 07:07:29 +08:00
int randomSize ( int max ) {
2019-04-10 02:16:45 +08:00
return g_random - > randomInt ( 0 , max ) ;
2018-09-28 07:07:29 +08:00
}
2017-09-21 19:43:49 +08:00
2017-07-14 13:11:48 +08:00
// Builds a random key/value pair. The key length is drawn with randomSize(1 + keySize); the
// value length is drawn with randomSize(valueSize), or is 0 when valueSize is not positive.
// Key bytes and value bytes are drawn from two separate character ranges.
KeyValue randomKV(int keySize = 10, int valueSize = 5) {
    // Lengths are drawn first (key, then value) so the sequence of random draws is fixed.
    const int keyLen = randomSize(1 + keySize);
    const int valueLen = (valueSize > 0) ? randomSize(valueSize) : 0;

    KeyValue kv;
    kv.key = makeString(keyLen, kv.arena());
    kv.value = makeString(valueLen, kv.arena());

    auto keyBytes = mutateString(kv.key);
    for (int k = 0; k < keyLen; ++k) {
        keyBytes[k] = (uint8_t)g_random->randomInt(' a ', ' m ');
    }

    auto valueBytes = mutateString(kv.value);
    for (int k = 0; k < valueLen; ++k) {
        valueBytes[k] = (uint8_t)g_random->randomInt(' n ', ' z ');
    }

    return kv;
}
2017-09-16 16:45:39 +08:00
ACTOR Future < int > verifyRandomRange ( VersionedBTree * btree , Version v , std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > * written ) {
2017-09-15 20:19:39 +08:00
state int errors = 0 ;
2017-09-16 08:27:13 +08:00
state Key start = randomKV ( ) . key ;
state Key end = randomKV ( ) . key ;
2017-09-15 20:19:39 +08:00
if ( end < = start )
end = keyAfter ( start ) ;
2017-09-16 16:45:39 +08:00
debug_printf ( " VerifyRange '%s' to '%s' @%lld \n " , printable ( start ) . c_str ( ) , printable ( end ) . c_str ( ) , v ) ;
2017-09-15 20:19:39 +08:00
2017-09-16 16:45:39 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator i = written - > lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
2017-09-15 20:19:39 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iEnd = written - > upper_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
2017-09-16 08:27:13 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iLast ;
2017-09-15 20:19:39 +08:00
2017-09-17 19:38:01 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( v ) ;
// Randomly use the cursor for something else first.
if ( g_random - > coinflip ( ) ) {
2018-06-14 19:15:14 +08:00
debug_printf ( " VerifyRange: Dummy seek \n " ) ;
2017-09-21 15:58:56 +08:00
state Key randomKey = randomKV ( ) . key ;
2018-09-20 18:39:55 +08:00
wait ( g_random - > coinflip ( ) ? cur - > findFirstEqualOrGreater ( randomKey , true , 0 ) : cur - > findLastLessOrEqual ( randomKey , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
}
2018-06-14 19:15:14 +08:00
debug_printf ( " VerifyRange: Actual seek \n " ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > findFirstEqualOrGreater ( start , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
state std : : vector < KeyValue > results ;
2017-09-15 20:19:39 +08:00
while ( cur - > isValid ( ) & & cur - > getKey ( ) < end ) {
// Find the next written kv pair that would be present at this version
while ( 1 ) {
iLast = i ;
2017-09-16 08:27:13 +08:00
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( iLast = = iEnd ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( cur - > getKey ( ) ! = iLast - > first . first ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
if ( cur - > getValue ( ) ! = iLast - > second . get ( ) ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , iLast - > second . get ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-17 19:38:01 +08:00
results . push_back ( KeyValue ( KeyValueRef ( cur - > getKey ( ) , cur - > getValue ( ) ) ) ) ;
2018-09-20 18:39:55 +08:00
wait ( cur - > next ( true ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
// Make sure there are no further written kv pairs that would be present at this version.
while ( 1 ) {
iLast = i ;
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
}
if ( iLast ! = iEnd ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRange(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has @% " PRId64 " '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . second , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
}
2017-09-16 16:45:39 +08:00
2019-05-05 01:52:02 +08:00
debug_printf ( " VerifyRangeReverse '%s' to '%s' @% " PRId64 " \n " , printable ( start ) . c_str ( ) , printable ( end ) . c_str ( ) , v ) ;
2017-09-17 19:38:01 +08:00
// Randomly use a new cursor for the revere range read
if ( g_random - > coinflip ( ) ) {
cur = btree - > readAtVersion ( v ) ;
}
// Now read the range from the tree in reverse order and compare to the saved results
2018-09-20 18:39:55 +08:00
wait ( cur - > findLastLessOrEqual ( end , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = end )
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
state std : : vector < KeyValue > : : const_reverse_iterator r = results . rbegin ( ) ;
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = start ) {
if ( r = = results . rend ( ) ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getKey ( ) ! = r - > key ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
if ( cur - > getValue ( ) ! = r - > value ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , r - > value . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
break ;
}
+ + r ;
2018-09-20 18:39:55 +08:00
wait ( cur - > prev ( true ) ) ;
2017-09-17 19:38:01 +08:00
}
if ( r ! = results . rend ( ) ) {
errors + = 1 ;
2019-05-05 01:52:02 +08:00
printf ( " VerifyRangeReverse(@% " PRId64 " , %s, %s) ERROR: Tree range ended but written has '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
}
2017-09-15 20:19:39 +08:00
return errors ;
}
2018-09-28 15:35:03 +08:00
// Reads back every (key, version) entry of `written` whose version is at or below
// maxCommittedVersion, using findEqual() on a cursor opened at that entry's version.
// A set must be found with the written value; a clear must not be found under that key.
// Each mismatch is printed and counted; the count is returned.
ACTOR Future<int> verifyAll(VersionedBTree *btree, Version maxCommittedVersion, std::map<std::pair<std::string, Version>, Optional<std::string>> *written) {
    // Read back every key at every version set or cleared and verify the result.
    state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator i = written->cbegin();
    state std::map<std::pair<std::string, Version>, Optional<std::string>>::const_iterator iEnd = written->cend();
    state int errors = 0;

    while(i != iEnd) {
        state std::string key = i->first.first;
        state Version ver = i->first.second;
        // Entries above the committed version are skipped.
        if(ver <= maxCommittedVersion) {
            state Optional<std::string> val = i->second;

            state Reference<IStoreCursor> cur = btree->readAtVersion(ver);

            // Version is printed with PRId64, matching the printf calls below.
            debug_printf(" Verifying @% " PRId64 " '%s' \n ", ver, key.c_str());
            wait(cur->findEqual(key));

            if(val.present()) {
                if(!(cur->isValid() && cur->getKey() == key && cur->getValue() == val.get())) {
                    ++errors;
                    // Report the first thing that is wrong: no record, wrong key, or wrong value.
                    if(!cur->isValid())
                        printf(" Verify ERROR: key_not_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), val.get().c_str(), ver);
                    else if(cur->getKey() != key)
                        printf(" Verify ERROR: key_incorrect: found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), key.c_str(), ver);
                    else if(cur->getValue() != val.get())
                        printf(" Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @% " PRId64 " \n ", cur->getKey().toString().c_str(), cur->getValue().toString().c_str(), val.get().c_str(), ver);
                }
            } else {
                // The entry is a clear, so the key must not be readable at this version.
                if(cur->isValid() && cur->getKey() == key) {
                    ++errors;
                    printf(" Verify ERROR: cleared_key_found: '%s' -> '%s' @% " PRId64 " \n ", key.c_str(), cur->getValue().toString().c_str(), ver);
                }
            }
        }
        ++i;
    }
    return errors;
}
// Background verifier. For each version received on vStream it runs verifyAll() up to that
// version together with verifyRandomRange() at a random version, and adds the mismatches
// found to *pErrorCount. It stops after the first round that leaves *pErrorCount non-zero,
// or when the stream ends with end_of_stream; any other error is rethrown.
ACTOR Future<Void> verify(VersionedBTree *btree, FutureStream<Version> vStream, std::map<std::pair<std::string, Version>, Optional<std::string>> *written, int *pErrorCount) {
    try {
        loop {
            state Version v = waitNext(vStream);

            // Version is printed with PRId64, as in the other verification messages in this file.
            debug_printf(" Verifying through version % " PRId64 " \n ", v);
            // Both checks are started before either is waited on.
            state Future<int> vall = verifyAll(btree, v, written);
            state Future<int> vrange = verifyRandomRange(btree, g_random->randomInt(1, v + 1), written);
            wait(success(vall) && success(vrange));

            int errors = vall.get() + vrange.get();
            *pErrorCount += errors;

            debug_printf(" Verified through version % " PRId64 " , %d errors \n ", v, errors);

            if(*pErrorCount != 0)
                break;
        }
    } catch(Error &e) {
        // end_of_stream is the normal way for the caller to stop the verifier.
        if(e.code() != error_code_end_of_stream) {
            throw;
        }
    }
    return Void();
}
2018-10-05 14:46:37 +08:00
// Endless background reader: repeatedly seeks to a random key and steps forward through a
// random number of records (fewer than 100). Nothing read is checked and errors are neither
// trapped nor reported here.
ACTOR Future<Void> randomReader(VersionedBTree *btree) {
    state Reference<IStoreCursor> cur;
    loop {
        wait(yield());

        // A cursor is always created on the first pass; afterwards it is replaced whenever
        // the random draw exceeds .1, with a new one opened at a random version.
        const bool replaceCursor = !cur || g_random->random01() > .1;
        if (replaceCursor) {
            cur = btree->readAtVersion(g_random->randomInt(1, btree->getLastCommittedVersion() + 1));
        }

        wait(cur->findFirstEqualOrGreater(randomKV(10, 0).key, true, 0));

        state int stepsLeft = g_random->randomInt(0, 100);
        while (cur->isValid() && stepsLeft-- > 0) {
            wait(success(cur->next(true)));
            wait(yield());
        }
    }
}
2018-08-29 04:46:14 +08:00
2018-10-06 13:13:22 +08:00
TEST_CASE ( " !/redwood/correctness " ) {
2018-09-20 10:16:18 +08:00
state bool useDisk = true ; // MemoryPager is not being maintained currently.
2017-07-14 13:11:48 +08:00
2018-07-05 12:12:09 +08:00
state std : : string pagerFile = " unittest_pageFile " ;
2018-10-15 18:43:43 +08:00
IPager * pager ;
2018-08-29 04:46:14 +08:00
2018-10-15 18:43:43 +08:00
if ( useDisk ) {
deleteFile ( pagerFile ) ;
deleteFile ( pagerFile + " 0.pagerlog " ) ;
deleteFile ( pagerFile + " 1.pagerlog " ) ;
2017-09-22 14:51:55 +08:00
pager = new IndirectShadowPager ( pagerFile ) ;
2018-10-15 18:43:43 +08:00
}
2017-07-14 13:11:48 +08:00
else
pager = createMemoryPager ( ) ;
2018-09-24 17:42:23 +08:00
state int pageSize = g_random - > coinflip ( ) ? pager - > getUsablePageSize ( ) : g_random - > randomInt ( 200 , 400 ) ;
2017-10-10 04:24:16 +08:00
state VersionedBTree * btree = new VersionedBTree ( pager , pagerFile , pageSize ) ;
2018-09-20 18:39:55 +08:00
wait ( btree - > init ( ) ) ;
2017-06-10 05:56:41 +08:00
2018-09-28 07:07:29 +08:00
state int mutationBytesTarget = g_random - > randomInt ( 100 , 20e6 ) ;
2018-06-12 16:43:19 +08:00
2017-09-06 07:59:31 +08:00
// We must be able to fit at least two any two keys plus overhead in a page to prevent
// a situation where the tree cannot be grown upward with decreasing level size.
2018-06-12 16:43:19 +08:00
// TODO: Handle arbitrarily large keys
2018-09-24 17:42:23 +08:00
state int maxKeySize = std : : min ( pageSize * 8 , 30000 ) ;
2017-09-06 07:59:31 +08:00
ASSERT ( maxKeySize > 0 ) ;
2018-09-28 07:07:29 +08:00
state int maxValueSize = std : : min ( pageSize * 25 , 100000 ) ;
2017-09-06 07:59:31 +08:00
2018-09-28 07:07:29 +08:00
printf ( " Using page size %d, max key size %d, max value size %d, total mutation byte target %d \n " , pageSize , maxKeySize , maxValueSize , mutationBytesTarget ) ;
2017-09-06 07:59:31 +08:00
2017-08-22 13:29:57 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > written ;
state std : : set < Key > keys ;
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
state Version lastVer = wait ( btree - > getLatestVersion ( ) ) ;
2019-05-05 01:52:02 +08:00
printf ( " Starting from version: % " PRId64 " \n " , lastVer ) ;
2017-07-14 13:11:48 +08:00
2017-06-10 05:56:41 +08:00
state Version version = lastVer + 1 ;
2018-09-28 07:07:29 +08:00
state int mutationBytes = 0 ;
btree - > setWriteVersion ( version ) ;
2018-09-28 15:35:03 +08:00
2017-07-26 07:10:19 +08:00
state int64_t keyBytesInserted = 0 ;
state int64_t ValueBytesInserted = 0 ;
2018-09-28 15:35:03 +08:00
state int errorCount ;
2017-07-26 07:10:19 +08:00
2018-09-28 15:35:03 +08:00
state PromiseStream < Version > committedVersions ;
state Future < Void > verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount ) ;
2018-10-15 18:43:43 +08:00
state Future < Void > randomTask = randomReader ( btree ) | | btree - > getError ( ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
state Future < Void > commit = Void ( ) ;
2018-09-28 07:07:29 +08:00
while ( mutationBytes < mutationBytesTarget ) {
// Sometimes advance the version
if ( g_random - > random01 ( ) < 0.10 ) {
2017-07-15 02:37:08 +08:00
+ + version ;
2017-06-10 05:56:41 +08:00
btree - > setWriteVersion ( version ) ;
}
2018-09-28 07:07:29 +08:00
// Sometimes do a clear range
if ( g_random - > random01 ( ) < .10 ) {
Key start = randomKV ( maxKeySize , 1 ) . key ;
Key end = ( g_random - > random01 ( ) < .01 ) ? keyAfter ( start ) : randomKV ( maxKeySize , 1 ) . key ;
2017-06-10 05:56:41 +08:00
2018-09-28 07:07:29 +08:00
// Sometimes replace start and/or end with a close actual (previously used) value
if ( g_random - > random01 ( ) < .10 ) {
auto i = keys . upper_bound ( start ) ;
if ( i ! = keys . end ( ) )
start = * i ;
}
if ( g_random - > random01 ( ) < .10 ) {
auto i = keys . upper_bound ( end ) ;
if ( i ! = keys . end ( ) )
end = * i ;
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
if ( end = = start )
end = keyAfter ( start ) ;
else if ( end < start ) {
std : : swap ( end , start ) ;
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
KeyRangeRef range ( start , end ) ;
debug_printf ( " Clear '%s' to '%s' @%lld \n " , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , version ) ;
auto e = written . lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
if ( e ! = written . end ( ) ) {
auto last = e ;
auto eEnd = written . lower_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
while ( e ! = eEnd ) {
auto w = * e ;
+ + e ;
// If e key is different from last and last was present then insert clear for last's key at version
if ( last ! = eEnd & & ( ( e = = eEnd | | e - > first . first ! = last - > first . first ) & & last - > second . present ( ) ) ) {
debug_printf ( " Clearing key '%s' @%lld \n " , last - > first . first . c_str ( ) , version ) ;
mutationBytes + = ( last - > first . first . size ( ) + last - > second . get ( ) . size ( ) ) ;
// If the last set was at version then just make it not present
if ( last - > first . second = = version ) {
last - > second = Optional < std : : string > ( ) ;
}
else {
written [ std : : make_pair ( last - > first . first , version ) ] = Optional < std : : string > ( ) ;
}
}
last = e ;
}
}
2017-07-14 13:11:48 +08:00
2018-09-28 07:07:29 +08:00
btree - > clear ( range ) ;
2017-07-14 13:11:48 +08:00
}
2018-09-28 07:07:29 +08:00
else {
// Set a key
KeyValue kv = randomKV ( maxKeySize , maxValueSize ) ;
// Sometimes change key to a close previously used key
if ( g_random - > random01 ( ) < .01 ) {
auto i = keys . upper_bound ( kv . key ) ;
if ( i ! = keys . end ( ) )
kv . key = StringRef ( kv . arena ( ) , * i ) ;
}
keyBytesInserted + = kv . key . size ( ) ;
ValueBytesInserted + = kv . value . size ( ) ;
mutationBytes + = ( kv . key . size ( ) + kv . value . size ( ) ) ;
debug_printf ( " Set '%s' -> '%s' @%lld \n " , kv . key . toString ( ) . c_str ( ) , kv . value . toString ( ) . c_str ( ) , version ) ;
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
keys . insert ( kv . key ) ;
}
// Sometimes (and at end) commit then check all results
if ( mutationBytes > = std : : min ( mutationBytesTarget , ( int ) 20e6 ) | | g_random - > random01 ( ) < .002 ) {
2018-10-02 07:51:57 +08:00
// Wait for btree commit and send the new version to committedVersions.
// Avoid capture of version as a member of *this
Version v = version ;
commit = map ( commit & & btree - > commit ( ) , [ = ] ( Void ) {
// Notify the background verifier that version is committed and therefore readable
committedVersions . send ( v ) ;
return Void ( ) ;
} ) ;
2019-05-05 01:52:02 +08:00
printf ( " Cumulative: %d total mutation bytes, %lu key changes, % " PRId64 " key bytes, % " PRId64 " value bytes \n " , mutationBytes , written . size ( ) , keyBytesInserted , ValueBytesInserted ) ;
2018-09-28 07:07:29 +08:00
2018-10-02 07:51:57 +08:00
// Recover from disk at random
2018-09-28 07:07:29 +08:00
if ( useDisk & & g_random - > random01 ( ) < .1 ) {
2018-10-02 07:51:57 +08:00
printf ( " Recovering from disk. \n " ) ;
// Wait for outstanding commit
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
// Stop and wait for the verifier task
committedVersions . sendError ( end_of_stream ( ) ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-28 07:07:29 +08:00
wait ( closedFuture ) ;
2018-10-25 06:57:06 +08:00
debug_printf ( " Reopening btree \n " ) ;
2018-10-15 18:43:43 +08:00
IPager * pager = new IndirectShadowPager ( pagerFile ) ;
2018-09-28 07:07:29 +08:00
btree = new VersionedBTree ( pager , pagerFile , pageSize ) ;
wait ( btree - > init ( ) ) ;
Version v = wait ( btree - > getLatestVersion ( ) ) ;
ASSERT ( v = = version ) ;
2019-05-05 01:52:02 +08:00
printf ( " Recovered from disk. Latest version % " PRId64 " \n " , v ) ;
2017-06-10 05:56:41 +08:00
2018-09-28 15:35:03 +08:00
// Create new promise stream and start the verifier again
committedVersions = PromiseStream < Version > ( ) ;
verifyTask = verify ( btree , committedVersions . getFuture ( ) , & written , & errorCount ) ;
2018-10-15 18:43:43 +08:00
randomTask = randomReader ( btree ) | | btree - > getError ( ) ;
2017-06-10 05:56:41 +08:00
}
2018-09-28 15:35:03 +08:00
// Check for errors
if ( errorCount ! = 0 )
2018-09-28 07:07:29 +08:00
throw internal_error ( ) ;
2018-09-28 15:35:03 +08:00
+ + version ;
btree - > setWriteVersion ( version ) ;
2018-09-28 07:07:29 +08:00
}
2018-06-08 18:32:34 +08:00
2017-06-10 05:56:41 +08:00
}
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for outstanding commit \n " ) ;
wait ( commit ) ;
2018-09-28 15:35:03 +08:00
committedVersions . sendError ( end_of_stream ( ) ) ;
2018-10-02 07:51:57 +08:00
debug_printf ( " Waiting for verification to complete. \n " ) ;
2018-09-28 15:35:03 +08:00
wait ( verifyTask ) ;
2018-10-15 18:43:43 +08:00
Future < Void > closedFuture = btree - > onClosed ( ) ;
btree - > close ( ) ;
2018-09-20 18:39:55 +08:00
wait ( closedFuture ) ;
2017-07-14 13:11:48 +08:00
return Void ( ) ;
}
2018-10-06 13:13:22 +08:00
// Write-throughput benchmark for VersionedBTree::set() backed by an IndirectShadowPager.
//
// For each of a fixed number of versions it writes a random-sized batch of key/value pairs,
// and with probability 1/300 per version starts a commit that is left running while later
// versions are written. Cumulative byte/record counts and MB/s are printed each time a commit
// is started and once more after the tree has been closed.
TEST_CASE("!/redwood/performance/set") {
	state std::string pagerFile = "unittest_pageFile";
	// Delete any existing pager file and its two pager logs before creating the pager.
	deleteFile(pagerFile);
	deleteFile(pagerFile + "0.pagerlog");
	deleteFile(pagerFile + "1.pagerlog");
	IPager *pager = new IndirectShadowPager(pagerFile);
	// Name the tree after the same file the pager uses instead of repeating the literal.
	state VersionedBTree *btree = new VersionedBTree(pager, pagerFile);
	wait(btree->init());

	state int nodeCount = 10000000;
	state int maxChangesPerVersion = 1000;
	state int versions = 5000;
	int maxKeySize = 50;
	// Reusable buffers; each write uses a random-length prefix of them as key and value.
	state std::string key(maxKeySize, 'k');
	state std::string value(maxKeySize, 'v');
	state int64_t kvBytes = 0;
	state int records = 0;
	// The most recently started commit, if any; starts out already complete.
	state Future<Void> commit = Void();
	state double startTime = now();

	// Pre-decrement: the body runs (versions - 1) times.
	while(--versions) {
		Version lastVer = wait(btree->getLatestVersion());
		state Version version = lastVer + 1;
		btree->setWriteVersion(version);
		int changes = g_random->randomInt(0, maxChangesPerVersion);
		while(changes--) {
			KeyValue kv;
			// Overwrite the first 4 bytes of the key buffer with a random node id.
			// memcpy into &key[0] avoids writing through the const pointer returned by
			// data() and avoids a type-punned, possibly misaligned uint32_t store.
			uint32_t nodeId = g_random->randomInt(0, nodeCount);
			memcpy(&key[0], &nodeId, sizeof(nodeId));
			kv.key = StringRef((uint8_t *)key.data(), g_random->randomInt(10, key.size()));
			kv.value = StringRef((uint8_t *)value.data(), g_random->randomInt(0, value.size()));
			btree->set(kv);
			kvBytes += kv.key.size() + kv.value.size();
			++records;
		}

		if(g_random->random01() < (1.0 / 300)) {
			// Only one commit is kept outstanding: wait for the previous one before starting the next.
			wait(commit);
			commit = btree->commit();
			// The new commit is not awaited here, so these numbers are taken when it is started.
			double elapsed = now() - startTime;
			printf("Committed (cumulative) %" PRId64 " bytes in %d records in %f seconds, %.2f MB/s\n", kvBytes, records, elapsed, kvBytes / elapsed / 1e6);
		}
	}

	// Wait for the last commit started in the loop so that any error it produced is
	// observed, then commit whatever has been written since.
	wait(commit);
	wait(btree->commit());

	// The closed future is obtained before calling close().
	// NOTE(review): presumably btree must not be used after close() - confirm.
	Future<Void> closedFuture = btree->onClosed();
	btree->close();
	wait(closedFuture);

	// The final figure includes the time spent on the last commit and on closing the tree.
	double elapsed = now() - startTime;
	printf("Wrote (final) %" PRId64 " bytes in %d records in %f seconds, %.2f MB/s\n", kvBytes, records, elapsed, kvBytes / elapsed / 1e6);

	return Void();
}