2017-06-10 05:56:41 +08:00
/*
* VersionedBTree . actor . cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013 - 2018 Apple Inc . and the FoundationDB project authors
*
* Licensed under the Apache License , Version 2.0 ( the " License " ) ;
* you may not use this file except in compliance with the License .
* You may obtain a copy of the License at
*
* http : //www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS ,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
* See the License for the specific language governing permissions and
* limitations under the License .
*/
# include "flow/flow.h"
# include "IVersionedStore.h"
# include "IPager.h"
# include "fdbclient/Tuple.h"
# include "flow/serialize.h"
# include "flow/genericactors.actor.h"
# include "flow/UnitTest.h"
# include "MemoryPager.h"
2017-07-14 13:11:48 +08:00
# include "IndirectShadowPager.h"
2017-06-10 05:56:41 +08:00
# include <map>
# include <vector>
2017-08-04 15:01:25 +08:00
# include "fdbclient/CommitTransaction.h"
2017-09-21 19:43:49 +08:00
# include "IKeyValueStore.h"
2018-06-08 18:32:34 +08:00
# include "PrefixTree.h"
2018-07-23 18:09:13 +08:00
# include <string.h>
// Registers the arena of the Standalone `s` as a dependency of `arena`, then returns `s`
// as a const reference to its underlying T so it can be used like a plain Ref.
template <typename T>
inline const T& dependsOn(Arena& arena, const Standalone<T>& s) {
    const T& asRef = s;
    arena.dependsOn(s.arena());
    return asRef;
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
struct BTreePage {
enum EPageFlags { IS_LEAF = 1 } ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
uint8_t flags ;
uint16_t count ;
uint32_t kvBytes ;
PrefixTree tree ;
2017-06-10 05:56:41 +08:00
2018-06-14 19:15:14 +08:00
static inline int GetHeaderSize ( ) {
return sizeof ( BTreePage ) - sizeof ( PrefixTree ) ;
}
2018-07-15 04:37:52 +08:00
std : : string toString ( StringRef lowerBoundKey , StringRef upperBoundKey ) const {
2018-06-08 18:32:34 +08:00
std : : string r ;
2018-07-17 15:41:42 +08:00
r + = format ( " BTreePage %p tree %p flags 0x%X count %d kvBytes %d \n lowerBoundKey='%s' \n upperBoundKey='%s' " , this , & tree , ( int ) flags , ( int ) count , ( int ) kvBytes , lowerBoundKey . toHexString ( ) . c_str ( ) , upperBoundKey . toHexString ( ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
if ( count > 0 ) {
2018-07-15 04:37:52 +08:00
PrefixTree : : Cursor c = tree . getCursor ( lowerBoundKey , upperBoundKey ) ;
2018-06-08 18:32:34 +08:00
c . moveFirst ( ) ;
2018-07-15 04:37:52 +08:00
ASSERT ( c . valid ( ) ) ;
2018-06-08 18:32:34 +08:00
do {
2018-06-12 16:43:19 +08:00
r + = " \n " ;
2018-07-18 18:19:35 +08:00
Tuple t ;
try {
2018-07-23 18:09:13 +08:00
t = Tuple : : unpack ( c . getKeyRef ( ) ) ;
2018-07-18 18:19:35 +08:00
for ( int i = 0 ; i < t . size ( ) ; + + i ) {
if ( i ! = 0 )
r + = " , " ;
if ( t . getType ( i ) = = Tuple : : ElementType : : BYTES )
r + = format ( " '%s' " , t . getString ( i ) . printable ( ) . c_str ( ) ) ;
if ( t . getType ( i ) = = Tuple : : ElementType : : INT )
r + = format ( " %lld " , t . getInt ( i , true ) ) ;
}
} catch ( Error & e ) {
2018-06-08 18:32:34 +08:00
}
2018-07-23 18:09:13 +08:00
r + = format ( " ['%s'] " , c . getKeyRef ( ) . toHexString ( ) . c_str ( ) ) ;
2018-07-18 18:19:35 +08:00
2018-06-08 18:32:34 +08:00
r + = " -> " ;
if ( flags & & IS_LEAF )
2018-07-23 18:09:13 +08:00
r + = format ( " '%s' " , c . getValueRef ( ) . toHexString ( ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
else
2018-07-23 18:09:13 +08:00
r + = format ( " Page %u " , * ( const uint32_t * ) c . getValueRef ( ) . begin ( ) ) ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
} while ( c . moveNext ( ) ) ;
}
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
return r ;
2017-06-10 05:56:41 +08:00
}
2018-06-12 16:43:19 +08:00
} __attribute__ ( ( packed , aligned ( 1 ) ) ) ;
2018-06-08 18:32:34 +08:00
// Formats `page` as a B-tree page holding no records: the flags are set to newFlags, both
// counters are zeroed, and an empty prefix tree is built in place.
// pageSize is accepted but not used by this function.
static void writeEmptyPage(Reference<IPage> page, uint8_t newFlags, int pageSize) {
    BTreePage* const header = (BTreePage*)page->begin();

    header->flags = newFlags;
    header->count = 0;
    header->kvBytes = 0;

    header->tree.build(nullptr, nullptr, StringRef(), StringRef());
}
2017-06-10 05:56:41 +08:00
2018-07-10 17:24:01 +08:00
typedef std : : pair < Key , Reference < IPage > > BoundaryPagePairT ;
// Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages.
2018-06-08 18:32:34 +08:00
template < typename Allocator >
2018-07-23 18:09:13 +08:00
static std : : vector < BoundaryPagePairT > buildPages ( bool minimalBoundaries , StringRef lowerBound , StringRef upperBound , std : : vector < PrefixTree : : EntryRef > entries , uint8_t newFlags , Allocator const & newPageFn , int pageSize ) {
2018-07-15 04:37:52 +08:00
// Subtract space used for btree page and prefix tree headers to get prefix tree node space available/
pageSize - = ( BTreePage : : GetHeaderSize ( ) + PrefixTree : : GetHeaderSize ( ) ) ;
2018-07-10 17:24:01 +08:00
std : : vector < BoundaryPagePairT > pages ;
2018-06-08 18:32:34 +08:00
2018-07-15 04:37:52 +08:00
// TODO: Move all of this abstraction breaking stuff into PrefixTree in the form of a helper function or class.
int kvBytes = 0 ; // User key/value bytes in page
int overheadBytes = 0 ; // Overhead that could be needed for the records that will be in the prefix tree. TODO: Be more accurate
int uniqueBytes = 0 ; // Data byte count in serialized tree
2018-06-15 08:52:25 +08:00
2018-06-08 18:32:34 +08:00
int start = 0 ;
2018-06-15 08:52:25 +08:00
int i = 0 ;
2018-06-08 18:32:34 +08:00
const int iEnd = entries . size ( ) ;
2018-07-15 04:37:52 +08:00
StringRef pageLowerBound = lowerBound ;
2018-06-08 18:32:34 +08:00
2018-06-14 19:15:14 +08:00
while ( i < iEnd ) {
2018-07-17 15:41:42 +08:00
int common = commonPrefixLength ( entries [ i ] . key , ( i = = 0 ) ? pageLowerBound : entries [ i - 1 ] . key ) ;
2018-06-12 16:43:19 +08:00
int valueSize = entries [ i ] . value . size ( ) ;
2018-07-15 04:37:52 +08:00
// Calculate how many bytes will be added to each tracked metric
2018-06-12 16:43:19 +08:00
int kvAdd = entries [ i ] . key . size ( ) + valueSize ;
int uniqueAdd = kvAdd - common ;
2018-06-15 08:52:25 +08:00
int overheadAdd = PrefixTree : : Node : : getMaxOverhead ( i , entries [ i ] . key . size ( ) , entries [ i ] . value . size ( ) ) ;
2018-07-15 04:37:52 +08:00
int addedTreeBytes = uniqueAdd + overheadAdd ;
2018-07-17 15:41:42 +08:00
debug_printf ( " Trying to add index %d of %lu klen %d vlen %d common %d overhead %d key %s \n " , i + 1 , entries . size ( ) , entries [ i ] . key . size ( ) , entries [ i ] . value . size ( ) , common , overheadAdd , entries [ i ] . key . toHexString ( ) . c_str ( ) ) ;
2018-07-15 04:37:52 +08:00
// If adding the new item to the prefix tree will result in it being too large then write a page and start a new one
if ( ( overheadBytes + uniqueBytes + overheadAdd + uniqueAdd ) > pageSize ) {
2018-06-15 08:52:25 +08:00
ASSERT ( i ! = 0 ) ;
2018-07-17 15:41:42 +08:00
StringRef pageUpperBound = minimalBoundaries ? entries [ i ] . key . substr ( 0 , common + 1 ) : entries [ i ] . key ;
debug_printf ( " Flushing page start=%d i=%d lower='%s' upper='%s' \n " , start , i , pageLowerBound . toHexString ( ) . c_str ( ) , pageUpperBound . toHexString ( ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
Reference < IPage > page = newPageFn ( ) ;
BTreePage * btpage = ( BTreePage * ) page - > begin ( ) ;
btpage - > flags = newFlags ;
btpage - > kvBytes = kvBytes ;
btpage - > count = i - start ;
2018-07-15 04:37:52 +08:00
int written = btpage - > tree . build ( & entries [ start ] , & entries [ i ] , pageLowerBound , pageUpperBound ) ;
2018-06-12 16:43:19 +08:00
if ( written > pageSize ) {
2018-07-15 04:37:52 +08:00
printf ( " ERROR: Wrote %d bytes to %d byte page. recs %d uniqueBytes %d overheadBytes %d \n " , written , pageSize , i - start , uniqueBytes , overheadBytes ) ;
2018-06-12 16:43:19 +08:00
ASSERT ( false ) ;
}
2018-07-15 04:37:52 +08:00
pages . push_back ( { pageLowerBound , page } ) ;
2018-06-08 18:32:34 +08:00
start = i ;
kvBytes = 0 ;
2018-07-15 04:37:52 +08:00
uniqueBytes = 0 ;
overheadBytes = 0 ;
pageLowerBound = pageUpperBound ;
2017-06-10 05:56:41 +08:00
}
2018-06-12 16:43:19 +08:00
kvBytes + = kvAdd ;
uniqueBytes + = uniqueAdd ;
2018-07-15 04:37:52 +08:00
overheadBytes + = overheadAdd ;
2018-06-08 18:32:34 +08:00
+ + i ;
2017-06-10 05:56:41 +08:00
}
2018-06-14 19:15:14 +08:00
// Flush last page, if not empty
if ( i > start ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " Flushing page start=%d i=%d lower='%s' upper='%s' \n " , start , i , pageLowerBound . toHexString ( ) . c_str ( ) , upperBound . toHexString ( ) . c_str ( ) ) ;
2018-06-14 19:15:14 +08:00
Reference < IPage > page = newPageFn ( ) ;
BTreePage * btpage = ( BTreePage * ) page - > begin ( ) ;
btpage - > flags = newFlags ;
btpage - > kvBytes = kvBytes ;
btpage - > count = i - start ;
2018-07-15 04:37:52 +08:00
int written = btpage - > tree . build ( & entries [ start ] , & entries [ i ] , pageLowerBound , upperBound ) ;
2018-06-14 19:15:14 +08:00
if ( written > pageSize ) {
2018-07-15 04:37:52 +08:00
printf ( " ERROR: Wrote %d bytes to %d byte page. recs %d uniqueBytes %d overheadBytes %d \n " , written , pageSize , i - start , uniqueBytes , overheadBytes ) ;
2018-06-14 19:15:14 +08:00
ASSERT ( false ) ;
}
2018-07-15 04:37:52 +08:00
pages . push_back ( { pageLowerBound , page } ) ;
2018-06-14 19:15:14 +08:00
}
2018-06-08 18:32:34 +08:00
//debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size());
return pages ;
}
2017-06-10 05:56:41 +08:00
2018-07-23 18:09:13 +08:00
// Internal key/value records represent either a cleared key at a version or a shard of a value of a key at a version.
// When constructing and packing these it is assumed that the key and value memory is being held elsewhere.
struct KeyVersionValueRef {
KeyVersionValueRef ( ) : version ( invalidVersion ) , valueTotalSize ( 0 ) , valueIndex ( - 1 ) { }
// Cleared key at version
KeyVersionValueRef ( KeyRef key , Version ver , Optional < ValueRef > val = { } , int64_t totalSize = - 1 , int64_t index = - 1 )
: key ( key ) , version ( ver )
{
// Non-split value
if ( val . present ( ) & & totalSize < 0 ) {
valueTotalSize = value . get ( ) . size ( ) ;
valueIndex = 0 ;
}
}
KeyVersionValueRef ( Arena & a , const KeyVersionValueRef & toCopy ) {
key = KeyRef ( a , toCopy . key ) ;
version = toCopy . version ;
if ( toCopy . value . present ( ) ) {
value = ValueRef ( a , toCopy . value . get ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2018-07-23 18:09:13 +08:00
valueTotalSize = toCopy . valueTotalSize ;
valueIndex = toCopy . valueIndex ;
}
static inline Key searchKey ( StringRef key , Version ver ) {
Tuple t ;
t . append ( key ) ;
t . append ( ver ) ;
Standalone < VectorRef < uint8_t > > packed = t . getData ( ) ;
packed . append ( packed . arena ( ) , ( const uint8_t * ) " \xff " , 1 ) ;
return Key ( KeyRef ( packed . begin ( ) , packed . size ( ) ) , packed . arena ( ) ) ;
}
KeyRef key ;
2017-08-25 08:25:53 +08:00
Version version ;
2018-07-23 18:09:13 +08:00
int64_t valueTotalSize ; // Total size of value, including all other KVV parts if multipart
int64_t valueIndex ; // Index within reconstituted value of this part
Optional < ValueRef > value ;
2017-08-25 08:25:53 +08:00
2018-07-23 18:09:13 +08:00
// Result undefined if value is not present
bool isMultiPart ( ) const { return value . get ( ) . size ( ) ! = valueTotalSize ; }
2017-08-28 16:57:01 +08:00
bool valid ( ) const { return version ! = invalidVersion ; }
2017-08-25 08:25:53 +08:00
2018-07-23 18:09:13 +08:00
// Generate a kv shard from a complete kv
KeyVersionValueRef split ( int start , int len ) {
ASSERT ( value . present ( ) ) ;
return KeyVersionValueRef ( key , version , value . get ( ) . substr ( start , len ) , value . get ( ) . size ( ) , start ) ;
}
// Encode the record for writing to a btree page.
// If copyValue is false, the value is not copied into the returned arena.
//
// Encoded forms:
// userKey, version - the value is present and complete (which includes an empty value)
// userKey, version, valueSize=0 - the key was deleted as of this version
// userKey, version, valueSize>=0, valuePart - the value is present and spans multiple records
inline PrefixTree : : Entry pack ( bool copyValue = true ) const {
Tuple t ;
t . append ( key ) ;
t . append ( version ) ;
if ( ! value . present ( ) ) {
t . append ( 0 ) ;
}
else {
if ( isMultiPart ( ) ) {
t . append ( valueTotalSize ) ;
t . append ( valueIndex ) ;
}
}
Key k = t . getDataAsStandalone ( ) ;
ValueRef v ;
if ( value . present ( ) ) {
v = copyValue ? StringRef ( k . arena ( ) , value . get ( ) ) : value . get ( ) ;
}
return PrefixTree : : Entry ( { k , v } , k . arena ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2017-09-06 07:59:31 +08:00
// Supports partial/incomplete encoded sequences.
2018-07-23 18:09:13 +08:00
// Unpack an encoded key/value pair.
// Both key and value will be in the returned arena unless copyValue is false in which case
// the value will not be copied to the arena.
static Standalone < KeyVersionValueRef > unpack ( KeyValueRef kv , bool copyValue = true ) {
debug_printf ( " Unpacking: '%s' -> '%s' \n " , kv . key . toHexString ( 15 ) . c_str ( ) , kv . value . toHexString ( 15 ) . c_str ( ) ) ;
Standalone < KeyVersionValueRef > result ;
2017-09-06 07:59:31 +08:00
if ( kv . key . size ( ) ! = 0 ) {
2018-07-18 18:19:35 +08:00
Tuple k = Tuple : : unpack ( kv . key ) ;
2018-07-23 18:09:13 +08:00
int s = k . size ( ) ;
switch ( s ) {
case 4 :
// Value shard
result . valueIndex = k . getInt ( 3 ) ;
result . valueTotalSize = k . getInt ( 2 ) ;
result . value = kv . value ;
break ;
case 3 :
// Deleted or Complete value
result . valueIndex = 0 ;
result . valueTotalSize = k . getInt ( 2 ) ;
// If not a clear, set the value, otherwise it remains non-present
if ( result . valueTotalSize ! = 0 )
result . value = kv . value ;
break ;
default :
result . valueIndex = 0 ;
result . valueTotalSize = kv . value . size ( ) ;
result . value = kv . value ;
break ;
} ;
if ( s > 0 ) {
Key sk = k . getString ( 0 ) ;
result . arena ( ) . dependsOn ( sk . arena ( ) ) ;
result . key = sk ;
if ( s > 1 ) {
result . version = k . getInt ( 1 ) ;
2017-09-06 07:59:31 +08:00
}
}
}
2018-07-23 18:09:13 +08:00
if ( copyValue & & result . value . present ( ) ) {
result . value = StringRef ( result . arena ( ) , result . value . get ( ) ) ;
2017-09-06 07:59:31 +08:00
}
return result ;
2017-08-25 08:25:53 +08:00
}
2018-07-23 18:09:13 +08:00
static Standalone < KeyVersionValueRef > unpack ( KeyRef k ) {
return unpack ( KeyValueRef ( k , StringRef ( ) ) ) ;
2017-08-25 08:25:53 +08:00
}
std : : string toString ( ) const {
2018-07-23 18:09:13 +08:00
std : : string r ;
r + = format ( " '%s' @%lld -> " , key . toHexString ( 10 ) . c_str ( ) , version ) ;
r + = value . present ( ) ? format ( " '%s' %d/%d " , value . get ( ) . toHexString ( 10 ) . c_str ( ) , valueIndex , valueTotalSize ) . c_str ( ) : " <cleared> " ;
return r ;
2017-08-25 08:25:53 +08:00
}
} ;
2018-07-23 18:09:13 +08:00
typedef Standalone < KeyVersionValueRef > KeyVersionValue ;
2017-06-10 05:56:41 +08:00
# define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); }
class VersionedBTree : public IVersionedStore {
public :
2018-07-15 04:37:52 +08:00
// The first possible internal record in the tree
2018-07-23 18:09:13 +08:00
static KeyVersionValueRef beginKVV ;
2018-07-15 04:37:52 +08:00
// A record which is greater than the last possible record in the tree
2018-07-23 18:09:13 +08:00
static KeyVersionValueRef endKVV ;
2018-07-15 04:37:52 +08:00
// The encoded key form of the above two things.
static Key beginKey ;
static Key endKey ;
2017-06-10 05:56:41 +08:00
// The following overrides are placeholders: each body is the NOT_IMPLEMENTED macro, which expands
// to UNSTOPPABLE_ASSERT(false), so calling any of them trips that assertion.
virtual Future < Void > getError ( ) NOT_IMPLEMENTED
virtual Future < Void > onClosed ( ) NOT_IMPLEMENTED
virtual void dispose ( ) NOT_IMPLEMENTED
virtual void close ( ) NOT_IMPLEMENTED
virtual KeyValueStoreType getType ( ) NOT_IMPLEMENTED
virtual bool supportsMutation ( int op ) NOT_IMPLEMENTED
virtual StorageBytes getStorageBytes ( ) NOT_IMPLEMENTED
// Writes are provided in an ordered stream.
// A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion()
// A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns
virtual void set(KeyValueRef keyValue) {
    // Mutations already buffered for exactly this key, ordered by version.
    SingleKeyMutationsByVersion& keyHistory = insertMutationBoundary(keyValue.key)->second.startKeyMutations;

    // Nothing to record if the newest buffered mutation is already a set to exactly the same value.
    const bool alreadySet = !keyHistory.empty() && keyHistory.rbegin()->second.equalToSet(keyValue.value);
    if (alreadySet)
        return;

    keyHistory[m_writeVersion] = SingleKeyMutation(keyValue.value);
}
virtual void clear(KeyRangeRef range) {
    // Make sure both ends of the range exist as boundaries in the mutation buffer.
    MutationBufferT::iterator boundary = insertMutationBoundary(range.begin);
    const MutationBufferT::iterator stop = insertMutationBoundary(range.end);

    // Visit every boundary in [range.begin, range.end).
    for (; boundary != stop; ++boundary) {
        RangeMutation& mutation = boundary->second;

        // Only the first clear of a sub-range sets its rangeClearVersion.
        if (!mutation.rangeClearVersion.present())
            mutation.rangeClearVersion = m_writeVersion;

        // Record a clear of the boundary key itself unless its newest mutation is already a clear.
        SingleKeyMutationsByVersion& keyHistory = mutation.startKeyMutations;
        const bool endsWithClear = !keyHistory.empty() && keyHistory.rbegin()->second.isClear();
        if (!endsWithClear)
            keyHistory[m_writeVersion] = SingleKeyMutation();
    }
}
2017-08-22 13:29:57 +08:00
2017-06-10 05:56:41 +08:00
// Placeholder: the body is NOT_IMPLEMENTED, which expands to UNSTOPPABLE_ASSERT(false).
virtual void mutate ( int op , StringRef param1 , StringRef param2 ) NOT_IMPLEMENTED
// Versions [begin, end) no longer readable
// Placeholder as well: the body is NOT_IMPLEMENTED.
virtual void forgetVersions ( Version begin , Version end ) NOT_IMPLEMENTED
// Returns the current write version if one has been set; otherwise asks the pager for its
// latest version.
virtual Future<Version> getLatestVersion() {
    if (m_writeVersion == invalidVersion)
        return m_pager->getLatestVersion();

    return m_writeVersion;
}
2017-09-23 08:18:28 +08:00
// Current value of m_writeVersion (invalidVersion until setWriteVersion() has been called).
Version getWriteVersion() {
    const Version current = m_writeVersion;
    return current;
}
2017-09-21 19:43:49 +08:00
// Current value of m_lastCommittedVersion.
Version getLastCommittedVersion() {
    const Version committed = m_lastCommittedVersion;
    return committed;
}
2017-10-10 04:24:16 +08:00
// Creates a B-tree on top of `pager` and starts asynchronous initialization (see init()).
//   name             - stored in m_name.
//   target_page_size - if positive and smaller than the pager's usable page size, used as the
//                      page size instead; otherwise the pager's usable page size is used.
VersionedBTree(IPager* pager, std::string name, int target_page_size = -1)
  : m_pager(pager),
    m_writeVersion(invalidVersion),
    m_pageSize(pager->getUsablePageSize()),
    m_lastCommittedVersion(invalidVersion),
    m_pBuffer(nullptr),
    m_name(name)
{
    const bool useSmallerPage = target_page_size > 0 && target_page_size < m_pageSize;
    if (useSmallerPage) {
        m_pageSize = target_page_size;
    }

    // Until a real commit happens, the "latest commit" is the initialization itself.
    m_init = init_impl(this);
    m_latestCommit = m_init;
}
2017-09-21 19:43:49 +08:00
// Initialization actor. Sets the root page id to 0 and reads the pager's latest version. If that
// version is 0, writes an empty leaf page as the root at version 1 and commits it. Finally
// records the resulting version as the last committed version.
ACTOR static Future<Void> init_impl(VersionedBTree* self) {
    self->m_root = 0;
    state Version latest = wait(self->m_pager->getLatestVersion());

    if (latest == 0) {
        ++latest;

        Reference<IPage> emptyRoot = self->m_pager->newPageBuffer();
        writeEmptyPage(emptyRoot, BTreePage::IS_LEAF, self->m_pageSize);
        self->writePage(self->m_root, emptyRoot, latest, StringRef(), StringRef());

        self->m_pager->setLatestVersion(latest);
        Void _ = wait(self->m_pager->commit());
    }

    self->m_lastCommittedVersion = latest;
    return Void();
}
2017-09-23 08:18:28 +08:00
// Future of the initialization started by the constructor.
Future<Void> init() {
    return m_init;
}
2017-06-10 05:56:41 +08:00
2017-08-22 13:29:57 +08:00
// Cancels the initialization future and then the latest commit future.
virtual ~VersionedBTree() {
    m_init.cancel();
    m_latestCommit.cancel();
}
2017-06-10 05:56:41 +08:00
// readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed
// to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations.
// The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less
// than or equal to the given version.
// If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same
// write version, OR it may represent a snapshot as of the call to readAtVersion().
virtual Reference<IStoreCursor> readAtVersion(Version v) {
    // TODO: Use the buffer to return uncommitted data
    // For now, only committed versions can be read.
    ASSERT(v <= m_lastCommittedVersion);

    // The cursor reads version v through the pager, starting from the current root page.
    Reference<IStoreCursor> cursor(new Cursor(v, m_pager, m_root));
    return cursor;
}
// Must be nondecreasing
virtual void setWriteVersion(Version v) {
    ASSERT(v > m_lastCommittedVersion);

    if (m_pBuffer != nullptr) {
        // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null
        ASSERT(v >= m_writeVersion);
    } else {
        // There is no current mutation buffer: create one in the buffer map and point m_pBuffer at it.
        // When starting a new mutation buffer its start version must be greater than the last write version
        ASSERT(v > m_writeVersion);
        m_pBuffer = &m_mutationBuffers[v];

        // Create range representing the entire keyspace. This reduces edge cases to applying mutations
        // because now all existing keys are within some range in the mutation map.
        (*m_pBuffer)[beginKVV.key];
        (*m_pBuffer)[endKVV.key];
    }

    m_writeVersion = v;
}
// Starts a commit of the current mutation buffer. When there is no current buffer, returns the
// future of the latest commit instead.
virtual Future<Void> commit() {
    if (m_pBuffer != nullptr)
        return commit_impl(this);

    return m_latestCommit;
}
private :
2018-07-15 04:37:52 +08:00
// Logs the page (id, version, boundaries and a full dump) via debug_printf, then hands it to the
// pager to be written as logical page `id` at version `ver`. The boundary keys are used only for
// the debug output.
void writePage(LogicalPageID id, Reference<IPage> page, Version ver, StringRef pageLowerBound, StringRef pageUpperBound) {
    debug_printf(" page write: id=%d ver=%lld lower='%s' upper='%s' \n ", id, ver, pageLowerBound.toHexString().c_str(), pageUpperBound.toHexString().c_str());
    debug_printf(" page write: %s \n ", ((const BTreePage*)page->begin())->toString(pageLowerBound, pageUpperBound).c_str());

    m_pager->writePage(id, page, ver);
}
LogicalPageID m_root ;
2017-08-22 13:29:57 +08:00
typedef std : : pair < Key , LogicalPageID > KeyPagePairT ;
2017-07-05 14:41:48 +08:00
typedef std : : pair < Version , std : : vector < KeyPagePairT > > VersionedKeyToPageSetT ;
typedef std : : vector < VersionedKeyToPageSetT > VersionedChildrenT ;
2017-08-04 15:01:25 +08:00
2017-08-25 08:25:53 +08:00
// Represents a change to a single key - set, clear, or atomic op
struct SingleKeyMutation {
// Clear
SingleKeyMutation ( ) : op ( MutationRef : : ClearRange ) { }
// Set
SingleKeyMutation ( Value val ) : op ( MutationRef : : SetValue ) , value ( val ) { }
// Atomic Op
SingleKeyMutation ( MutationRef : : Type op , Value val ) : op ( op ) , value ( val ) { }
2017-08-22 13:29:57 +08:00
MutationRef : : Type op ;
2017-08-25 08:25:53 +08:00
Value value ;
2017-08-04 15:01:25 +08:00
2017-08-22 13:29:57 +08:00
inline bool isClear ( ) const { return op = = MutationRef : : ClearRange ; }
inline bool isSet ( ) const { return op = = MutationRef : : SetValue ; }
2017-08-25 08:25:53 +08:00
inline bool isAtomicOp ( ) const { return ! isSet ( ) & & ! isClear ( ) ; }
inline bool equalToSet ( ValueRef val ) { return isSet ( ) & & value = = val ; }
2018-07-23 18:09:13 +08:00
// The returned packed key will be added to arena, the value will just point to the SingleKeyMutation's memory
inline KeyVersionValueRef toKVV ( KeyRef userKey , Version version ) const {
2017-09-06 07:59:31 +08:00
// No point in serializing an atomic op, it needs to be coalesced to a real value.
2017-08-25 08:25:53 +08:00
ASSERT ( ! isAtomicOp ( ) ) ;
if ( isClear ( ) )
2018-07-23 18:09:13 +08:00
return KeyVersionValueRef ( userKey , version ) ;
2017-08-22 13:29:57 +08:00
2018-07-23 18:09:13 +08:00
return KeyVersionValueRef ( userKey , version , value ) ;
2017-08-25 08:25:53 +08:00
}
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
std : : string toString ( ) const {
2017-08-26 06:48:32 +08:00
return format ( " op=%d val='%s' " , op , printable ( value ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-04 15:01:25 +08:00
} ;
2017-08-25 08:25:53 +08:00
// Represents mutations on a single key and a possible clear to a range that begins
// immediately after that key
typedef std : : map < Version , SingleKeyMutation > SingleKeyMutationsByVersion ;
struct RangeMutation {
// Mutations for exactly the start key
SingleKeyMutationsByVersion startKeyMutations ;
// A clear range version, if cleared, for the range starting immediately AFTER the start key
Optional < Version > rangeClearVersion ;
// Returns true if this RangeMutation doesn't actually mutate anything
bool noChanges ( ) const {
return ! rangeClearVersion . present ( ) & & startKeyMutations . empty ( ) ;
}
std : : string toString ( ) const {
std : : string result ;
result . append ( " rangeClearVersion: " ) ;
if ( rangeClearVersion . present ( ) )
result . append ( format ( " %lld " , rangeClearVersion . get ( ) ) ) ;
else
result . append ( " <not present> " ) ;
result . append ( " startKeyMutations: " ) ;
for ( SingleKeyMutationsByVersion : : value_type const & m : startKeyMutations )
result . append ( format ( " [%lld => %s] " , m . first , m . second . toString ( ) . c_str ( ) ) ) ;
return result ;
}
} ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
typedef std : : map < Key , RangeMutation > MutationBufferT ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
/* Mutation Buffer Overview
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* MutationBuffer maps the start of a range to a RangeMutation . The end of the range is
* the next range start in the map .
2017-08-22 13:29:57 +08:00
*
2018-07-15 04:37:52 +08:00
* - The buffer starts out with keys ' ' and endKVV . key already populated .
2017-08-25 08:25:53 +08:00
*
* - When a new key is inserted into the buffer map , it is by definition
* splitting an existing range so it should take on the rangeClearVersion of
* the immediately preceding key which is the start of that range
2017-08-22 13:29:57 +08:00
*
* - Keys are inserted into the buffer map for every individual operation ( set / clear / atomic )
* key and for both the start and end of a range clear .
2017-08-25 08:25:53 +08:00
*
2017-08-22 13:29:57 +08:00
* - To apply a single clear , add it to the individual ops only if the last entry is not also a clear .
*
2017-08-25 08:25:53 +08:00
* - To apply a range clear , after inserting the new range boundaries do the following to the start
* boundary and all successive boundaries < end
* - set the range clear version if not already set
* - add a clear to the startKeyMutations if the final entry is not a clear .
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* - Note that there are actually TWO valid ways to represent
* set c = val1 at version 1
* clear c \ x00 to z at version 2
* with this model . Either
* c = { rangeClearVersion = 2 , startKeyMutations = { 1 = > val1 }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
* OR
* c = { rangeClearVersion = < not present > , startKeyMutations = { 1 = > val1 }
* c \ x00 = { rangeClearVersion = 2 , startKeyMutations = { 2 = > < not present > }
* z = { rangeClearVersion = < not present > , startKeyMutations = { }
2017-08-22 13:29:57 +08:00
*
2017-08-25 08:25:53 +08:00
* This is because the rangeClearVersion applies to a range beginning with the first
* key AFTER the start key , so that the logic for reading the start key is more simple
* as it only involves consulting startKeyMutations . When adding a clear range , the
* boundary key insert / split described above is valid , and is what is currently done ,
* but it would also be valid to see if the last key before startKey is equal to
* keyBefore ( startKey ) , and if so that mutation buffer boundary key can be used instead
* without adding an additional key to the buffer .
2017-08-22 13:29:57 +08:00
*/
2017-08-28 16:57:01 +08:00
void printMutationBuffer ( MutationBufferT : : const_iterator begin , MutationBufferT : : const_iterator end ) const {
2017-08-25 08:25:53 +08:00
# if REDWOOD_DEBUG
debug_printf ( " ------------------------------------- \n " ) ;
debug_printf ( " BUFFER \n " ) ;
while ( begin ! = end ) {
debug_printf ( " '%s': %s \n " , printable ( begin - > first ) . c_str ( ) , begin - > second . toString ( ) . c_str ( ) ) ;
+ + begin ;
}
debug_printf ( " ------------------------------------- \n " ) ;
# endif
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Dumps every entry of the mutation buffer `buf`.
void printMutationBuffer(MutationBufferT* buf) const {
    const MutationBufferT& buffer = *buf;
    printMutationBuffer(buffer.begin(), buffer.end());
}
2017-08-22 13:29:57 +08:00
2017-09-23 08:18:28 +08:00
// Find or create a mutation buffer boundary for bound and return an iterator to it
2017-08-25 08:25:53 +08:00
MutationBufferT::iterator insertMutationBoundary(Key boundary) {
    ASSERT(m_pBuffer != nullptr);
    MutationBufferT& buffer = *m_pBuffer;

    // Find the first split point in buffer that is >= key
    MutationBufferT::iterator found = buffer.lower_bound(boundary);

    // Since the initial state of the mutation buffer contains the range '' through
    // the maximum possible key, our search had to have found something.
    ASSERT(found != buffer.end());

    // The boundary already exists: nothing to insert.
    if (found->first == boundary)
        return found;

    // `found` is the entry just after the new boundary, so it doubles as the insert hint.
    MutationBufferT::iterator inserted = buffer.insert(found, MutationBufferT::value_type(boundary, RangeMutation()));

    // `inserted` is certainly > begin() because it is guaranteed that the empty string
    // boundary exists and the only way to have found that is to look explicitly
    // for it in which case we would have returned above.
    MutationBufferT::iterator preceding = inserted;
    --preceding;

    // The new boundary splits the preceding range, so it takes on that range's clear: both the
    // range clear version and a clear of the new boundary key at that version.
    const Optional<Version>& inheritedClear = preceding->second.rangeClearVersion;
    if (inheritedClear.present()) {
        inserted->second.rangeClearVersion = inheritedClear;
        inserted->second.startKeyMutations[inheritedClear.get()] = SingleKeyMutation();
    }

    return inserted;
}
2017-07-26 07:10:19 +08:00
2018-07-10 17:24:01 +08:00
void buildNewRoot ( Version version , std : : vector < BoundaryPagePairT > & pages , std : : vector < LogicalPageID > & logicalPageIDs ) {
2017-07-14 02:32:14 +08:00
// While there are multiple child pages for this version we must write new tree levels.
while ( pages . size ( ) > 1 ) {
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > childEntries ;
2017-07-14 02:32:14 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-07-23 18:09:13 +08:00
childEntries . emplace_back ( pages [ i ] . first , StringRef ( ( unsigned char * ) & logicalPageIDs [ i ] , sizeof ( uint32_t ) ) ) ;
2017-07-14 02:32:14 +08:00
2017-07-14 13:11:48 +08:00
int oldPages = pages . size ( ) ;
2018-07-17 15:41:42 +08:00
pages = buildPages ( false , beginKey , endKey , childEntries , 0 , [ = ] ( ) { return m_pager - > newPageBuffer ( ) ; } , m_pageSize ) ;
2017-07-14 13:11:48 +08:00
// If there isn't a reduction in page count then we'll build new root levels forever.
ASSERT ( pages . size ( ) < oldPages ) ;
2017-07-14 02:32:14 +08:00
2017-07-14 13:11:48 +08:00
debug_printf ( " Writing a new root level at version %lld with %lu children across %lu pages \n " , version , childEntries . size ( ) , pages . size ( ) ) ;
2017-07-14 02:32:14 +08:00
// Allocate logical page ids for the new level
logicalPageIDs . clear ( ) ;
// Only reuse root if there's one replacement page being written or if the subtree root is not the tree root
if ( pages . size ( ) = = 1 )
logicalPageIDs . push_back ( m_root ) ;
// Allocate enough pageIDs for all of the pages
for ( int i = logicalPageIDs . size ( ) ; i < pages . size ( ) ; i + + )
logicalPageIDs . push_back ( m_pager - > allocateLogicalPage ( ) ) ;
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-07-15 04:37:52 +08:00
writePage ( logicalPageIDs [ i ] , pages [ i ] . second , version , pages [ i ] . first , ( i = = pages . size ( ) - 1 ) ? endKey : pages [ i + 1 ] . first ) ;
2017-07-14 02:32:14 +08:00
}
}
2017-07-05 14:41:48 +08:00
// Returns list of (version, list of (lower_bound, list of children) )
2017-09-23 08:18:28 +08:00
ACTOR static Future < VersionedChildrenT > commitSubtree ( VersionedBTree * self , MutationBufferT * mutationBuffer , Reference < IPagerSnapshot > snapshot , LogicalPageID root , Key lowerBoundKey , Key upperBoundKey ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p commitSubtree: root=%d lower='%s' upper='%s' \n " , this , root , lowerBoundKey . toHexString ( ) . c_str ( ) , upperBoundKey . toHexString ( ) . c_str ( ) ) ;
2017-07-05 14:41:48 +08:00
2018-07-23 18:09:13 +08:00
// Decode the (likely truncate) upper and lower bound keys for this subtree.
2017-08-25 08:25:53 +08:00
state KeyVersionValue lowerBoundKVV = KeyVersionValue : : unpack ( lowerBoundKey ) ;
state KeyVersionValue upperBoundKVV = KeyVersionValue : : unpack ( upperBoundKey ) ;
// Find the slice of the mutation buffer that is relevant to this subtree
2017-09-23 08:18:28 +08:00
state MutationBufferT : : const_iterator iMutationBoundary = mutationBuffer - > lower_bound ( lowerBoundKVV . key ) ;
state MutationBufferT : : const_iterator iMutationBoundaryEnd = mutationBuffer - > lower_bound ( upperBoundKVV . key ) ;
2017-08-25 08:25:53 +08:00
// If the lower bound key and the upper bound key are the same then there can't be any changes to
// this subtree since changes would happen after the upper bound key as the mutated versions would
// necessarily be higher.
if ( lowerBoundKVV . key = = upperBoundKVV . key ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p no changes, lower and upper bound keys are the same. \n " , this ) ;
2017-06-10 05:56:41 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
2017-08-25 08:25:53 +08:00
// If the mutation buffer key found is greater than the lower bound key then go to the previous mutation
// buffer key because it may cover deletion of some keys at the start of this subtree.
2017-09-23 08:18:28 +08:00
if ( iMutationBoundary ! = mutationBuffer - > begin ( ) & & iMutationBoundary - > first > lowerBoundKVV . key ) {
2017-08-25 08:25:53 +08:00
- - iMutationBoundary ;
2017-08-28 21:28:49 +08:00
}
2017-08-25 08:25:53 +08:00
else {
// If the there are no mutations, we're done
if ( iMutationBoundary = = iMutationBoundaryEnd ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p no changes, mutation buffer start/end are the same \n " , this ) ;
2017-08-25 08:25:53 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
}
2017-08-29 08:26:53 +08:00
// TODO: Check if entire subtree is erased and return no pages, also have the previous pages deleted as of
// the cleared version.
2017-08-25 08:25:53 +08:00
// Another way to have no mutations is to have a single mutation range cover this
// subtree but have no changes in it
MutationBufferT : : const_iterator iMutationBoundaryNext = iMutationBoundary ;
+ + iMutationBoundaryNext ;
2017-08-31 16:23:12 +08:00
if ( iMutationBoundaryNext = = iMutationBoundaryEnd & & iMutationBoundary - > second . noChanges ( ) ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p no changes because sole mutation range was empty \n " , this ) ;
2017-08-25 08:25:53 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
}
2017-06-10 05:56:41 +08:00
Reference < const IPage > rawPage = wait ( snapshot - > getPhysicalPage ( root ) ) ;
2018-07-23 18:09:13 +08:00
BTreePage * page = ( BTreePage * ) rawPage - > begin ( ) ;
2018-07-18 18:19:35 +08:00
debug_printf ( " %p commitSubtree: page read: id=%d ver=%lld lower='%s' upper='%s' \n " , this , root , snapshot - > getVersion ( ) , lowerBoundKey . toHexString ( ) . c_str ( ) , upperBoundKey . toHexString ( ) . c_str ( ) ) ;
debug_printf ( " %p commitSubtree: page read: id=%d %s \n " , this , root , page - > toString ( lowerBoundKey , upperBoundKey ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
2018-07-15 04:37:52 +08:00
PrefixTree : : Cursor existingCursor = page - > tree . getCursor ( lowerBoundKey , upperBoundKey ) ;
2018-06-08 18:32:34 +08:00
bool existingCursorValid = existingCursor . moveFirst ( ) ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
if ( page - > flags & BTreePage : : IS_LEAF ) {
2017-06-10 05:56:41 +08:00
VersionedChildrenT results ;
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > merged ;
Arena mergedArena ;
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
debug_printf ( " %p MERGING EXISTING DATA WITH MUTATIONS: \n " , this ) ;
2017-08-26 06:48:32 +08:00
self - > printMutationBuffer ( iMutationBoundary , iMutationBoundaryEnd ) ;
2017-08-22 13:29:57 +08:00
// It's a given that the mutation map is not empty so it's safe to do this
2017-08-25 08:25:53 +08:00
Key mutationRangeStart = iMutationBoundary - > first ;
2017-08-22 13:29:57 +08:00
2018-06-08 18:32:34 +08:00
// There will be multiple loops advancing existing cursor, existing KVV will track its current value
2017-08-22 13:29:57 +08:00
KeyVersionValue existing ;
2018-07-23 18:09:13 +08:00
if ( existingCursorValid ) {
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
}
2017-08-22 13:29:57 +08:00
// If replacement pages are written they will be at the minimum version seen in the mutations for this leaf
2017-08-28 16:57:01 +08:00
Version minVersion = invalidVersion ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
// Now, process each mutation range and merge changes with existing data.
while ( iMutationBoundary ! = iMutationBoundaryEnd ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p New mutation boundary: '%s': %s \n " , this , printable ( iMutationBoundary - > first ) . c_str ( ) , iMutationBoundary - > second . toString ( ) . c_str ( ) ) ;
2017-08-23 02:30:44 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutations ;
2017-08-22 13:29:57 +08:00
2017-08-28 21:28:49 +08:00
// If the mutation boundary key is less than the lower bound key then skip startKeyMutations for
// this bounary, we're only processing this mutation range here to apply any clears to existing data.
if ( iMutationBoundary - > first < lowerBoundKVV . key )
iMutations = iMutationBoundary - > second . startKeyMutations . end ( ) ;
2017-08-25 08:25:53 +08:00
// If the mutation boundary key is the same as the page lowerBound key then start reading single
// key mutations at the first version greater than the lowerBoundKey version.
2017-08-28 21:28:49 +08:00
else if ( iMutationBoundary - > first = = lowerBoundKVV . key )
2017-08-25 08:25:53 +08:00
iMutations = iMutationBoundary - > second . startKeyMutations . upper_bound ( lowerBoundKVV . version ) ;
else
iMutations = iMutationBoundary - > second . startKeyMutations . begin ( ) ;
2017-08-22 13:29:57 +08:00
2017-08-25 08:25:53 +08:00
SingleKeyMutationsByVersion : : const_iterator iMutationsEnd = iMutationBoundary - > second . startKeyMutations . end ( ) ;
// Output old versions of the mutation boundary key
2018-06-08 18:32:34 +08:00
while ( existingCursorValid & & existing . key = = iMutationBoundary - > first ) {
2018-07-23 18:09:13 +08:00
// Don't copy the value because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , existingCursor . getKV ( false ) ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p: Added %s [existing, boundary start] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-08-25 08:25:53 +08:00
}
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
// TODO: If a mutation set is equal to the previous existing value of the key, maybe don't write it.
2017-08-25 08:25:53 +08:00
// Output mutations for the mutation boundary start key
2017-08-22 13:29:57 +08:00
while ( iMutations ! = iMutationsEnd ) {
2017-09-06 07:59:31 +08:00
const SingleKeyMutation & m = iMutations - > second ;
2018-06-15 08:52:25 +08:00
int maxPartSize = std : : min ( 255 , self - > m_pageSize / 4 ) ;
2017-09-20 04:03:30 +08:00
if ( m . isClear ( ) | | m . value . size ( ) < = maxPartSize ) {
2017-09-16 08:27:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2018-07-23 18:09:13 +08:00
// Don't copy the value because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , iMutations - > second . toKVV ( iMutationBoundary - > first , iMutations - > first ) . pack ( false ) ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p: Added %s [mutation, boundary start] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
else {
2018-07-23 18:09:13 +08:00
if ( iMutations - > first < minVersion | | minVersion = = invalidVersion )
minVersion = iMutations - > first ;
2017-09-06 07:59:31 +08:00
int bytesLeft = m . value . size ( ) ;
2018-07-23 18:09:13 +08:00
int start = 0 ;
KeyVersionValueRef whole ( iMutationBoundary - > first , iMutations - > first , m . value ) ;
2017-09-06 07:59:31 +08:00
while ( bytesLeft > 0 ) {
int partSize = std : : min ( bytesLeft , maxPartSize ) ;
bytesLeft - = partSize ;
2018-07-23 18:09:13 +08:00
start + = partSize ;
// Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it
merged . push_back ( dependsOn ( mergedArena , whole . split ( start , partSize ) . pack ( false ) ) ) ;
debug_printf ( " %p: Added %s [mutation, boundary start] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-09-06 07:59:31 +08:00
}
}
2017-08-22 13:29:57 +08:00
+ + iMutations ;
}
2017-06-10 05:56:41 +08:00
2017-08-25 08:25:53 +08:00
// Get the clear version for this range, which is the last thing that we need from it,
Optional < Version > clearRangeVersion = iMutationBoundary - > second . rangeClearVersion ;
// Advance to the next boundary because we need to know the end key for the current range.
+ + iMutationBoundary ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Mutation range end: '%s' \n " , this , printable ( iMutationBoundary - > first ) . c_str ( ) ) ;
2017-08-29 08:26:53 +08:00
2017-08-25 08:25:53 +08:00
// Write existing keys which are less than the next mutation boundary key, clearing if needed.
2018-06-08 18:32:34 +08:00
while ( existingCursorValid & & existing . key < iMutationBoundary - > first ) {
2018-07-23 18:09:13 +08:00
merged . push_back ( existingCursor . getKVRef ( ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p: Added %s [existing, middle] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
// Write a clear of this key if needed. A clear is required if clearRangeVersion is set and the next key is different
// than this one. Note that the next key might be the in our right sibling, we can use the page upperBound to get that.
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
2017-08-25 08:25:53 +08:00
KeyVersionValue nextEntry ;
2018-06-08 18:32:34 +08:00
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
nextEntry = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-08-25 08:25:53 +08:00
else
nextEntry = upperBoundKVV ;
if ( clearRangeVersion . present ( ) & & existing . key ! = nextEntry . key ) {
2017-08-28 16:57:01 +08:00
Version clearVersion = clearRangeVersion . get ( ) ;
if ( clearVersion < minVersion | | minVersion = = invalidVersion )
minVersion = clearVersion ;
2018-07-23 18:09:13 +08:00
merged . push_back ( dependsOn ( mergedArena , KeyVersionValueRef ( existing . key , clearVersion ) . pack ( false ) ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p: Added %s [existing, middle clear] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-22 13:29:57 +08:00
}
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
if ( existingCursorValid )
2017-08-25 08:25:53 +08:00
existing = nextEntry ;
2017-08-22 13:29:57 +08:00
}
2017-08-26 06:48:32 +08:00
}
2017-06-10 05:56:41 +08:00
2017-08-26 06:48:32 +08:00
// Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range.
2018-06-08 18:32:34 +08:00
while ( existingCursorValid ) {
2018-07-23 18:09:13 +08:00
merged . push_back ( existingCursor . getKVRef ( ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p: Added %s [existing, tail] \n " , this , KeyVersionValue : : unpack ( merged . back ( ) ) . toString ( ) . c_str ( ) ) ;
2017-08-25 08:25:53 +08:00
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
if ( existingCursorValid )
2018-07-23 18:09:13 +08:00
existing = KeyVersionValue : : unpack ( existingCursor . getKVRef ( ) ) ;
2017-06-10 05:56:41 +08:00
}
2017-08-25 08:25:53 +08:00
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Done merging mutations into existing leaf contents \n " , this ) ;
2017-08-28 18:53:29 +08:00
2017-08-29 08:26:53 +08:00
// No changes were actually made. This could happen if there is a clear which does not cover an entire leaf but also does
// not which turns out to not match any existing data in the leaf.
2017-08-28 18:53:29 +08:00
if ( minVersion = = invalidVersion ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p No changes were made during mutation merge \n " , this ) ;
2017-08-28 16:57:01 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
2017-08-28 18:53:29 +08:00
}
2017-08-28 16:57:01 +08:00
2017-08-22 13:29:57 +08:00
// TODO: Make version and key splits based on contents of merged list
2017-06-10 05:56:41 +08:00
IPager * pager = self - > m_pager ;
2018-07-17 15:41:42 +08:00
std : : vector < BoundaryPagePairT > pages = buildPages ( true , lowerBoundKey , upperBoundKey , merged , BTreePage : : IS_LEAF , [ pager ] ( ) { return pager - > newPageBuffer ( ) ; } , self - > m_pageSize ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
// If there isn't still just a single page of data then return the previous lower bound and page ID that lead to this page to be used for version 0
if ( pages . size ( ) ! = 1 ) {
2017-06-10 05:56:41 +08:00
results . push_back ( { 0 , { { lowerBoundKey , root } } } ) ;
2017-07-05 14:41:48 +08:00
}
2017-06-10 05:56:41 +08:00
// For each IPage of data, assign a logical pageID.
std : : vector < LogicalPageID > logicalPages ;
// Only reuse first page if only one page is being returned or if root is not the btree root.
if ( pages . size ( ) = = 1 | | root ! = self - > m_root )
logicalPages . push_back ( root ) ;
// Allocate enough pageIDs for all of the pages
for ( int i = logicalPages . size ( ) ; i < pages . size ( ) ; i + + )
logicalPages . push_back ( self - > m_pager - > allocateLogicalPage ( ) ) ;
2017-09-15 20:19:39 +08:00
if ( pages . size ( ) = = 1 )
minVersion = 0 ;
2017-06-10 05:56:41 +08:00
// Write each page using its assigned page ID
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Writing %lu replacement pages for %d at version %lld \n " , this , pages . size ( ) , root , minVersion ) ;
2017-06-10 05:56:41 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-07-15 04:37:52 +08:00
self - > writePage ( logicalPages [ i ] , pages [ i ] . second , minVersion , pages [ i ] . first , ( i = = pages . size ( ) - 1 ) ? upperBoundKey : pages [ i + 1 ] . first ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page
2017-08-28 21:28:49 +08:00
if ( root = = self - > m_root ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Building new root \n " , this ) ;
2018-07-10 17:24:01 +08:00
self - > buildNewRoot ( minVersion , pages , logicalPages ) ;
2017-08-28 21:28:49 +08:00
}
2017-07-14 02:32:14 +08:00
2017-06-10 05:56:41 +08:00
results . push_back ( { minVersion , { } } ) ;
for ( int i = 0 ; i < pages . size ( ) ; i + + ) {
2017-08-28 16:57:01 +08:00
// The lower bound of the first page is the lower bound of the subtree, not the first entry in the page
2018-07-10 17:24:01 +08:00
Key lowerBound = ( i = = 0 ) ? lowerBoundKey : pages [ i ] . first ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Adding page to results: %s => %d \n " , this , lowerBound . toHexString ( ) . c_str ( ) , logicalPages [ i ] ) ;
2017-08-28 16:57:01 +08:00
results . back ( ) . second . push_back ( { lowerBound , logicalPages [ i ] } ) ;
2017-06-10 05:56:41 +08:00
}
2018-07-17 15:41:42 +08:00
debug_printf ( " %p DONE. \n " , this ) ;
2017-06-10 05:56:41 +08:00
return results ;
}
else {
2018-06-08 18:32:34 +08:00
state std : : vector < Future < VersionedChildrenT > > futureChildren ;
state std : : vector < LogicalPageID > childPageIDs ;
2018-06-15 08:52:25 +08:00
bool first = true ;
2018-06-08 18:32:34 +08:00
while ( existingCursorValid ) {
// The lower bound for the first child is lowerBoundKey
Key childLowerBound = first ? lowerBoundKey : existingCursor . getKey ( ) ;
if ( first )
first = false ;
2018-07-23 18:09:13 +08:00
uint32_t pageID = * ( uint32_t * ) existingCursor . getValueRef ( ) . begin ( ) ;
2018-06-08 18:32:34 +08:00
existingCursorValid = existingCursor . moveNext ( ) ;
2017-06-10 05:56:41 +08:00
2018-07-18 18:19:35 +08:00
// The upper bound for the last child is upperBoundKey, and the cursor's next key for the others.
2018-06-08 18:32:34 +08:00
Key childUpperBound = existingCursorValid ? existingCursor . getKey ( ) : upperBoundKey ;
2017-08-28 16:57:01 +08:00
2017-08-28 18:53:29 +08:00
ASSERT ( childLowerBound < = childUpperBound ) ;
2018-06-08 18:32:34 +08:00
futureChildren . push_back ( self - > commitSubtree ( self , mutationBuffer , snapshot , pageID , childLowerBound , childUpperBound ) ) ;
childPageIDs . push_back ( pageID ) ;
2017-06-10 05:56:41 +08:00
}
2018-06-08 18:32:34 +08:00
Void _ = wait ( waitForAll ( futureChildren ) ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
bool modified = false ;
2018-06-08 18:32:34 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
const VersionedChildrenT & children = futureChildren [ i ] . get ( ) ;
if ( children . size ( ) ! = 1 | | children [ 0 ] . second . size ( ) ! = 1 ) {
2017-07-05 14:41:48 +08:00
modified = true ;
2017-06-10 05:56:41 +08:00
break ;
}
}
2017-07-14 02:32:14 +08:00
if ( ! modified ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p not modified. \n " , this ) ;
2017-06-10 05:56:41 +08:00
return VersionedChildrenT ( { { 0 , { { lowerBoundKey , root } } } } ) ;
2017-07-14 02:32:14 +08:00
}
2017-06-10 05:56:41 +08:00
Version version = 0 ;
VersionedChildrenT result ;
loop { // over version splits of this page
Version nextVersion = std : : numeric_limits < Version > : : max ( ) ;
2018-07-23 18:09:13 +08:00
std : : vector < PrefixTree : : EntryRef > childEntries ; // Logically std::vector<std::pair<std::string, LogicalPageID>> childEntries;
2017-06-10 05:56:41 +08:00
// For each Future<VersionedChildrenT>
2018-07-17 15:41:42 +08:00
debug_printf ( " %p creating replacement pages for id=%d at Version %lld \n " , this , root , version ) ;
2017-07-05 14:41:48 +08:00
// If we're writing version 0, there is a chance that we don't have to write ourselves, if there are no changes
bool modified = version ! = 0 ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
for ( int i = 0 ; i < futureChildren . size ( ) ; + + i ) {
LogicalPageID pageID = childPageIDs [ i ] ;
const VersionedChildrenT & children = futureChildren [ i ] . get ( ) ;
2017-07-05 14:41:48 +08:00
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Versioned page set that replaced page %d: %lu versions \n " , this , pageID , children . size ( ) ) ;
2017-06-10 05:56:41 +08:00
for ( auto & versionedPageSet : children ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p version: %lld \n " , this , versionedPageSet . first ) ;
2017-06-10 05:56:41 +08:00
for ( auto & boundaryPage : versionedPageSet . second ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p '%s' -> %u \n " , this , printable ( boundaryPage . first ) . c_str ( ) , boundaryPage . second ) ;
2017-06-10 05:56:41 +08:00
}
}
// Find the first version greater than the current version we are writing
auto cv = std : : upper_bound ( children . begin ( ) , children . end ( ) , version , [ ] ( Version a , VersionedChildrenT : : value_type const & b ) { return a < b . first ; } ) ;
// If there are no versions before the one we found, just update nextVersion and continue.
if ( cv = = children . begin ( ) ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p First version (%lld) in set is greater than current, setting nextVersion and continuing \n " , this , cv - > first ) ;
2017-06-10 05:56:41 +08:00
nextVersion = std : : min ( nextVersion , cv - > first ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p curr %lld next %lld \n " , this , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
continue ;
}
// If a version greater than the current version being written was found, update nextVersion
if ( cv ! = children . end ( ) ) {
nextVersion = std : : min ( nextVersion , cv - > first ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p curr %lld next %lld \n " , this , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
}
// Go back one to the last version that was valid prior to or at the current version we are writing
- - cv ;
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Using children for version %lld from this set, building version %lld \n " , this , cv - > first , version ) ;
2017-07-05 14:41:48 +08:00
// If page count isn't 1 then the root is definitely modified
modified = modified | | cv - > second . size ( ) ! = 1 ;
2017-06-10 05:56:41 +08:00
// Add the children at this version to the child entries list for the current version being built.
for ( auto & childPage : cv - > second ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Adding child page '%s' \n " , this , printable ( childPage . first ) . c_str ( ) ) ;
2018-07-23 18:09:13 +08:00
childEntries . emplace_back ( childPage . first , StringRef ( ( unsigned char * ) & childPage . second , sizeof ( uint32_t ) ) ) ;
2017-06-10 05:56:41 +08:00
}
}
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Finished pass through futurechildren. childEntries=%lu version=%lld nextVersion=%lld \n " , this , childEntries . size ( ) , version , nextVersion ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
if ( modified ) {
// TODO: Track split points across iterations of this loop, so that they don't shift unnecessarily and
// cause unnecessary path copying
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
IPager * pager = self - > m_pager ;
2018-07-17 15:41:42 +08:00
std : : vector < BoundaryPagePairT > pages = buildPages ( false , lowerBoundKey , upperBoundKey , childEntries , 0 , [ pager ] ( ) { return pager - > newPageBuffer ( ) ; } , self - > m_pageSize ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
// For each IPage of data, assign a logical pageID.
std : : vector < LogicalPageID > logicalPages ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
// Only reuse first page if only one page is being returned or if root is not the btree root.
if ( pages . size ( ) = = 1 | | root ! = self - > m_root )
logicalPages . push_back ( root ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
// Allocate enough pageIDs for all of the pages
for ( int i = logicalPages . size ( ) ; i < pages . size ( ) ; i + + )
logicalPages . push_back ( self - > m_pager - > allocateLogicalPage ( ) ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
// Write each page using its assigned page ID
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Writing %lu internal pages \n " , this , pages . size ( ) ) ;
2017-07-05 14:41:48 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-07-15 04:37:52 +08:00
self - > writePage ( logicalPages [ i ] , pages [ i ] . second , version , pages [ i ] . first , ( i = = pages . size ( ) - 1 ) ? upperBoundKey : pages [ i + 1 ] . first ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 02:32:14 +08:00
// If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page
if ( root = = self - > m_root )
2018-07-10 17:24:01 +08:00
self - > buildNewRoot ( version , pages , logicalPages ) ;
2017-07-14 02:32:14 +08:00
2017-07-05 14:41:48 +08:00
result . resize ( result . size ( ) + 1 ) ;
result . back ( ) . first = version ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
for ( int i = 0 ; i < pages . size ( ) ; i + + )
2018-07-10 17:24:01 +08:00
result . back ( ) . second . push_back ( { pages [ i ] . first , logicalPages [ i ] } ) ;
2017-06-10 05:56:41 +08:00
2017-07-05 14:41:48 +08:00
if ( result . size ( ) > 1 & & result . back ( ) . second = = result . end ( ) [ - 2 ] . second ) {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Output same as last version, popping it. \n " , this ) ;
2017-07-05 14:41:48 +08:00
result . pop_back ( ) ;
}
}
else {
2018-07-17 15:41:42 +08:00
debug_printf ( " %p Version 0 has no changes \n " , this ) ;
2017-07-05 14:41:48 +08:00
result . push_back ( { 0 , { { lowerBoundKey , root } } } ) ;
}
2017-06-10 05:56:41 +08:00
if ( nextVersion = = std : : numeric_limits < Version > : : max ( ) )
break ;
version = nextVersion ;
}
2018-07-17 15:41:42 +08:00
debug_printf ( " %p DONE. \n " , this ) ;
2017-06-10 05:56:41 +08:00
return result ;
}
}
// Commits the current mutation buffer of `self` at self->m_writeVersion.
//
// The buffer is detached from self immediately, so no further mutations can be added to it.
// The commit itself starts only after the previously started commit has finished. Once the
// pager commit completes, the buffer is removed from m_mutationBuffers, m_lastCommittedVersion
// is updated, and the future that was stored in m_latestCommit for this commit is fulfilled.
ACTOR static Future<Void> commit_impl(VersionedBTree *self) {
	// The write version could change while this commit is in progress, so capture it now.
	state Version commitVersion = self->m_writeVersion;

	// Take the buffer to commit and leave self without a writable buffer.
	state MutationBufferT *pendingBuffer = self->m_pBuffer;
	self->m_pBuffer = nullptr;

	// The latest mutation buffer start version is the one we will now (or eventually) commit.
	state Version bufferStartVersion = self->m_mutationBuffers.rbegin()->first;

	// Publish this commit's future as the latest commit, keeping hold of the one it replaces.
	state Promise<Void> commitDone;
	Future<Void> priorCommit = self->m_latestCommit;
	self->m_latestCommit = commitDone.getFuture();

	// Commits run one at a time: wait for the one that started before this one.
	Void _ = wait(priorCommit);
	debug_printf(" %s: Beginning commit of version %lld \n ", self->m_name.c_str(), commitVersion);

	// Existing pages are read as of the pager's latest version.
	Version latestVersion = wait(self->m_pager->getLatestVersion());
	debug_printf(" %s: pager latestVersion %lld \n ", self->m_name.c_str(), latestVersion);

	self->printMutationBuffer(pendingBuffer);

	// Apply the buffer to the whole tree, starting from the root. The returned child list is not needed here.
	VersionedChildrenT _ = wait(commitSubtree(self, pendingBuffer, self->m_pager->getReadSnapshot(latestVersion), self->m_root, beginKey, endKey));

	self->m_pager->setLatestVersion(commitVersion);
	debug_printf(" %s: Committing pager %lld \n ", self->m_name.c_str(), commitVersion);
	Void _ = wait(self->m_pager->commit());
	debug_printf(" %s: Committed version %lld \n ", self->m_name.c_str(), commitVersion);

	// Everything is committed, so the buffer can go. It should be the oldest buffer in the map.
	ASSERT(bufferStartVersion == self->m_mutationBuffers.begin()->first);
	self->m_mutationBuffers.erase(self->m_mutationBuffers.begin());

	self->m_lastCommittedVersion = commitVersion;
	commitDone.send(Void());

	return Void();
}
// Pager used to allocate logical pages, create page buffers, obtain read snapshots and commit.
IPager * m_pager ;
2017-09-23 08:18:28 +08:00
// Mutation buffer that insertMutationBoundary() adds boundaries to. commit_impl() sets this to
// nullptr when it takes the buffer for committing.
MutationBufferT * m_pBuffer ;
// Mutation buffers keyed by Version. commit_impl() treats the key as the buffer's start version,
// commits the buffer with the greatest key and erases the first entry once the commit is done.
std : : map < Version , MutationBufferT > m_mutationBuffers ;
2017-08-22 13:29:57 +08:00
2017-06-10 05:56:41 +08:00
// Version that commit_impl() commits at; it is passed to the pager's setLatestVersion().
Version m_writeVersion ;
2017-07-26 07:10:19 +08:00
// Set by commit_impl() to the version it committed, after the pager commit has completed.
Version m_lastCommittedVersion ;
2017-09-23 08:18:28 +08:00
// Future of the most recently started commit. Each new commit waits on the previous value
// before it begins and then replaces it with its own future.
Future < Void > m_latestCommit ;
2017-09-06 07:59:31 +08:00
// Page size handed to buildPages(). commitSubtree() also limits value chunks to
// min(255, m_pageSize / 4) bytes.
int m_pageSize ;
2017-09-23 08:18:28 +08:00
// NOTE(review): not used in this part of the file; presumably tracks initialization - confirm.
Future < Void > m_init ;
2017-10-10 04:24:16 +08:00
// Name printed as the prefix of the commit debug messages.
std : : string m_name ;
2017-06-10 05:56:41 +08:00
2018-06-08 18:32:34 +08:00
// InternalCursor is for seeking to and iterating over the internal / low level records in the Btree.
// These records are versioned and they can represent deletions or partial values, so they must be
// post-processed to obtain keys returnable to the user.
2017-09-16 17:09:09 +08:00
class InternalCursor {
2017-09-15 20:19:39 +08:00
public :
InternalCursor ( ) { }
2018-06-08 18:32:34 +08:00
// Creates a cursor over the tree rooted at logical page `root`, reading pages from `pages`.
InternalCursor(Reference<IPagerSnapshot> pages, LogicalPageID root)
  : m_pages(pages),
    m_root(root),
    outOfBound(0)
{
	// Reserve room for a traversal path of up to 6 pages up front.
	m_path.reserve(6);
}
bool valid ( ) const {
2018-06-08 18:32:34 +08:00
return ( outOfBound = = 0 ) & & kvv . valid ( ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-23 08:18:28 +08:00
// Starts a seek for `key` by forwarding to seekLessThanOrEqual_impl() with this cursor.
// The returned future is the one produced by that call.
Future<Void> seekLessThanOrEqual(KeyRef key) {
	Future<Void> seekDone = seekLessThanOrEqual_impl(this, key);
	return seekDone;
}
// Starts a move of the cursor by forwarding `fwd` to move_impl() with this cursor.
// The returned future is the one produced by that call.
Future<Void> move(bool fwd) {
	Future<Void> moveDone = move_impl(this, fwd);
	return moveDone;
}
2018-07-23 18:09:13 +08:00
Standalone < KeyVersionValueRef > kvv ; // The decoded current internal record in the tree
2017-09-15 20:19:39 +08:00
2018-06-12 16:43:19 +08:00
std : : string toString ( const char * wrapPrefix = " " ) const {
std : : string r ;
r + = format ( " InternalCursor(%p) ver=%lld oob=%d valid=%d " , this , m_pages - > getVersion ( ) , outOfBound , valid ( ) ) ;
r + = format ( " \n %s KVV: %s " , wrapPrefix , kvv . toString ( ) . c_str ( ) ) ;
for ( const PageEntryLocation & p : m_path ) {
2018-07-23 18:09:13 +08:00
std : : string cur = p . cursor . valid ( ) ? format ( " '%s' -> '%s' " , p . cursor . getKey ( ) . toHexString ( ) . c_str ( ) , p . cursor . getValueRef ( ) . toHexString ( ) . c_str ( ) ) : " invalid " ;
2018-06-12 16:43:19 +08:00
r + = format ( " \n %s Page %d (%d records, %d bytes) Cursor %s " , wrapPrefix , p . pageNumber , p . btPage - > count , p . btPage - > kvBytes , cur . c_str ( ) ) ;
}
return r ;
2017-09-15 20:19:39 +08:00
}
private :
Reference < IPagerSnapshot > m_pages ;
LogicalPageID m_root ;
struct PageEntryLocation {
2018-06-08 18:32:34 +08:00
PageEntryLocation ( ) { }
2018-07-15 04:37:52 +08:00
PageEntryLocation ( Key lowerBound , Key upperBound , Reference < const IPage > page , LogicalPageID id )
: pageLowerBound ( lowerBound ) , pageUpperBound ( upperBound ) , page ( page ) , pageNumber ( id ) , btPage ( ( BTreePage * ) page - > begin ( ) ) , cursor ( btPage - > tree . getCursor ( pageLowerBound , pageUpperBound ) )
2018-07-10 17:24:01 +08:00
{
}
2017-09-15 20:19:39 +08:00
2018-07-15 04:37:52 +08:00
Key getNextOrUpperBound ( ) {
if ( cursor . moveNext ( ) ) {
Key r = cursor . getKey ( ) ;
cursor . movePrev ( ) ;
return r ;
}
return pageUpperBound ;
}
2018-07-10 17:24:01 +08:00
Key pageLowerBound ;
2018-07-15 04:37:52 +08:00
Key pageUpperBound ;
2017-09-15 20:19:39 +08:00
Reference < const IPage > page ;
2018-06-08 18:32:34 +08:00
BTreePage * btPage ;
PrefixTree : : Cursor cursor ;
2018-07-17 15:41:42 +08:00
// For easier debugging
LogicalPageID pageNumber ;
2017-09-15 20:19:39 +08:00
} ;
typedef std : : vector < PageEntryLocation > TraversalPathT ;
TraversalPathT m_path ;
2018-06-08 18:32:34 +08:00
int outOfBound ;
2017-09-15 20:19:39 +08:00
2018-07-15 04:37:52 +08:00
ACTOR static Future < Void > pushPage ( InternalCursor * self , Key lowerBound , Key upperBound , LogicalPageID id ) {
2017-09-15 20:19:39 +08:00
Reference < const IPage > rawPage = wait ( self - > m_pages - > getPhysicalPage ( id ) ) ;
2018-07-17 15:41:42 +08:00
debug_printf ( " InternalCursor: page read: id=%d ver=%lld lower='%s' upper='%s' \n " , id , self - > m_pages - > getVersion ( ) , lowerBound . toHexString ( ) . c_str ( ) , upperBound . toHexString ( ) . c_str ( ) ) ;
debug_printf ( " InternalCursor: page read: %s \n " , ( ( const BTreePage * ) rawPage - > begin ( ) ) - > toString ( lowerBound , upperBound ) . c_str ( ) ) ;
2018-07-15 04:37:52 +08:00
self - > m_path . emplace_back ( lowerBound , upperBound , rawPage , id ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2017-09-16 17:09:09 +08:00
ACTOR static Future < Void > reset ( InternalCursor * self ) {
2017-09-15 20:19:39 +08:00
if ( self - > m_path . empty ( ) ) {
2018-07-15 04:37:52 +08:00
Void _ = wait ( pushPage ( self , beginKey , endKey , self - > m_root ) ) ;
2017-09-15 20:19:39 +08:00
}
else {
self - > m_path . resize ( 1 ) ;
}
2018-06-12 16:43:19 +08:00
self - > outOfBound = 0 ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2017-09-23 08:18:28 +08:00
ACTOR static Future < Void > seekLessThanOrEqual_impl ( InternalCursor * self , KeyRef key ) {
2017-09-15 20:19:39 +08:00
state TraversalPathT & path = self - > m_path ;
Void _ = wait ( reset ( self ) ) ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): start %s \n " , KeyVersionValue : : unpack ( key ) . toString ( ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
loop {
state PageEntryLocation * p = & path . back ( ) ;
2018-06-08 18:32:34 +08:00
if ( p - > btPage - > count = = 0 ) {
2017-09-15 20:19:39 +08:00
ASSERT ( path . size ( ) = = 1 ) ; // This must be the root page.
2018-06-12 16:43:19 +08:00
self - > outOfBound = - 1 ;
2017-09-15 20:19:39 +08:00
self - > kvv . version = invalidVersion ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Exit, root page empty. %s \n " , KeyVersionValue : : unpack ( key ) . toString ( ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2018-06-08 18:32:34 +08:00
state bool foundLTE = p - > cursor . seekLessThanOrEqual ( key ) ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Seek on path tail, result %d. %s \n " , KeyVersionValue : : unpack ( key ) . toString ( ) . c_str ( ) , foundLTE , self - > toString ( " " ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
if ( p - > btPage - > flags & BTreePage : : IS_LEAF ) {
// It is possible for the current leaf key to be between the page's lower bound (in the parent page) and the
// first record in the leaf page, which means we must move backwards 1 step in the database to find the
// record < key, if such a record exists.
if ( ! foundLTE ) {
2017-09-16 16:59:16 +08:00
Void _ = wait ( self - > move ( false ) ) ;
2017-09-15 20:19:39 +08:00
}
else {
2018-06-08 18:32:34 +08:00
// Found the target record
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( p - > cursor . getKVRef ( ) ) ;
2017-09-15 20:19:39 +08:00
}
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Exit, Found leaf page. %s \n " , KeyVersionValue : : unpack ( key ) . toString ( ) . c_str ( ) , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
else {
2018-06-08 18:32:34 +08:00
// We don't have to check foundLTE here because if it's false then cursor will be at the first record in the page.
// TODO: It would, however, be more efficient to check foundLTE and if false move to the previous sibling page.
// But the page should NOT be empty so let's assert that the cursor is valid.
ASSERT ( p - > cursor . valid ( ) ) ;
2018-07-23 18:09:13 +08:00
LogicalPageID newPage = ( LogicalPageID ) * ( uint32_t * ) p - > cursor . getValueRef ( ) . begin ( ) ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::seekLTE(%s): Found internal page, going to page %d. %s \n " ,
KeyVersionValue : : unpack ( key ) . toString ( ) . c_str ( ) , newPage , self - > toString ( " " ) . c_str ( ) ) ;
2018-07-15 04:37:52 +08:00
Void _ = wait ( pushPage ( self , p - > cursor . getKey ( ) , p - > getNextOrUpperBound ( ) , newPage ) ) ;
2017-09-15 20:19:39 +08:00
}
}
}
// Move one 'internal' key/value/version/valueindex/value record.
2018-07-23 18:09:13 +08:00
// Iterating with this function will "see" all parts of all values and clears at all versions (that is, within the cursor's version of btree pages)
2017-09-16 17:09:09 +08:00
ACTOR static Future < Void > move_impl ( InternalCursor * self , bool fwd ) {
2017-09-15 20:19:39 +08:00
state TraversalPathT & path = self - > m_path ;
state const char * dir = fwd ? " forward " : " backward " ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) start %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// If cursor was out of bound, adjust out of boundness by 1 in the correct direction
if ( self - > outOfBound ! = 0 ) {
self - > outOfBound + = fwd ? 1 : - 1 ;
2018-06-12 16:43:19 +08:00
// If we appear to be inbounds, see if we're off the other end of the db or if the page cursor is valid.
if ( self - > outOfBound = = 0 ) {
if ( ! path . empty ( ) & & path . back ( ) . cursor . valid ( ) ) {
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( path . back ( ) . cursor . getKVRef ( ) ) ;
2018-06-12 16:43:19 +08:00
}
else {
self - > outOfBound = fwd ? 1 : - 1 ;
}
}
debug_printf ( " InternalCursor::move(%s) was out of bound, exiting %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2018-06-08 18:32:34 +08:00
return Void ( ) ;
}
2017-09-15 20:19:39 +08:00
int i = path . size ( ) ;
// Find the closest path part to the end where the index can be moved in the correct direction.
while ( - - i > = 0 ) {
2018-06-08 18:32:34 +08:00
PrefixTree : : Cursor & c = path [ i ] . cursor ;
bool success = fwd ? c . moveNext ( ) : c . movePrev ( ) ;
if ( success ) {
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Move successful on path index %d \n " , dir , i ) ;
2018-06-17 21:48:41 +08:00
path . resize ( i + 1 ) ;
2017-09-15 20:19:39 +08:00
break ;
2018-06-12 16:43:19 +08:00
} else {
debug_printf ( " InternalCursor::move(%s) Move failed on path index %d \n " , dir , i ) ;
2018-06-08 18:32:34 +08:00
}
2017-09-15 20:19:39 +08:00
}
// If no path part could be moved without going out of range then the
// new cursor position is either before the first record or after the last.
2018-06-08 18:32:34 +08:00
// Leave the path steps in place and set outOfBound to 1 or -1 based on fwd.
// This makes the cursor not valid() but a move in the opposite direction
// will make it valid again, pointing to the previous target record.
2017-09-15 20:19:39 +08:00
if ( i < 0 ) {
2018-06-08 18:32:34 +08:00
self - > outOfBound = fwd ? 1 : - 1 ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Passed an end of the database %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
2018-06-17 21:48:41 +08:00
// We were able to advance the cursor on one of the pages in the page traversal path, so now traverse down to leaf level
2017-09-15 20:19:39 +08:00
state PageEntryLocation * p = & ( path . back ( ) ) ;
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s): Descending if needed to find a leaf \n " , dir ) ;
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// Now we must traverse downward if needed until we are at a leaf level.
2017-09-15 20:19:39 +08:00
// Each movement down will start on the far left or far right depending on fwd
2018-06-08 18:32:34 +08:00
while ( ! ( p - > btPage - > flags & BTreePage : : IS_LEAF ) ) {
// Get the page that the path's last entry points to
2018-07-23 18:09:13 +08:00
Void _ = wait ( pushPage ( self , p - > cursor . getKey ( ) , p - > getNextOrUpperBound ( ) , ( LogicalPageID ) * ( uint32_t * ) p - > cursor . getValueRef ( ) . begin ( ) ) ) ;
2017-09-15 20:19:39 +08:00
p = & ( path . back ( ) ) ;
2018-06-08 18:32:34 +08:00
// No page traversed to in this manner should be empty.
ASSERT ( p - > btPage - > count ! = 0 ) ;
// Go to the first or last entry in the page depending on traversal direction
if ( fwd )
p - > cursor . moveFirst ( ) ;
else
p - > cursor . moveLast ( ) ;
2017-09-15 20:19:39 +08:00
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Descended one level %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
}
2017-09-15 20:19:39 +08:00
2018-06-08 18:32:34 +08:00
// Found the target record, unpack it
ASSERT ( p - > cursor . valid ( ) ) ;
2018-07-23 18:09:13 +08:00
self - > kvv = KeyVersionValue : : unpack ( p - > cursor . getKVRef ( ) ) ;
2018-06-08 18:32:34 +08:00
2018-06-12 16:43:19 +08:00
debug_printf ( " InternalCursor::move(%s) Exiting %s \n " , dir , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
}
} ;
// Cursor is for reading and interating over user visible KV pairs at a specific version
2018-06-17 21:48:41 +08:00
class Cursor : public IStoreCursor , public ReferenceCounted < Cursor > , public NonCopyable {
2017-06-10 05:56:41 +08:00
public :
Cursor ( Version version , IPager * pager , LogicalPageID root )
2017-09-16 17:09:09 +08:00
: m_version ( version ) , m_pagerSnapshot ( pager - > getReadSnapshot ( version ) ) , m_icursor ( m_pagerSnapshot , root ) {
2017-06-10 05:56:41 +08:00
}
virtual ~ Cursor ( ) { }
2017-09-17 19:38:01 +08:00
virtual Future < Void > findEqual ( KeyRef key ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , true , 0 ) ; }
virtual Future < Void > findFirstEqualOrGreater ( KeyRef key , bool needValue , int prefetchNextBytes ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , needValue , 1 ) ; }
virtual Future < Void > findLastLessOrEqual ( KeyRef key , bool needValue , int prefetchPriorBytes ) { return find_impl ( Reference < Cursor > : : addRef ( this ) , key , needValue , - 1 ) ; }
2017-09-09 16:29:25 +08:00
2017-09-15 20:19:39 +08:00
virtual Future < Void > next ( bool needValue ) { return next_impl ( Reference < Cursor > : : addRef ( this ) , needValue ) ; }
2017-09-17 19:38:01 +08:00
virtual Future < Void > prev ( bool needValue ) { return prev_impl ( Reference < Cursor > : : addRef ( this ) , needValue ) ; }
2017-06-10 05:56:41 +08:00
virtual bool isValid ( ) {
return m_kv . present ( ) ;
}
virtual KeyRef getKey ( ) {
return m_kv . get ( ) . key ;
}
//virtual StringRef getCompressedKey() = 0;
virtual ValueRef getValue ( ) {
return m_kv . get ( ) . value ;
}
virtual void invalidateReturnedStrings ( ) {
2017-09-15 20:19:39 +08:00
m_pagerSnapshot - > invalidateReturnedPages ( ) ;
2017-06-10 05:56:41 +08:00
}
2017-09-15 20:19:39 +08:00
void addref ( ) { ReferenceCounted < Cursor > : : addref ( ) ; }
void delref ( ) { ReferenceCounted < Cursor > : : delref ( ) ; }
2018-06-12 16:43:19 +08:00
std : : string toString ( const char * wrapPrefix = " " ) const {
std : : string r ;
2018-06-14 19:15:14 +08:00
r + = format ( " Cursor(%p) ver: %lld key: %s value: %s " , this , m_version ,
2018-06-12 16:43:19 +08:00
( m_kv . present ( ) ? m_kv . get ( ) . key . printable ( ) . c_str ( ) : " <np> " ) ,
( m_kv . present ( ) ? m_kv . get ( ) . value . printable ( ) . c_str ( ) : " " ) ) ;
2018-06-14 19:15:14 +08:00
r + = format ( " \n %s InternalCursor: %s " , wrapPrefix , m_icursor . toString ( format ( " %s " , wrapPrefix ) . c_str ( ) ) . c_str ( ) ) ;
2018-06-12 16:43:19 +08:00
return r ;
}
2017-09-15 20:19:39 +08:00
private :
2017-06-10 05:56:41 +08:00
Version m_version ;
2017-09-15 20:19:39 +08:00
Reference < IPagerSnapshot > m_pagerSnapshot ;
2017-09-16 17:09:09 +08:00
InternalCursor m_icursor ;
2017-09-09 16:29:25 +08:00
Optional < KeyValueRef > m_kv ; // The current user-level key/value in the tree
2017-06-10 05:56:41 +08:00
Arena m_arena ;
2017-09-09 16:29:25 +08:00
2018-07-23 18:09:13 +08:00
// find key in tree closest to or equal to key (at this cursor's version)
2017-09-15 20:19:39 +08:00
// for less than or equal use cmp < 0
// for greater than or equal use cmp > 0
// for equal use cmp == 0
2017-09-17 19:38:01 +08:00
ACTOR static Future < Void > find_impl ( Reference < Cursor > self , KeyRef key , bool needValue , int cmp ) {
2017-09-16 17:09:09 +08:00
state InternalCursor & icur = self - > m_icursor ;
2017-09-09 16:29:25 +08:00
2018-07-23 18:09:13 +08:00
// Search for the last key at or before (key, version, \xff)
state Key target = KeyVersionValueRef : : searchKey ( key , self - > m_version ) ;
2017-09-17 19:38:01 +08:00
self - > m_kv = Optional < KeyValueRef > ( ) ;
2017-06-10 05:56:41 +08:00
2018-07-23 18:09:13 +08:00
Void _ = wait ( icur . seekLessThanOrEqual ( target ) ) ;
debug_printf ( " find%sE('%s'): %s \n " , cmp > 0 ? " GT " : ( cmp = = 0 ? " " : " LT " ) , target . toHexString ( 40 ) . c_str ( ) , icur . toString ( ) . c_str ( ) ) ;
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
// If we found the target key, return it as it is valid for any cmp option
2017-09-16 17:09:09 +08:00
if ( icur . valid ( ) & & icur . kvv . value . present ( ) & & icur . kvv . key = = key ) {
debug_printf ( " Reading full kv pair starting from: %s \n " , icur . kvv . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
Void _ = wait ( self - > readFullKVPair ( self ) ) ;
return Void ( ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-15 20:19:39 +08:00
// FindEqual, so if we're still here we didn't find it.
if ( cmp = = 0 ) {
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2017-09-16 16:45:39 +08:00
// FindEqualOrGreaterThan, so if we're here we have to go to the next present record at the target version.
2017-09-15 20:19:39 +08:00
if ( cmp > 0 ) {
2017-09-16 16:45:39 +08:00
// icur is at a record < key, possibly before the start of the tree so move forward at least once.
loop {
2017-09-16 17:09:09 +08:00
Void _ = wait ( icur . move ( true ) ) ;
if ( ! icur . valid ( ) | | icur . kvv . key > key )
2017-09-16 16:45:39 +08:00
break ;
}
// Get the next present key at the target version. Handles invalid cursor too.
2017-09-17 19:38:01 +08:00
Void _ = wait ( self - > next ( needValue ) ) ;
2017-09-09 16:29:25 +08:00
}
2017-09-16 16:45:39 +08:00
else if ( cmp < 0 ) {
2017-09-17 19:38:01 +08:00
// Move to previous present kv pair at the target version
Void _ = wait ( self - > prev ( needValue ) ) ;
2017-09-15 20:19:39 +08:00
}
2017-09-09 16:29:25 +08:00
return Void ( ) ;
}
2017-09-15 20:19:39 +08:00
ACTOR static Future < Void > next_impl ( Reference < Cursor > self , bool needValue ) {
2017-09-17 19:38:01 +08:00
// TODO: use needValue
2017-09-16 17:09:09 +08:00
state InternalCursor & i = self - > m_icursor ;
2017-09-15 20:19:39 +08:00
2018-06-17 21:48:41 +08:00
debug_printf ( " Cursor::next(): cursor %s \n " , i . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
2017-09-16 16:45:39 +08:00
// Make sure we are one record past the last user key
if ( self - > m_kv . present ( ) ) {
2017-09-16 17:09:09 +08:00
while ( i . valid ( ) & & i . kvv . key < = self - > m_kv . get ( ) . key ) {
2018-06-17 21:48:41 +08:00
debug_printf ( " Cursor::next(): Advancing internal cursor to get passed previous returned user key. cursor %s \n " , i . toString ( ) . c_str ( ) ) ;
2017-09-16 17:09:09 +08:00
Void _ = wait ( i . move ( true ) ) ;
2017-09-16 16:45:39 +08:00
}
}
2017-09-16 08:27:13 +08:00
state Version v = self - > m_pagerSnapshot - > getVersion ( ) ;
2017-09-16 17:09:09 +08:00
state InternalCursor iLast ;
2017-09-16 08:27:13 +08:00
while ( 1 ) {
2017-09-16 17:09:09 +08:00
iLast = i ;
if ( ! i . valid ( ) )
2017-09-16 08:27:13 +08:00
break ;
2017-09-16 17:09:09 +08:00
Void _ = wait ( i . move ( true ) ) ;
2018-06-17 21:48:41 +08:00
// If the previous cursor position was a set at a version at or before v and the new cursor position
// is not valid or a newer version of the same key or a different key, then get the full record
// for the previous cursor position
2017-09-16 17:09:09 +08:00
if ( iLast . kvv . version < = v
& & iLast . kvv . value . present ( )
2017-09-16 08:27:13 +08:00
& & (
2017-09-16 17:09:09 +08:00
! i . valid ( )
| | i . kvv . key ! = iLast . kvv . key
| | i . kvv . version > v
2017-09-16 08:27:13 +08:00
)
) {
2017-09-16 16:45:39 +08:00
// Assume that next is the most likely next move, so save the one-too-far cursor position.
2017-09-16 08:27:13 +08:00
std : : swap ( i , iLast ) ;
2018-06-17 21:48:41 +08:00
// readFullKVPair will have to go backwards to read the value
2017-09-16 08:27:13 +08:00
Void _ = wait ( readFullKVPair ( self ) ) ;
std : : swap ( i , iLast ) ;
2017-09-15 20:19:39 +08:00
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
}
2017-09-16 08:27:13 +08:00
self - > m_kv = Optional < KeyValueRef > ( ) ;
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
2017-09-17 19:38:01 +08:00
ACTOR static Future < Void > prev_impl ( Reference < Cursor > self , bool needValue ) {
// TODO: use needValue
state InternalCursor & i = self - > m_icursor ;
2018-06-17 21:48:41 +08:00
debug_printf ( " Cursor::prev(): cursor %s \n " , i . toString ( ) . c_str ( ) ) ;
2017-09-17 19:38:01 +08:00
// Make sure we are one record before the last user key
if ( self - > m_kv . present ( ) ) {
while ( i . valid ( ) & & i . kvv . key > = self - > m_kv . get ( ) . key ) {
Void _ = wait ( i . move ( false ) ) ;
}
}
state Version v = self - > m_pagerSnapshot - > getVersion ( ) ;
while ( i . valid ( ) ) {
// Once we reach a present value at or before v, return or skip it.
if ( i . kvv . version < = v ) {
// If it's present, return it
if ( i . kvv . value . present ( ) ) {
Void _ = wait ( readFullKVPair ( self ) ) ;
return Void ( ) ;
}
// Value wasn't present as of the latest version <= v, so move backward to a new key
state Key clearedKey = i . kvv . key ;
while ( 1 ) {
Void _ = wait ( i . move ( false ) ) ;
if ( ! i . valid ( ) | | i . kvv . key ! = clearedKey )
break ;
}
}
else {
Void _ = wait ( i . move ( false ) ) ;
}
}
self - > m_kv = Optional < KeyValueRef > ( ) ;
return Void ( ) ;
}
2017-09-09 16:29:25 +08:00
// Read all of the current value, if it is split across multiple kv pairs, and set m_kv.
// m_current must be at either the first or the last value part.
ACTOR static Future < Void > readFullKVPair ( Reference < Cursor > self ) {
2017-09-16 17:09:09 +08:00
state KeyVersionValue & kvv = self - > m_icursor . kvv ;
2017-09-09 16:29:25 +08:00
state KeyValueRef & kv = ( self - > m_kv = KeyValueRef ( ) ) . get ( ) ;
2018-07-23 18:09:13 +08:00
ASSERT ( kvv . value . present ( ) ) ;
// Set the key and cursor arena to the arena containing that key
self - > m_arena = kvv . arena ( ) ;
kv . key = kvv . key ;
2017-09-09 16:29:25 +08:00
// Unsplit value
2018-07-23 18:09:13 +08:00
if ( ! kvv . isMultiPart ( ) ) {
kv . value = kvv . value . get ( ) ;
2018-06-14 19:15:14 +08:00
debug_printf ( " readFullKVPair: Unsplit, exit. %s \n " , self - > toString ( " " ) . c_str ( ) ) ;
2017-09-09 16:29:25 +08:00
}
2018-07-23 18:09:13 +08:00
else {
// Figure out if we should go forward or backward to find all the parts
state bool fwd = kvv . valueIndex = = 0 ;
ASSERT ( fwd | | kvv . valueIndex + kvv . value . get ( ) . size ( ) = = kvv . valueTotalSize ) ;
debug_printf ( " readFullKVPair: Split, fwd %d totalsize %d %s \n " , fwd , kvv . valueTotalSize , self - > toString ( " " ) . c_str ( ) ) ;
// Allocate space for the entire value in the same arena as the key
state int bytesLeft = kvv . valueTotalSize ;
kv . value = makeString ( bytesLeft , self - > m_arena ) ;
2017-09-09 16:29:25 +08:00
while ( 1 ) {
2018-07-23 18:09:13 +08:00
debug_printf ( " readFullKVPair: Adding chunk start %d len %d total %d dir %d %s \n " , kvv . valueIndex , kvv . value . get ( ) . size ( ) , kvv . valueTotalSize , fwd , self - > toString ( " " ) . c_str ( ) ) ;
int partSize = kvv . value . get ( ) . size ( ) ;
memcpy ( mutateString ( kv . value ) + kvv . valueIndex , kvv . value . get ( ) . begin ( ) , partSize ) ;
bytesLeft - = partSize ;
if ( bytesLeft = = 0 )
2017-09-09 16:29:25 +08:00
break ;
2018-07-23 18:09:13 +08:00
ASSERT ( bytesLeft > 0 ) ;
Void _ = wait ( self - > m_icursor . move ( fwd ) ) ;
ASSERT ( self - > m_icursor . valid ( ) ) ;
2017-09-09 16:29:25 +08:00
}
}
return Void ( ) ;
2017-06-10 05:56:41 +08:00
}
} ;
} ;
2018-07-23 18:09:13 +08:00
// Definitions of VersionedBTree's static boundary records and their packed keys.
// beginKVV is built from an empty key, version 0 and an empty value; endKVV from the literal key
// below, a version of numeric_limits<int>::max() and an empty value.
// NOTE(review): the end version uses the maximum of int rather than of Version -- confirm this is intended.
KeyVersionValueRef VersionedBTree : : beginKVV ( StringRef ( ) , 0 , StringRef ( ) ) ;
KeyVersionValueRef VersionedBTree : : endKVV ( LiteralStringRef ( " \xff \xff \xff \xff " ) , std : : numeric_limits < int > : : max ( ) , StringRef ( ) ) ;
2018-07-15 04:37:52 +08:00
// beginKey / endKey hold the key part of the packed boundary records.  They are initialized from
// beginKVV / endKVV, so they must stay defined after them in this translation unit.
Key VersionedBTree : : beginKey ( beginKVV . pack ( ) . key ) ;
Key VersionedBTree : : endKey ( endKVV . pack ( ) . key ) ;
2017-08-23 02:30:44 +08:00
2017-10-02 18:32:22 +08:00
// Waits for f and returns its result.  If f fails, the error is first delivered to the `error`
// promise (only when that promise can still be set) and is then rethrown to the caller.
ACTOR template <class T>
Future<T> catchError(Promise<Void> error, Future<T> f) {
	try {
		T value = wait(f);
		return value;
	} catch(Error &e) {
		if(error.canBeSet()) {
			error.sendError(e);
		}
		throw;
	}
}
2017-09-22 14:51:55 +08:00
class KeyValueStoreRedwoodUnversioned : public IKeyValueStore {
2017-09-21 19:43:49 +08:00
public :
2017-09-22 14:51:55 +08:00
KeyValueStoreRedwoodUnversioned ( std : : string filePrefix , UID logID ) : m_filePrefix ( filePrefix ) {
2018-07-04 06:39:32 +08:00
// TODO: These implementation-specific things should really be passed in as arguments, and this class should
// be an IKeyValueStore implementation that wraps IVersionedStore.
2017-09-21 19:43:49 +08:00
m_pager = new IndirectShadowPager ( filePrefix ) ;
2017-10-10 04:24:16 +08:00
m_tree = new VersionedBTree ( m_pager , filePrefix , m_pager - > getUsablePageSize ( ) ) ;
2017-10-02 18:32:22 +08:00
m_init = catchError ( m_error , init_impl ( this ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual Future < Void > init ( ) {
return m_init ;
}
ACTOR Future < Void > init_impl ( KeyValueStoreRedwoodUnversioned * self ) {
2017-09-21 19:43:49 +08:00
Void _ = wait ( self - > m_tree - > init ( ) ) ;
Version v = wait ( self - > m_tree - > getLatestVersion ( ) ) ;
self - > m_tree - > setWriteVersion ( v + 1 ) ;
return Void ( ) ;
}
2017-10-02 18:32:22 +08:00
ACTOR void shutdown ( KeyValueStoreRedwoodUnversioned * self , bool dispose ) {
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdown " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2017-09-23 08:18:28 +08:00
self - > m_init . cancel ( ) ;
2017-10-02 18:32:22 +08:00
delete self - > m_tree ;
2017-09-21 19:43:49 +08:00
Future < Void > closedFuture = self - > m_pager - > onClosed ( ) ;
2017-10-02 18:32:22 +08:00
if ( dispose )
self - > m_pager - > dispose ( ) ;
else
self - > m_pager - > close ( ) ;
2017-09-21 19:43:49 +08:00
Void _ = wait ( closedFuture ) ;
self - > m_closed . send ( Void ( ) ) ;
2018-07-04 20:51:01 +08:00
if ( self - > m_error . canBeSet ( ) ) {
2018-07-05 12:12:09 +08:00
self - > m_error . send ( Never ( ) ) ;
2018-07-04 20:51:01 +08:00
}
2018-07-04 06:39:32 +08:00
TraceEvent ( SevInfo , " RedwoodShutdownComplete " ) . detail ( " FilePrefix " , self - > m_filePrefix ) . detail ( " Dispose " , dispose ) ;
2017-10-02 18:32:22 +08:00
delete self ;
2017-09-21 19:43:49 +08:00
}
virtual void close ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , false ) ;
2017-09-21 19:43:49 +08:00
}
virtual void dispose ( ) {
2017-10-02 18:32:22 +08:00
shutdown ( this , true ) ;
2017-09-21 19:43:49 +08:00
}
virtual Future < Void > onClosed ( ) {
return m_closed . getFuture ( ) ;
}
Future < Void > commit ( bool sequential = false ) {
2017-10-10 04:24:16 +08:00
Future < Void > c = m_tree - > commit ( ) ;
m_tree - > setWriteVersion ( m_tree - > getWriteVersion ( ) + 1 ) ;
return catchError ( m_error , c ) ;
2017-09-21 19:43:49 +08:00
}
virtual KeyValueStoreType getType ( ) {
2017-09-22 14:51:55 +08:00
return KeyValueStoreType : : SSD_REDWOOD_V1 ;
2017-09-21 19:43:49 +08:00
}
virtual StorageBytes getStorageBytes ( ) {
return m_pager - > getStorageBytes ( ) ;
}
2017-10-02 18:32:22 +08:00
virtual Future < Void > getError ( ) { return m_error . getFuture ( ) ; } ;
2017-09-21 19:43:49 +08:00
void clear ( KeyRangeRef range , const Arena * arena = 0 ) {
m_tree - > clear ( range ) ;
}
virtual void set ( KeyValueRef keyValue , const Arena * arena = NULL ) {
2017-09-23 08:18:28 +08:00
//printf("SET write version %lld %s\n", m_tree->getWriteVersion(), printable(keyValue).c_str());
2017-09-21 19:43:49 +08:00
m_tree - > set ( keyValue ) ;
}
2017-09-22 14:51:55 +08:00
ACTOR static Future < Standalone < VectorRef < KeyValueRef > > > readRange_impl ( KeyValueStoreRedwoodUnversioned * self , KeyRangeRef keys , int rowLimit , int byteLimit ) {
Void _ = wait ( self - > m_init ) ;
2017-09-21 19:43:49 +08:00
state Standalone < VectorRef < KeyValueRef > > result ;
state int accumulatedBytes = 0 ;
ASSERT ( byteLimit > 0 ) ;
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2017-10-02 18:32:22 +08:00
state Version readVersion = self - > m_tree - > getLastCommittedVersion ( ) ;
2017-09-21 19:43:49 +08:00
if ( rowLimit > = 0 ) {
Void _ = wait ( cur - > findFirstEqualOrGreater ( keys . begin , true , 0 ) ) ;
while ( cur - > isValid ( ) & & cur - > getKey ( ) < keys . end ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2017-09-23 08:18:28 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit )
2017-09-21 19:43:49 +08:00
break ;
Void _ = wait ( cur - > next ( true ) ) ;
}
} else {
Void _ = wait ( cur - > findLastLessOrEqual ( keys . end , true , 0 ) ) ;
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = keys . end )
Void _ = wait ( cur - > prev ( true ) ) ;
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = keys . begin ) {
KeyValueRef kv ( KeyRef ( result . arena ( ) , cur - > getKey ( ) ) , ValueRef ( result . arena ( ) , cur - > getValue ( ) ) ) ;
accumulatedBytes + = kv . expectedSize ( ) ;
result . push_back ( result . arena ( ) , kv ) ;
2017-09-23 08:18:28 +08:00
if ( - - rowLimit = = 0 | | accumulatedBytes > = byteLimit )
2017-09-21 19:43:49 +08:00
break ;
Void _ = wait ( cur - > prev ( true ) ) ;
}
}
return result ;
}
virtual Future < Standalone < VectorRef < KeyValueRef > > > readRange ( KeyRangeRef keys , int rowLimit = 1 < < 30 , int byteLimit = 1 < < 30 ) {
2017-10-02 18:32:22 +08:00
return catchError ( m_error , readRange_impl ( this , keys , rowLimit , byteLimit ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
ACTOR static Future < Optional < Value > > readValue_impl ( KeyValueStoreRedwoodUnversioned * self , KeyRef key , Optional < UID > debugID ) {
Void _ = wait ( self - > m_init ) ;
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
2017-10-02 18:32:22 +08:00
state Version readVersion = self - > m_tree - > getLastCommittedVersion ( ) ;
2017-09-21 19:43:49 +08:00
Void _ = wait ( cur - > findEqual ( key ) ) ;
2017-10-02 18:32:22 +08:00
if ( cur - > isValid ( ) ) {
2017-09-21 19:43:49 +08:00
return cur - > getValue ( ) ;
2017-10-02 18:32:22 +08:00
}
2017-09-21 19:43:49 +08:00
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValue ( KeyRef key , Optional < UID > debugID = Optional < UID > ( ) ) {
2017-10-02 18:32:22 +08:00
return catchError ( m_error , readValue_impl ( this , key , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
ACTOR static Future < Optional < Value > > readValuePrefix_impl ( KeyValueStoreRedwoodUnversioned * self , KeyRef key , int maxLength , Optional < UID > debugID ) {
Void _ = wait ( self - > m_init ) ;
2017-09-21 19:43:49 +08:00
state Reference < IStoreCursor > cur = self - > m_tree - > readAtVersion ( self - > m_tree - > getLastCommittedVersion ( ) ) ;
Void _ = wait ( cur - > findEqual ( key ) ) ;
if ( cur - > isValid ( ) ) {
Value v = cur - > getValue ( ) ;
int len = std : : min ( v . size ( ) , maxLength ) ;
return Value ( cur - > getValue ( ) . substr ( 0 , len ) ) ;
}
return Optional < Value > ( ) ;
}
virtual Future < Optional < Value > > readValuePrefix ( KeyRef key , int maxLength , Optional < UID > debugID = Optional < UID > ( ) ) {
2017-10-02 18:32:22 +08:00
return catchError ( m_error , readValuePrefix_impl ( this , key , maxLength , debugID ) ) ;
2017-09-21 19:43:49 +08:00
}
2017-09-22 14:51:55 +08:00
virtual ~ KeyValueStoreRedwoodUnversioned ( ) {
2017-09-21 19:43:49 +08:00
} ;
private :
std : : string m_filePrefix ;
IPager * m_pager ;
VersionedBTree * m_tree ;
Future < Void > m_init ;
Promise < Void > m_closed ;
2017-10-02 18:32:22 +08:00
Promise < Void > m_error ;
2017-09-21 19:43:49 +08:00
} ;
2017-09-22 14:51:55 +08:00
// Factory for the Redwood store: returns a newly allocated KeyValueStoreRedwoodUnversioned
// constructed from the given file name and log id.
IKeyValueStore* keyValueStoreRedwoodV1(std::string const &filename, UID logID) {
	IKeyValueStore *store = new KeyValueStoreRedwoodUnversioned(filename, logID);
	return store;
}
2017-07-14 13:11:48 +08:00
// Builds a random key/value pair for tests.  The key and value lengths and every byte are drawn
// from g_random; keySize and valueSize are passed as the second argument of randomInt() for the
// respective length.
// NOTE(review): presumably randomInt's second argument is an exclusive upper bound, which would make
// the longest key keySize - 1 bytes and the longest value valueSize - 1 bytes -- confirm.
KeyValue randomKV ( int keySize = 10 , int valueSize = 5 ) {
// Draw both lengths before any content so the order of random draws stays fixed.
int kLen = g_random - > randomInt ( 1 , keySize ) ;
int vLen = g_random - > randomInt ( 0 , valueSize ) ;
2017-06-10 05:56:41 +08:00
KeyValue kv ;
// Key and value are both allocated in the arena of the returned pair.
kv . key = makeString ( kLen , kv . arena ( ) ) ;
kv . value = makeString ( vLen , kv . arena ( ) ) ;
// Key bytes and value bytes are drawn from two different character ranges.
for ( int i = 0 ; i < kLen ; + + i )
mutateString ( kv . key ) [ i ] = ( uint8_t ) g_random - > randomInt ( ' a ' , ' m ' ) ;
for ( int i = 0 ; i < vLen ; + + i )
mutateString ( kv . value ) [ i ] = ( uint8_t ) g_random - > randomInt ( ' n ' , ' z ' ) ;
return kv ;
}
2017-09-16 16:45:39 +08:00
// Verifies one randomly chosen key range of btree at version v against the expected contents in *written.
//   btree   - tree under test; cursors are obtained from it with readAtVersion(v)
//   v       - version at which the range is read
//   written - expected history keyed by (key, version); entries whose Optional is not present() are
//             never treated as visible by the checks below
// The range is first scanned forward and compared with the written map, saving every matching pair in
// results. It is then scanned in reverse and compared with results. Each mismatch is printed and counted.
// Returns the error count, but throws internal_error() whenever that count is greater than zero.
ACTOR Future < int > verifyRandomRange ( VersionedBTree * btree , Version v , std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > * written ) {
2017-09-15 20:19:39 +08:00
state int errors = 0 ;
2017-09-16 08:27:13 +08:00
// The range boundaries are the keys of two independent random KeyValues.
state Key start = randomKV ( ) . key ;
state Key end = randomKV ( ) . key ;
2017-09-15 20:19:39 +08:00
// Make sure end sorts strictly after start so the range is never empty or inverted.
if ( end < = start )
end = keyAfter ( start ) ;
2017-09-16 16:45:39 +08:00
debug_printf ( " VerifyRange '%s' to '%s' @%lld \n " , printable ( start ) . c_str ( ) , printable ( end ) . c_str ( ) , v ) ;
2017-09-15 20:19:39 +08:00
2017-09-16 16:45:39 +08:00
// i starts at the first written entry not ordered before (start, 0).
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator i = written - > lower_bound ( std : : make_pair ( start . toString ( ) , 0 ) ) ;
2017-09-15 20:19:39 +08:00
// iEnd is the first written entry ordered after (end, 0); it terminates both scans of the written map.
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iEnd = written - > upper_bound ( std : : make_pair ( end . toString ( ) , 0 ) ) ;
2017-09-16 08:27:13 +08:00
// iLast holds the candidate entry currently being compared; i always points at its successor.
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iLast ;
2017-09-15 20:19:39 +08:00
2017-09-17 19:38:01 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( v ) ;
// Randomly use the cursor for something else first.
if ( g_random - > coinflip ( ) ) {
2018-06-14 19:15:14 +08:00
debug_printf ( " VerifyRange: Dummy seek \n " ) ;
2017-09-21 15:58:56 +08:00
state Key randomKey = randomKV ( ) . key ;
Void _ = wait ( g_random - > coinflip ( ) ? cur - > findFirstEqualOrGreater ( randomKey , true , 0 ) : cur - > findLastLessOrEqual ( randomKey , true , 0 ) ) ;
2017-09-17 19:38:01 +08:00
}
2018-06-14 19:15:14 +08:00
debug_printf ( " VerifyRange: Actual seek \n " ) ;
2017-09-17 19:38:01 +08:00
Void _ = wait ( cur - > findFirstEqualOrGreater ( start , true , 0 ) ) ;
// Pairs confirmed by the forward scan, kept for comparison with the reverse scan.
state std : : vector < KeyValue > results ;
2017-09-15 20:19:39 +08:00
// Forward scan: end is exclusive.
while ( cur - > isValid ( ) & & cur - > getKey ( ) < end ) {
// Find the next written kv pair that would be present at this version
// A candidate is accepted when it was written at or before v, holds a value, and its successor is
// iEnd, belongs to a different key, or was written after v.
while ( 1 ) {
iLast = i ;
2017-09-16 08:27:13 +08:00
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
2017-09-15 20:19:39 +08:00
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
// The tree produced a key but the written map has no visible entry left in the range.
if ( iLast = = iEnd ) {
errors + = 1 ;
2017-09-16 16:45:39 +08:00
printf ( " VerifyRange(@%lld, %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-16 08:27:13 +08:00
2017-09-15 20:19:39 +08:00
if ( cur - > getKey ( ) ! = iLast - > first . first ) {
errors + = 1 ;
2017-09-16 16:45:39 +08:00
printf ( " VerifyRange(@%lld, %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , iLast - > first . first . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
if ( cur - > getValue ( ) ! = iLast - > second . get ( ) ) {
errors + = 1 ;
2017-09-16 16:45:39 +08:00
printf ( " VerifyRange(@%lld, %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , iLast - > second . get ( ) . c_str ( ) ) ;
2017-09-15 20:19:39 +08:00
break ;
}
2017-09-17 19:38:01 +08:00
results . push_back ( KeyValue ( KeyValueRef ( cur - > getKey ( ) , cur - > getValue ( ) ) ) ) ;
2017-09-15 20:19:39 +08:00
Void _ = wait ( cur - > next ( true ) ) ;
}
2017-09-16 08:27:13 +08:00
// Make sure there are no further written kv pairs that would be present at this version.
while ( 1 ) {
iLast = i ;
if ( i = = iEnd )
break ;
+ + i ;
if ( iLast - > first . second < = v
& & iLast - > second . present ( )
& & (
i = = iEnd
| | i - > first . first ! = iLast - > first . first
| | i - > first . second > v
)
)
break ;
}
// A visible written entry was left over that the tree scan did not produce.
if ( iLast ! = iEnd ) {
errors + = 1 ;
2017-09-16 16:45:39 +08:00
printf ( " VerifyRange(@%lld, %s, %s) ERROR: Tree range ended but written has @%lld '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , iLast - > first . second , iLast - > first . first . c_str ( ) ) ;
2017-09-16 08:27:13 +08:00
}
2017-09-16 16:45:39 +08:00
2017-09-20 04:03:30 +08:00
debug_printf ( " VerifyRangeReverse '%s' to '%s' @%lld \n " , printable ( start ) . c_str ( ) , printable ( end ) . c_str ( ) , v ) ;
2017-09-17 19:38:01 +08:00
// Randomly use a new cursor for the reverse range read
if ( g_random - > coinflip ( ) ) {
cur = btree - > readAtVersion ( v ) ;
}
// Now read the range from the tree in reverse order and compare to the saved results
Void _ = wait ( cur - > findLastLessOrEqual ( end , true , 0 ) ) ;
// end is exclusive, so step back once if the cursor landed exactly on it.
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = end )
Void _ = wait ( cur - > prev ( true ) ) ;
state std : : vector < KeyValue > : : const_reverse_iterator r = results . rbegin ( ) ;
// Reverse scan: start is inclusive.
while ( cur - > isValid ( ) & & cur - > getKey ( ) > = start ) {
// The tree produced a key but every saved forward result has already been consumed.
if ( r = = results . rend ( ) ) {
errors + = 1 ;
printf ( " VerifyRangeReverse(@%lld, %s, %s) ERROR: Tree key '%s' vs nothing in written map. \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) ) ;
break ;
}
if ( cur - > getKey ( ) ! = r - > key ) {
errors + = 1 ;
printf ( " VerifyRangeReverse(@%lld, %s, %s) ERROR: Tree key '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
break ;
}
if ( cur - > getValue ( ) ! = r - > value ) {
errors + = 1 ;
printf ( " VerifyRangeReverse(@%lld, %s, %s) ERROR: Tree key '%s' has tree value '%s' vs written '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , r - > value . toString ( ) . c_str ( ) ) ;
break ;
}
+ + r ;
Void _ = wait ( cur - > prev ( true ) ) ;
}
// The reverse scan ended before all saved forward results were matched.
if ( r ! = results . rend ( ) ) {
errors + = 1 ;
printf ( " VerifyRangeReverse(@%lld, %s, %s) ERROR: Tree range ended but written has '%s' \n " , v , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , r - > key . toString ( ) . c_str ( ) ) ;
}
// Any mismatch is fatal here, so the return below is only reached with errors equal to 0.
if ( errors > 0 )
throw internal_error ( ) ;
2017-09-15 20:19:39 +08:00
return errors ;
}
TEST_CASE ( " /redwood/correctness " ) {
2017-07-14 13:11:48 +08:00
state bool useDisk = true ;
2018-07-05 12:12:09 +08:00
state std : : string pagerFile = " unittest_pageFile " ;
2017-07-14 13:11:48 +08:00
state IPager * pager ;
if ( useDisk )
2017-09-22 14:51:55 +08:00
pager = new IndirectShadowPager ( pagerFile ) ;
2017-07-14 13:11:48 +08:00
else
pager = createMemoryPager ( ) ;
2017-09-06 07:59:31 +08:00
state int pageSize = g_random - > coinflip ( ) ? pager - > getUsablePageSize ( ) : g_random - > randomInt ( 200 , 400 ) ;
2017-10-10 04:24:16 +08:00
state VersionedBTree * btree = new VersionedBTree ( pager , pagerFile , pageSize ) ;
2017-06-10 05:56:41 +08:00
Void _ = wait ( btree - > init ( ) ) ;
2018-06-12 16:43:19 +08:00
state int maxCommits = 10 ;
state int maxVersionsPerCommit = 4 ;
state int maxChangesPerVersion = 5 ;
2017-09-06 07:59:31 +08:00
// We must be able to fit at least two any two keys plus overhead in a page to prevent
// a situation where the tree cannot be grown upward with decreasing level size.
2018-06-12 16:43:19 +08:00
// TODO: Handle arbitrarily large keys
2018-06-15 08:52:25 +08:00
state int maxKeySize = pageSize / 4 ;
2017-09-06 07:59:31 +08:00
ASSERT ( maxKeySize > 0 ) ;
2018-06-15 08:52:25 +08:00
state int maxValueSize = pageSize * 3 ;
2017-09-06 07:59:31 +08:00
printf ( " Using page size %d, max key size %d, max value size %d \n " , pageSize , maxKeySize , maxValueSize ) ;
2017-08-22 13:29:57 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > written ;
state std : : set < Key > keys ;
2017-06-10 05:56:41 +08:00
2017-09-15 20:19:39 +08:00
state Version lastVer = wait ( btree - > getLatestVersion ( ) ) ;
2017-07-14 13:11:48 +08:00
printf ( " Starting from version: %lld \n " , lastVer ) ;
2017-06-10 05:56:41 +08:00
state Version version = lastVer + 1 ;
2018-07-17 15:41:42 +08:00
state int commits = 1 + g_random - > randomInt ( 0 , maxCommits ) ;
2017-07-14 02:32:14 +08:00
//printf("Will do %d commits\n", commits);
2017-07-26 07:10:19 +08:00
state double insertTime = 0 ;
state int64_t keyBytesInserted = 0 ;
state int64_t ValueBytesInserted = 0 ;
2017-06-10 05:56:41 +08:00
while ( commits - - ) {
2017-07-26 07:10:19 +08:00
state double startTime = now ( ) ;
2018-06-12 16:43:19 +08:00
int versions = g_random - > randomInt ( 1 , maxVersionsPerCommit ) ;
2017-08-22 13:29:57 +08:00
debug_printf ( " Commit will have %d versions \n " , versions ) ;
2017-06-10 05:56:41 +08:00
while ( versions - - ) {
2017-07-15 02:37:08 +08:00
+ + version ;
2017-06-10 05:56:41 +08:00
btree - > setWriteVersion ( version ) ;
2018-06-12 16:43:19 +08:00
int changes = g_random - > randomInt ( 0 , maxChangesPerVersion ) ;
2017-08-22 13:29:57 +08:00
debug_printf ( " Version %lld will have %d changes \n " , version , changes ) ;
2017-06-10 05:56:41 +08:00
while ( changes - - ) {
2017-08-22 13:29:57 +08:00
if ( g_random - > random01 ( ) < .10 ) {
2017-09-15 20:19:39 +08:00
// Delete a random range
2017-08-22 13:29:57 +08:00
Key start = randomKV ( ) . key ;
Key end = randomKV ( ) . key ;
if ( end < = start )
end = keyAfter ( start ) ;
KeyRangeRef range ( start , end ) ;
debug_printf ( " Clear '%s' to '%s' @%lld \n " , start . toString ( ) . c_str ( ) , end . toString ( ) . c_str ( ) , version ) ;
auto w = keys . lower_bound ( start ) ;
auto wEnd = keys . lower_bound ( end ) ;
while ( w ! = wEnd ) {
2017-09-15 20:19:39 +08:00
debug_printf ( " Clearing key '%s' @%lld \n " , w - > toString ( ) . c_str ( ) , version ) ;
2017-08-22 13:29:57 +08:00
written [ std : : make_pair ( w - > toString ( ) , version ) ] = Optional < std : : string > ( ) ;
+ + w ;
}
btree - > clear ( range ) ;
}
else {
2017-09-06 07:59:31 +08:00
KeyValue kv = randomKV ( maxKeySize , maxValueSize ) ;
2017-08-22 13:29:57 +08:00
keyBytesInserted + = kv . key . size ( ) ;
ValueBytesInserted + = kv . value . size ( ) ;
debug_printf ( " Set '%s' -> '%s' @%lld \n " , kv . key . toString ( ) . c_str ( ) , kv . value . toString ( ) . c_str ( ) , version ) ;
btree - > set ( kv ) ;
written [ std : : make_pair ( kv . key . toString ( ) , version ) ] = kv . value . toString ( ) ;
keys . insert ( kv . key ) ;
}
2017-06-10 05:56:41 +08:00
}
}
Void _ = wait ( btree - > commit ( ) ) ;
// Check that all writes can be read at their written versions
2017-08-22 13:29:57 +08:00
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator i = written . cbegin ( ) ;
state std : : map < std : : pair < std : : string , Version > , Optional < std : : string > > : : const_iterator iEnd = written . cend ( ) ;
2017-06-10 05:56:41 +08:00
state int errors = 0 ;
2017-07-26 07:10:19 +08:00
insertTime + = now ( ) - startTime ;
2017-06-10 05:56:41 +08:00
printf ( " Checking changes committed thus far. \n " ) ;
2017-07-26 07:10:19 +08:00
if ( useDisk & & g_random - > random01 ( ) < .1 ) {
2017-07-14 13:11:48 +08:00
printf ( " Reopening disk btree \n " ) ;
2017-10-10 04:24:16 +08:00
delete btree ;
2017-07-14 13:11:48 +08:00
Future < Void > closedFuture = pager - > onClosed ( ) ;
pager - > close ( ) ;
Void _ = wait ( closedFuture ) ;
2017-09-22 14:51:55 +08:00
pager = new IndirectShadowPager ( pagerFile ) ;
2017-07-14 13:11:48 +08:00
2017-10-10 04:24:16 +08:00
btree = new VersionedBTree ( pager , pagerFile , pageSize ) ;
2017-07-14 13:11:48 +08:00
Void _ = wait ( btree - > init ( ) ) ;
2017-09-15 20:19:39 +08:00
Version v = wait ( btree - > getLatestVersion ( ) ) ;
ASSERT ( v = = version ) ;
2017-07-14 13:11:48 +08:00
}
2017-09-15 20:19:39 +08:00
// Read back every key at every version set or cleared and verify the result.
2017-06-10 05:56:41 +08:00
while ( i ! = iEnd ) {
state std : : string key = i - > first . first ;
state Version ver = i - > first . second ;
2017-08-22 13:29:57 +08:00
state Optional < std : : string > val = i - > second ;
2017-06-10 05:56:41 +08:00
state Reference < IStoreCursor > cur = btree - > readAtVersion ( ver ) ;
2017-09-16 16:45:39 +08:00
//debug_printf("Verifying @%lld '%s'\n", ver, key.c_str());
2017-09-15 20:19:39 +08:00
Void _ = wait ( cur - > findEqual ( key ) ) ;
2017-06-10 05:56:41 +08:00
2017-08-22 13:29:57 +08:00
if ( val . present ( ) ) {
if ( ! ( cur - > isValid ( ) & & cur - > getKey ( ) = = key & & cur - > getValue ( ) = = val . get ( ) ) ) {
+ + errors ;
if ( ! cur - > isValid ( ) )
2017-09-16 16:45:39 +08:00
printf ( " Verify ERROR: key_not_found: '%s' -> '%s' @%lld \n " , key . c_str ( ) , val . get ( ) . c_str ( ) , ver ) ;
2017-08-22 13:29:57 +08:00
else if ( cur - > getKey ( ) ! = key )
2017-09-16 16:45:39 +08:00
printf ( " Verify ERROR: key_incorrect: found '%s' expected '%s' @%lld \n " , cur - > getKey ( ) . toString ( ) . c_str ( ) , key . c_str ( ) , ver ) ;
2017-08-22 13:29:57 +08:00
else if ( cur - > getValue ( ) ! = val . get ( ) )
2017-09-16 16:45:39 +08:00
printf ( " Verify ERROR: value_incorrect: for '%s' found '%s' expected '%s' @%lld \n " , cur - > getKey ( ) . toString ( ) . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , val . get ( ) . c_str ( ) , ver ) ;
2017-08-22 13:29:57 +08:00
}
} else {
if ( cur - > isValid ( ) & & cur - > getKey ( ) = = key ) {
+ + errors ;
2017-09-16 16:45:39 +08:00
printf ( " Verify ERROR: cleared_key_found: '%s' -> '%s' @%lld \n " , key . c_str ( ) , cur - > getValue ( ) . toString ( ) . c_str ( ) , ver ) ;
2017-08-22 13:29:57 +08:00
}
2017-06-10 05:56:41 +08:00
}
+ + i ;
}
2017-09-15 20:19:39 +08:00
// For every version written thus far, range read a random range and verify the results.
state Version iVersion = lastVer ;
while ( iVersion < version ) {
2017-09-16 16:45:39 +08:00
int e = wait ( verifyRandomRange ( btree , iVersion , & written ) ) ;
2017-09-15 20:19:39 +08:00
errors + = e ;
+ + iVersion ;
}
2017-06-10 05:56:41 +08:00
printf ( " %d sets, %d errors \n " , ( int ) written . size ( ) , errors ) ;
if ( errors ! = 0 )
throw internal_error ( ) ;
2017-07-26 07:10:19 +08:00
printf ( " Inserted %lld bytes (%lld key, %lld value) in %f seconds. \n " , keyBytesInserted + ValueBytesInserted , keyBytesInserted , ValueBytesInserted , insertTime ) ;
2018-06-08 18:32:34 +08:00
2017-06-10 05:56:41 +08:00
}
2017-07-26 07:10:19 +08:00
printf ( " Inserted %lld bytes (%lld key, %lld value) in %f seconds. \n " , keyBytesInserted + ValueBytesInserted , keyBytesInserted , ValueBytesInserted , insertTime ) ;
2017-06-10 05:56:41 +08:00
2017-07-14 13:11:48 +08:00
Future < Void > closedFuture = pager - > onClosed ( ) ;
pager - > close ( ) ;
Void _ = wait ( closedFuture ) ;
return Void ( ) ;
}
// Write-throughput benchmark for VersionedBTree on top of an IndirectShadowPager.
// Writes `versions` versions, each with a random number of sets below maxChangesPerVersion, committing
// after roughly 1% of the versions and once more at the end. Keys are prefixes of a maxKeySize-byte
// buffer whose first 4 bytes hold a random integer below nodeCount; values are prefixes of a
// maxValueSize-byte buffer. Cumulative and final byte counts and MB/s are printed.
TEST_CASE("/redwood/performance/set") {
	state IPager *pager = new IndirectShadowPager("unittest_pageFile");
	state VersionedBTree *btree = new VersionedBTree(pager, "unittest_pageFile");
	Void _ = wait(btree->init());

	state int nodeCount = 100000;
	state int maxChangesPerVersion = 100;
	state int versions = 5000;
	int maxKeySize = 50;
	int maxValueSize = 500;

	state std::string key(maxKeySize, 'k');
	// The value buffer is sized by maxValueSize so that value lengths are not capped by the key size.
	state std::string value(maxValueSize, 'v');
	state int64_t kvBytes = 0;
	state int records = 0;

	state double startTime = now();
	// Post-decrement so the body runs once for each of the requested versions.
	while(versions--) {
		Version lastVer = wait(btree->getLatestVersion());
		state Version version = lastVer + 1;
		btree->setWriteVersion(version);
		int changes = g_random->randomInt(0, maxChangesPerVersion);

		while(changes--) {
			KeyValue kv;
			// Change first 4 bytes of key to an int
			// memcpy into the string's own storage avoids writing through a const pointer and a
			// type-punned uint32_t store.
			uint32_t nodeId = g_random->randomInt(0, nodeCount);
			memcpy(&key[0], &nodeId, sizeof(nodeId));
			kv.key = StringRef((uint8_t *)key.data(), g_random->randomInt(10, key.size()));
			kv.value = StringRef((uint8_t *)value.data(), g_random->randomInt(0, value.size()));
			btree->set(kv);
			kvBytes += kv.key.size() + kv.value.size();
			++records;
		}

		if(g_random->random01() < .01) {
			Void _ = wait(btree->commit());
			double elapsed = now() - startTime;
			printf("Committed (cumulative) %lld bytes in %d records in %f seconds, %.2f MB/s\n", kvBytes, records, elapsed, kvBytes / elapsed / 1e6);
		}
	}

	Void _ = wait(btree->commit());

	// Release the tree before closing its pager so it is not leaked.
	delete btree;
	Future<Void> closedFuture = pager->onClosed();
	pager->close();
	Void _ = wait(closedFuture);

	double elapsed = now() - startTime;
	printf("Wrote (final) %lld bytes in %d records in %f seconds, %.2f MB/s\n", kvBytes, records, elapsed, kvBytes / elapsed / 1e6);

	return Void();
}