/*
 * TagPartitionedLogSystem.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
# include "flow/actorcompiler.h"
# include "flow/ActorCollection.h"
# include "LogSystem.h"
# include "ServerDBInfo.h"
# include "DBCoreState.h"
# include "WaitFailure.h"
# include "fdbclient/SystemData.h"
# include "fdbrpc/simulator.h"
# include "fdbrpc/Replication.h"
# include "fdbrpc/ReplicationUtils.h"
template <class Collection>
void uniquify( Collection& c ) {
	std::sort(c.begin(), c.end());
	c.resize( std::unique(c.begin(), c.end()) - c.begin() );
}
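
// Adapts a single TLog commit reply for the master: a broken promise means the
// TLog's interface has gone away and is surfaced as master_tlog_failed(), while
// unexpected errors are traced at SevError before being rethrown.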
ACTOR static Future<Void> reportTLogCommitErrors( Future<Void> commitReply, UID debugID ) {
	try {
		Void _ = wait(commitReply);
		return Void();
	} catch (Error& e) {
		if (e.code() == error_code_broken_promise)
			throw master_tlog_failed();
		else if (e.code() != error_code_actor_cancelled && e.code() != error_code_tlog_stopped)
			TraceEvent(SevError, "MasterTLogCommitRequestError", debugID).error(e);
		throw;
	}
}
struct OldLogData {
	std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers;
	int32_t tLogWriteAntiQuorum;
	int32_t tLogReplicationFactor;
	std::vector<LocalityData> tLogLocalities; // Stores the localities of the log servers
	IRepPolicyRef tLogPolicy;
	Version epochEnd;

	OldLogData() : tLogWriteAntiQuorum(0), tLogReplicationFactor(0), epochEnd(0) {}
};
struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted<TagPartitionedLogSystem> {
	UID dbgid;
	int tLogWriteAntiQuorum, tLogReplicationFactor, logSystemType;
	LocalitySetRef logServerSet;
	std::vector<int> logIndexArray;
	std::map<int, LocalityEntry> logEntryMap;
	IRepPolicyRef tLogPolicy;
	std::vector<LocalityData> tLogLocalities;

	// new members
	Future<Void> rejoins;
	Future<Void> recoveryComplete;
	bool recoveryCompleteWrittenToCoreState;
	Optional<Version> epochEndVersion;
	std::set<Tag> epochEndTags;
	Version knownCommittedVersion;
	LocalityData locality;
	std::map<std::pair<int, Tag>, Version> outstandingPops;  // For each currently running popFromLog actor, (log server #, tag) -> popped version
	ActorCollection actors;
	std::vector<OldLogData> oldLogData;
	std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers;

	TagPartitionedLogSystem( UID dbgid, LocalityData locality ) : dbgid(dbgid), locality(locality), actors(false), recoveryCompleteWrittenToCoreState(false), tLogWriteAntiQuorum(0), tLogReplicationFactor(0), logSystemType(0) {}
	virtual void stopRejoins() {
		rejoins = Future<Void>();
	}

	virtual void addref() {
		ReferenceCounted<TagPartitionedLogSystem>::addref();
	}

	virtual void delref() {
		ReferenceCounted<TagPartitionedLogSystem>::delref();
	}

	virtual std::string describe() {
		std::string result;
		for( auto it : logServers ) {
			result = result + it->get().id().toString() + ", ";
		}
		return result;
	}

	virtual UID getDebugID() {
		return dbgid;
	}
	static Future<Void> recoverAndEndEpoch( Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem, UID const& dbgid, DBCoreState const& oldState, FutureStream<TLogRejoinRequest> const& rejoins, LocalityData const& locality ) {
		return epochEnd( outLogSystem, dbgid, oldState, rejoins, locality );
	}
	static Reference<ILogSystem> fromLogSystemConfig( UID const& dbgid, LocalityData const& locality, LogSystemConfig const& lsConf ) {
		ASSERT( lsConf.logSystemType == 2 || (lsConf.logSystemType == 0 && !lsConf.tLogs.size()) );
		//ASSERT(lsConf.epoch == epoch);  //< FIXME
		Reference<TagPartitionedLogSystem> logSystem( new TagPartitionedLogSystem(dbgid, locality) );

		for( int i = 0; i < lsConf.tLogs.size(); i++ )
			logSystem->logServers.push_back( Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( lsConf.tLogs[i] ) ) );

		logSystem->oldLogData.resize(lsConf.oldTLogs.size());
		for( int i = 0; i < lsConf.oldTLogs.size(); i++ ) {
			for( int j = 0; j < lsConf.oldTLogs[i].tLogs.size(); j++ ) {
				logSystem->oldLogData[i].logServers.push_back( Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( lsConf.oldTLogs[i].tLogs[j] ) ) );
			}
			logSystem->oldLogData[i].tLogWriteAntiQuorum = lsConf.oldTLogs[i].tLogWriteAntiQuorum;
			logSystem->oldLogData[i].tLogReplicationFactor = lsConf.oldTLogs[i].tLogReplicationFactor;
			logSystem->oldLogData[i].tLogPolicy = lsConf.oldTLogs[i].tLogPolicy;
			logSystem->oldLogData[i].tLogLocalities = lsConf.oldTLogs[i].tLogLocalities;
			logSystem->oldLogData[i].epochEnd = lsConf.oldTLogs[i].epochEnd;
		}

		//logSystem->epoch = lsConf.epoch;
		logSystem->tLogWriteAntiQuorum = lsConf.tLogWriteAntiQuorum;
		logSystem->tLogReplicationFactor = lsConf.tLogReplicationFactor;
		logSystem->tLogPolicy = lsConf.tLogPolicy;
		logSystem->tLogLocalities = lsConf.tLogLocalities;
		logSystem->logSystemType = lsConf.logSystemType;
		logSystem->UpdateLocalitySet(lsConf.tLogs);

		return logSystem;
	}
	static Reference<ILogSystem> fromOldLogSystemConfig( UID const& dbgid, LocalityData const& locality, LogSystemConfig const& lsConf ) {
		ASSERT( lsConf.logSystemType == 2 || (lsConf.logSystemType == 0 && !lsConf.tLogs.size()) );
		//ASSERT(lsConf.epoch == epoch);  //< FIXME
		Reference<TagPartitionedLogSystem> logSystem( new TagPartitionedLogSystem(dbgid, locality) );

		if( lsConf.oldTLogs.size() ) {
			for( int i = 0; i < lsConf.oldTLogs[0].tLogs.size(); i++ )
				logSystem->logServers.push_back( Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( lsConf.oldTLogs[0].tLogs[i] ) ) );
			//logSystem->epoch = lsConf.epoch;
			logSystem->tLogWriteAntiQuorum = lsConf.oldTLogs[0].tLogWriteAntiQuorum;
			logSystem->tLogReplicationFactor = lsConf.oldTLogs[0].tLogReplicationFactor;
			logSystem->tLogPolicy = lsConf.oldTLogs[0].tLogPolicy;
			logSystem->tLogLocalities = lsConf.oldTLogs[0].tLogLocalities;

			logSystem->oldLogData.resize(lsConf.oldTLogs.size() - 1);
			for( int i = 1; i < lsConf.oldTLogs.size(); i++ ) {
				for( int j = 0; j < lsConf.oldTLogs[i].tLogs.size(); j++ ) {
					logSystem->oldLogData[i-1].logServers.push_back( Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( lsConf.oldTLogs[i].tLogs[j] ) ) );
				}
				logSystem->oldLogData[i-1].tLogWriteAntiQuorum = lsConf.oldTLogs[i].tLogWriteAntiQuorum;
				logSystem->oldLogData[i-1].tLogReplicationFactor = lsConf.oldTLogs[i].tLogReplicationFactor;
				logSystem->oldLogData[i-1].tLogPolicy = lsConf.oldTLogs[i].tLogPolicy;
				logSystem->oldLogData[i-1].tLogLocalities = lsConf.oldTLogs[i].tLogLocalities;
				logSystem->oldLogData[i-1].epochEnd = lsConf.oldTLogs[i].epochEnd;
			}
		}
		logSystem->logSystemType = lsConf.logSystemType;

		return logSystem;
	}
	virtual void toCoreState( DBCoreState& newState ) {
		if( recoveryComplete.isValid() && recoveryComplete.isError() )
			throw recoveryComplete.getError();

		newState.tLogs.clear();
		tLogLocalities.clear();
		for(auto& t : logServers) {
			newState.tLogs.push_back(t->get().id());
			tLogLocalities.push_back(t->get().interf().locality);
		}

		newState.oldTLogData.clear();
		if(!recoveryComplete.isValid() || !recoveryComplete.isReady()) {
			newState.oldTLogData.resize(oldLogData.size());
			for(int i = 0; i < oldLogData.size(); i++) {
				for(auto& t : oldLogData[i].logServers)
					newState.oldTLogData[i].tLogs.push_back(t->get().id());
				newState.oldTLogData[i].tLogWriteAntiQuorum = oldLogData[i].tLogWriteAntiQuorum;
				newState.oldTLogData[i].tLogReplicationFactor = oldLogData[i].tLogReplicationFactor;
				newState.oldTLogData[i].tLogPolicy = oldLogData[i].tLogPolicy;
				newState.oldTLogData[i].tLogLocalities = oldLogData[i].tLogLocalities;
				newState.oldTLogData[i].epochEnd = oldLogData[i].epochEnd;
			}
		}

		newState.tLogWriteAntiQuorum = tLogWriteAntiQuorum;
		newState.tLogReplicationFactor = tLogReplicationFactor;
		newState.tLogPolicy = tLogPolicy;
		newState.tLogLocalities = tLogLocalities;
		newState.logSystemType = logSystemType;
	}
	virtual Future<Void> onCoreStateChanged() {
		ASSERT( recoveryComplete.isValid() );
		if( recoveryComplete.isReady() )
			return Never();
		return recoveryComplete;
	}

	virtual void coreStateWritten( DBCoreState const& newState ) {
		if( !newState.oldTLogData.size() )
			recoveryCompleteWrittenToCoreState = true;
	}

	virtual Future<Void> onError() {
		// Never returns normally, but throws an error if the subsystem stops working
		// FIXME: Run waitFailureClient on the master instead of these onFailedFor?
		if( !logServers.size() ) return Never();
		vector<Future<Void>> failed;
		for( auto& t : logServers )
			if( t->get().present() )
				failed.push_back( waitFailureClient( t->get().interf().waitFailure, SERVER_KNOBS->TLOG_TIMEOUT, -SERVER_KNOBS->TLOG_TIMEOUT/SERVER_KNOBS->SECONDS_BEFORE_NO_FAILURE_DELAY ) );
		ASSERT( failed.size() >= 1 );
		return tagError<Void>( quorum( failed, 1 ), master_tlog_failed() ) || actors.getResult();
	}
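
	// Illustrative example (numbers are not from the source): with five TLogs and
	// tLogWriteAntiQuorum == 1, push() below waits for a quorum of 5 - 1 = 4 commit
	// replies, so a single slow or failed TLog cannot stall commits by itself.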
	virtual Future<Void> push( Version prevVersion, Version version, Version knownCommittedVersion, LogPushData& data, Optional<UID> debugID ) {
		// FIXME: Randomize request order as in LegacyLogSystem?
		vector<Future<Void>> tLogCommitResults;
		for(int loc = 0; loc < logServers.size(); loc++) {
			Future<Void> commitMessage = reportTLogCommitErrors(
					logServers[loc]->get().interf().commit.getReply(
						TLogCommitRequest( data.getArena(), prevVersion, version, knownCommittedVersion, data.getMessages(loc), data.getTags(loc), debugID ), TaskTLogCommitReply ),
					getDebugID());
			actors.add(commitMessage);
			tLogCommitResults.push_back(commitMessage);
		}
		return quorum( tLogCommitResults, tLogCommitResults.size() - tLogWriteAntiQuorum );
	}
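
	// peek() stitches generations together: if begin is already within the current epoch,
	// a single MergedPeekCursor over the live TLogs suffices; otherwise one cursor per
	// older generation is chained via MultiCursor, each bounded by its epoch's end version.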
	virtual Reference<IPeekCursor> peek( Version begin, Tag tag, bool parallelGetMore ) {
		if( oldLogData.size() == 0 || begin >= oldLogData[0].epochEnd ) {
			return Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( logServers, logServers.size() ? bestLocationFor( tag ) : -1,
				(int)logServers.size() + 1 - tLogReplicationFactor, tag, begin, getPeekEnd(), parallelGetMore, tLogLocalities, tLogPolicy, tLogReplicationFactor ) );
		} else {
			std::vector<Reference<ILogSystem::IPeekCursor>> cursors;
			std::vector<LogMessageVersion> epochEnds;
			cursors.push_back( Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( logServers, logServers.size() ? bestLocationFor( tag ) : -1,
				(int)logServers.size() + 1 - tLogReplicationFactor, tag, oldLogData[0].epochEnd, getPeekEnd(), parallelGetMore, tLogLocalities, tLogPolicy, tLogReplicationFactor ) ) );
			for(int i = 0; i < oldLogData.size() && begin < oldLogData[i].epochEnd; i++) {
				cursors.push_back( Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( oldLogData[i].logServers, oldLogData[i].logServers.size() ? oldBestLocationFor( tag, i ) : -1,
					(int)oldLogData[i].logServers.size() + 1 - oldLogData[i].tLogReplicationFactor, tag, i+1 == oldLogData.size() ? begin : std::max(oldLogData[i+1].epochEnd, begin), oldLogData[i].epochEnd, parallelGetMore, oldLogData[i].tLogLocalities, oldLogData[i].tLogPolicy, oldLogData[i].tLogReplicationFactor ) ) );
				epochEnds.push_back(LogMessageVersion(oldLogData[i].epochEnd));
			}
			return Reference<ILogSystem::MultiCursor>( new ILogSystem::MultiCursor(cursors, epochEnds) );
		}
	}
	virtual Reference<IPeekCursor> peekSingle( Version begin, Tag tag ) {
		if( oldLogData.size() == 0 || begin >= oldLogData[0].epochEnd ) {
			return Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( logServers.size() ?
				logServers[bestLocationFor( tag )] :
				Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, begin, getPeekEnd(), false, false ) );
		} else {
			TEST(true); //peekSingle used during non-copying tlog recovery
			std::vector<Reference<ILogSystem::IPeekCursor>> cursors;
			std::vector<LogMessageVersion> epochEnds;
			cursors.push_back( Reference<ILogSystem::ServerPeekCursor>( new ILogSystem::ServerPeekCursor( logServers.size() ?
				logServers[bestLocationFor( tag )] :
				Reference<AsyncVar<OptionalInterface<TLogInterface>>>(), tag, oldLogData[0].epochEnd, getPeekEnd(), false, false ) ) );
			for(int i = 0; i < oldLogData.size() && begin < oldLogData[i].epochEnd; i++) {
				cursors.push_back( Reference<ILogSystem::MergedPeekCursor>( new ILogSystem::MergedPeekCursor( oldLogData[i].logServers, oldLogData[i].logServers.size() ? oldBestLocationFor( tag, i ) : -1,
					(int)oldLogData[i].logServers.size() + 1 - oldLogData[i].tLogReplicationFactor, tag, i+1 == oldLogData.size() ? begin : std::max(oldLogData[i+1].epochEnd, begin), oldLogData[i].epochEnd, false,
					oldLogData[i].tLogLocalities, oldLogData[i].tLogPolicy, oldLogData[i].tLogReplicationFactor ) ) );
				epochEnds.push_back(LogMessageVersion(oldLogData[i].epochEnd));
			}
			return Reference<ILogSystem::MultiCursor>( new ILogSystem::MultiCursor(cursors, epochEnds) );
		}
	}
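
	// outstandingPops doubles as a dedup guard: it records the highest version requested
	// per (log server #, tag) pair, and a popFromLog actor is spawned only when the entry
	// transitions from zero, so at most one pop actor runs per pair.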
	virtual void pop( Version upTo, Tag _tag ) {
		if (!logServers.size() || !upTo) return;
		Tag tag = _tag;
		for(auto log = 0; log < logServers.size(); log++) {
			Version prev = outstandingPops[std::make_pair(log, tag)];
			if (prev < upTo)
				outstandingPops[std::make_pair(log, tag)] = upTo;
			if (prev == 0)
				actors.add( popFromLog(this, log, tag) );
		}
	}
	ACTOR static Future<Void> popFromLog( TagPartitionedLogSystem* self, int log, Tag tag ) {
		state Version last = 0;
		loop {
			Void _ = wait( delay(1.0) );  //< FIXME: knob

			state Version to = self->outstandingPops[ std::make_pair(log, tag) ];

			if (to <= last) {
				self->outstandingPops.erase( std::make_pair(log, tag) );
				return Void();
			}

			try {
				auto& interf = self->logServers[log];
				if( !interf->get().present() )
					return Void();
				Void _ = wait( interf->get().interf().popMessages.getReply( TLogPopRequest(to, tag) ) );

				last = to;
			} catch (Error& e) {
				if (e.code() == error_code_actor_cancelled) throw;
				TraceEvent( (e.code() == error_code_broken_promise) ? SevInfo : SevError, "LogPopError", self->dbgid ).detail("Log", self->logServers[log]->get().id()).error(e);
				return Void();  // Leaving outstandingPops filled in means no further pop requests to this tlog from this logSystem
			}
		}
	}
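
	// Repeatedly re-evaluates which TLogs have confirmed they are running until either
	// the responders satisfy tLogPolicy (possibly excusing up to tLogWriteAntiQuorum
	// non-responders) or a tlog_stopped error shows the epoch has already ended.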
	ACTOR static Future<Void> confirmEpochLive_internal( TagPartitionedLogSystem* self, Optional<UID> debugID ) {
		state vector<Future<Void>> alive;
		for(auto& t : self->logServers) {
			if( t->get().present() ) {
				alive.push_back( brokenPromiseToNever(
					t->get().interf().confirmRunning.getReply( TLogConfirmRunningRequest(debugID),
						TaskTLogConfirmRunningReply ) ) );
			} else {
				alive.push_back( Never() );
			}
		}

		loop {
			LocalityGroup locked;
			std::vector<LocalityData> unlocked, unused;
			for (int i = 0; i < alive.size(); i++) {
				if (alive[i].isReady() && !alive[i].isError()) {
					locked.add( self->tLogLocalities[i] );
				} else {
					unlocked.push_back( self->tLogLocalities[i] );
				}
			}
			bool quorum_obtained = locked.validate( self->tLogPolicy );
			if (!quorum_obtained && self->tLogWriteAntiQuorum != 0) {
				quorum_obtained = !validateAllCombinations( unused, locked, self->tLogPolicy, unlocked, self->tLogWriteAntiQuorum, false );
			}
			if (self->tLogReplicationFactor - self->tLogWriteAntiQuorum == 1 && locked.size() > 0) {
				ASSERT(quorum_obtained);
			}
			if (quorum_obtained) {
				return Void();
			}

			// The responders we have so far weren't enough to form a quorum, so we must
			// wait for more responses and try again.
			std::vector<Future<Void>> changes;
			for (int i = 0; i < alive.size(); i++) {
				if (!alive[i].isReady()) {
					changes.push_back( ready(alive[i]) );
				} else if (alive[i].isReady() && alive[i].isError() &&
				           alive[i].getError().code() == error_code_tlog_stopped) {
					// All commits must go to all TLogs.  If any TLog is stopped, then our epoch has ended.
					return Never();
				}
			}
			ASSERT(changes.size() != 0);
			Void _ = wait( waitForAny(changes) );
		}
	}

	// Returns success after confirming that pushes in the current epoch are still possible.
	virtual Future<Void> confirmEpochLive( Optional<UID> debugID ) {
		return confirmEpochLive_internal(this, debugID);
	}
	virtual Future<Reference<ILogSystem>> newEpoch( vector<WorkerInterface> availableLogServers, DatabaseConfiguration const& config, LogEpoch recoveryCount ) {
		// Call only after end_epoch() has successfully completed.  Returns a new epoch immediately following this one.
		// The new epoch is only provisional until the caller updates the coordinated DBCoreState.
		return newEpoch( Reference<TagPartitionedLogSystem>::addRef(this), availableLogServers, config, recoveryCount );
	}
	virtual LogSystemConfig getLogSystemConfig() {
		LogSystemConfig logSystemConfig;
		logSystemConfig.logSystemType = logSystemType;
		logSystemConfig.tLogWriteAntiQuorum = tLogWriteAntiQuorum;
		logSystemConfig.tLogReplicationFactor = tLogReplicationFactor;
		logSystemConfig.tLogPolicy = tLogPolicy;
		logSystemConfig.tLogLocalities = tLogLocalities;
		for( int i = 0; i < logServers.size(); i++ )
			logSystemConfig.tLogs.push_back(logServers[i]->get());

		if(!recoveryCompleteWrittenToCoreState) {
			for( int i = 0; i < oldLogData.size(); i++ ) {
				logSystemConfig.oldTLogs.push_back(OldTLogConf());
				for( int j = 0; j < oldLogData[i].logServers.size(); j++ ) {
					logSystemConfig.oldTLogs[i].tLogs.push_back(oldLogData[i].logServers[j]->get());
				}
				logSystemConfig.oldTLogs[i].tLogWriteAntiQuorum = oldLogData[i].tLogWriteAntiQuorum;
				logSystemConfig.oldTLogs[i].tLogReplicationFactor = oldLogData[i].tLogReplicationFactor;
				logSystemConfig.oldTLogs[i].tLogPolicy = oldLogData[i].tLogPolicy;
				logSystemConfig.oldTLogs[i].tLogLocalities = oldLogData[i].tLogLocalities;
				logSystemConfig.oldTLogs[i].epochEnd = oldLogData[i].epochEnd;
			}
		}
		return logSystemConfig;
	}
	virtual Standalone<StringRef> getLogsValue() {
		vector<std::pair<UID, NetworkAddress>> logs;
		for( int i = 0; i < logServers.size(); i++ ) {
			logs.push_back(std::make_pair(logServers[i]->get().id(), logServers[i]->get().present() ? logServers[i]->get().interf().address() : NetworkAddress()));
		}
		vector<std::pair<UID, NetworkAddress>> oldLogs;
		if(!recoveryCompleteWrittenToCoreState) {
			for( int i = 0; i < oldLogData.size(); i++ ) {
				for( int j = 0; j < oldLogData[i].logServers.size(); j++ ) {
					oldLogs.push_back(std::make_pair(oldLogData[i].logServers[j]->get().id(), oldLogData[i].logServers[j]->get().present() ? oldLogData[i].logServers[j]->get().interf().address() : NetworkAddress()));
				}
			}
		}
		return logsValue( logs, oldLogs );
	}
	virtual Future<Void> onLogSystemConfigChange() {
		std::vector<Future<Void>> changes;
		changes.push_back(Never());
		for( int i = 0; i < logServers.size(); i++ )
			changes.push_back( logServers[i]->onChange() );
		for( int i = 0; i < oldLogData.size(); i++ ) {
			for( int j = 0; j < oldLogData[i].logServers.size(); j++ ) {
				changes.push_back( oldLogData[i].logServers[j]->onChange() );
			}
		}
		return waitForAny(changes);
	}
	virtual int getLogServerCount() { return logServers.size(); }

	virtual Version getEnd() {
		ASSERT( epochEndVersion.present() );
		return epochEndVersion.get() + 1;
	}
	Version getPeekEnd() {
		if (epochEndVersion.present())
			return getEnd();
		else
			return std::numeric_limits<Version>::max();
	}
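
	// Tags map to TLogs by simple modulo, so a given tag always prefers the same log
	// server within a generation.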
	int bestLocationFor( Tag tag ) {
		return tag % logServers.size();
	}

	int oldBestLocationFor( Tag tag, int idx ) {
		return tag % oldLogData[idx].logServers.size();
	}
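
	// getPushLocations() seeds the result with each tag's best location, then asks the
	// replication policy to select whatever additional servers are needed so the final
	// set of push destinations satisfies tLogPolicy.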
	virtual void getPushLocations( std::vector<Tag> const& tags, std::vector<int>& locations ) {
		// Ensure that the replication server set and replication policy have been defined
		ASSERT(logServerSet.getPtr());
		ASSERT(tLogPolicy.getPtr());

		std::vector<LocalityEntry> alsoServers, resultEntries;
		for (auto& t : tags) {
			locations.push_back(bestLocationFor(t));
		}
		uniquify(locations);

		if (locations.size())
			alsoServers.reserve(locations.size());

		// Convert locations to the also servers
		for (auto location : locations) {
			ASSERT(logEntryMap[location]._id == location);
			alsoServers.push_back(logEntryMap[location]);
		}

		// Run the policy; assert if unable to satisfy it
		bool result = logServerSet->selectReplicas(tLogPolicy, alsoServers, resultEntries);
		ASSERT(result);

		// Add the new servers to the location array
		LocalityMap<int>* logServerMap = (LocalityMap<int>*)logServerSet.getPtr();
		for (auto entry : resultEntries) {
			locations.push_back(*logServerMap->getObject(entry));
		}
		//TraceEvent("getPushLocations").detail("Policy", tLogPolicy->info())
		//	.detail("Results", locations.size()).detail("Selection", logServerSet->size())
		//	.detail("Included", alsoServers.size()).detail("Duration", timer() - t);
	}
	void UpdateLocalitySet( vector<OptionalInterface<TLogInterface>> const& tlogs ) {
		LocalityMap<int>* logServerMap;

		logServerSet = LocalitySetRef(new LocalityMap<int>());
		logServerMap = (LocalityMap<int>*)logServerSet.getPtr();

		logEntryMap.clear();
		logIndexArray.clear();
		logIndexArray.reserve(tlogs.size());

		for( int i = 0; i < tlogs.size(); i++ ) {
			if (tlogs[i].present()) {
				logIndexArray.push_back(i);
				ASSERT(logEntryMap.find(i) == logEntryMap.end());
				logEntryMap[logIndexArray.back()] = logServerMap->add(tlogs[i].interf().locality, &logIndexArray.back());
			}
		}
	}
	void UpdateLocalitySet( vector<WorkerInterface> const& workers, vector<InitializeTLogRequest> const& reqs ) {
		LocalityMap<int>* logServerMap;

		logServerSet = LocalitySetRef(new LocalityMap<int>());
		logServerMap = (LocalityMap<int>*)logServerSet.getPtr();

		logEntryMap.clear();
		logIndexArray.clear();
		logIndexArray.reserve(workers.size());

		for( int i = 0; i < workers.size(); i++ ) {
			ASSERT(logEntryMap.find(i) == logEntryMap.end());
			logIndexArray.push_back(i);
			logEntryMap[logIndexArray.back()] = logServerMap->add(workers[i].locality, &logIndexArray.back());
		}
	}
	std::set<Tag> const& getEpochEndTags() const { return epochEndTags; }
	ACTOR static Future<Void> monitorLog( Reference<AsyncVar<OptionalInterface<TLogInterface>>> logServer, Reference<AsyncVar<bool>> failed ) {
		state Future<Void> waitFailure;
		loop {
			if( logServer->get().present() )
				waitFailure = waitFailureTracker( logServer->get().interf().waitFailure, failed );
			else
				failed->set(true);
			Void _ = wait( logServer->onChange() );
		}
	}
	ACTOR static Future<Void> epochEnd( Reference<AsyncVar<Reference<ILogSystem>>> outLogSystem, UID dbgid, DBCoreState prevState, FutureStream<TLogRejoinRequest> rejoinRequests, LocalityData locality ) {
		// Stops a co-quorum of tlogs so that no further versions can be committed until the DBCoreState coordination state is changed.
		// Creates a new logSystem representing the (now frozen) epoch.
		// No other important side effects.
		// The writeQuorum in the master info is from the previous configuration.
		state vector<Future<TLogLockResult>> tLogReply( prevState.tLogs.size() );
		if (!prevState.tLogs.size()) {
			// This is a brand new database
			Reference<TagPartitionedLogSystem> logSystem( new TagPartitionedLogSystem(dbgid, locality) );
			logSystem->tLogWriteAntiQuorum = prevState.tLogWriteAntiQuorum;
			logSystem->tLogReplicationFactor = prevState.tLogReplicationFactor;
			logSystem->tLogPolicy = prevState.tLogPolicy;
			logSystem->tLogLocalities = prevState.tLogLocalities;
			logSystem->logSystemType = prevState.logSystemType;
			logSystem->epochEndVersion = 0;
			logSystem->knownCommittedVersion = 0;
			outLogSystem->set(logSystem);
			Void _ = wait( Future<Void>(Never()) );
			throw internal_error();
		}
		TEST( true );  // Master recovery from pre-existing database

		// To ensure consistent recovery, the number of servers NOT in the write quorum plus the number of servers NOT in the read quorum
		// have to be strictly less than the replication factor.  Otherwise there could be a replica set consisting entirely of servers that
		// are out of date due to not being in the write quorum or unavailable due to not being in the read quorum.
		// So with N = # of tlogs, W = antiquorum, R = required count, F = replication factor,
		// W + (N - R) < F, and optimally W + (N - R) = F - 1.  Thus R = N + 1 - F + W.
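
		// Illustrative example (numbers are not from the source): with N = 5 tlogs,
		// F = 3 and W = 0, R = 5 + 1 - 3 + 0 = 3.  Locking any 3 tlogs leaves only 2
		// unlocked, so every possible size-3 replica set contains at least one locked,
		// up-to-date server.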
		state int requiredCount = (int)prevState.tLogs.size() + 1 - prevState.tLogReplicationFactor + prevState.tLogWriteAntiQuorum;
		ASSERT( requiredCount > 0 && requiredCount <= prevState.tLogs.size() );
		ASSERT( prevState.tLogReplicationFactor >= 1 && prevState.tLogReplicationFactor <= prevState.tLogs.size() );
		ASSERT( prevState.tLogWriteAntiQuorum >= 0 && prevState.tLogWriteAntiQuorum < prevState.tLogs.size() );
		// trackRejoins listens for rejoin requests from the tLogs that we are recovering from, to learn their TLogInterfaces
		state std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers;
		state std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> allLogServers;
		state std::vector<OldLogData> oldLogData;
		state std::vector<Reference<AsyncVar<bool>>> logFailed;
		state std::vector<Future<Void>> failureTrackers;

		for( int i = 0; i < prevState.tLogs.size(); i++ ) {
			Reference<AsyncVar<OptionalInterface<TLogInterface>>> logVar = Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( OptionalInterface<TLogInterface>(prevState.tLogs[i]) ) );
			logServers.push_back( logVar );
			allLogServers.push_back( logVar );
			logFailed.push_back( Reference<AsyncVar<bool>>( new AsyncVar<bool>() ) );
			failureTrackers.push_back( monitorLog(logServers[i], logFailed[i]) );
		}
		for( int i = 0; i < prevState.oldTLogData.size(); i++ ) {
			oldLogData.push_back( OldLogData() );
			for( int j = 0; j < prevState.oldTLogData[i].tLogs.size(); j++ ) {
				Reference<AsyncVar<OptionalInterface<TLogInterface>>> logVar = Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( OptionalInterface<TLogInterface>(prevState.oldTLogData[i].tLogs[j]) ) );
				oldLogData[i].logServers.push_back( logVar );
				allLogServers.push_back( logVar );
			}
			oldLogData[i].tLogReplicationFactor = prevState.oldTLogData[i].tLogReplicationFactor;
			oldLogData[i].tLogWriteAntiQuorum = prevState.oldTLogData[i].tLogWriteAntiQuorum;
			oldLogData[i].epochEnd = prevState.oldTLogData[i].epochEnd;
			oldLogData[i].tLogPolicy = prevState.oldTLogData[i].tLogPolicy;
			oldLogData[i].tLogLocalities = prevState.oldTLogData[i].tLogLocalities;
		}

		state Future<Void> rejoins = trackRejoins( dbgid, allLogServers, rejoinRequests );
		state bool buggify_lock_minimal_tlogs = BUGGIFY;
		if (!buggify_lock_minimal_tlogs) {
			for(int t = 0; t < logServers.size(); t++) {
				tLogReply[t] = lockTLog( dbgid, logServers[t] );
			}
		}
		state Optional<Version> last_end;

		state bool lastWaitForRecovery = true;
		state int cycles = 0;

		loop {
			if (buggify_lock_minimal_tlogs) {
				lockMinimalTLogSet( dbgid, prevState, logServers, logFailed, &tLogReply );
			}
			std::vector<LocalityData> availableItems, badCombo;
			std::vector<TLogLockResult> results;
			std::string sServerState;
			LocalityGroup unResponsiveSet;
			double t = timer();
			cycles++;
			for(int t = 0; t < logServers.size(); t++) {
				if (tLogReply[t].isValid() && tLogReply[t].isReady() && !tLogReply[t].isError() && !logFailed[t]->get()) {
					results.push_back(tLogReply[t].get());
					availableItems.push_back(prevState.tLogLocalities[t]);
					sServerState += 'a';
				}
				else {
					unResponsiveSet.add(prevState.tLogLocalities[t]);
					sServerState += 'f';
				}
			}
			// There are too many failures if the number of results does not exceed the anti-quorum
			bool bTooManyFailures = (results.size() <= prevState.tLogWriteAntiQuorum);

			// Check if the failed logs complete the policy
			bTooManyFailures = bTooManyFailures ||
				(( unResponsiveSet.size() >= prevState.tLogReplicationFactor ) &&
				 ( unResponsiveSet.validate(prevState.tLogPolicy) ));

			// Check all combinations of the AntiQuorum within the failed
			if ((!bTooManyFailures) &&
				(prevState.tLogWriteAntiQuorum) &&
				(!validateAllCombinations(badCombo, unResponsiveSet, prevState.tLogPolicy, availableItems, prevState.tLogWriteAntiQuorum, false)))
			{
				TraceEvent("EpochEndBadCombo", dbgid).detail("Cycles", cycles)
					.detail("Present", results.size())
					.detail("Available", availableItems.size())
					.detail("Absent", logServers.size() - results.size())
					.detail("ServerState", sServerState)
					.detail("ReplicationFactor", prevState.tLogReplicationFactor)
					.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
					.detail("Policy", prevState.tLogPolicy->info())
					.detail("TooManyFailures", bTooManyFailures)
					.detail("LogZones", ::describeZones(prevState.tLogLocalities))
					.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
				bTooManyFailures = true;
			}

			// If too many TLogs are failed for recovery to be possible, we could wait forever here.
			//Void _ = wait( smartQuorum( tLogReply, requiredCount, SERVER_KNOBS->RECOVERY_TLOG_SMART_QUORUM_DELAY ) || rejoins );

			ASSERT(logServers.size() == tLogReply.size());
			if (!bTooManyFailures) {
				std::sort( results.begin(), results.end(), sort_by_end() );
				int absent = logServers.size() - results.size();
				int safe_range_begin = prevState.tLogWriteAntiQuorum;
				int new_safe_range_begin = std::min(prevState.tLogWriteAntiQuorum, (int)(results.size()-1));
				int safe_range_end = prevState.tLogReplicationFactor - absent;

				Version end = results[ new_safe_range_begin ].end;
				Version knownCommittedVersion = end - (g_network->isSimulated() ? 10*SERVER_KNOBS->VERSIONS_PER_SECOND : SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS); //In simulation this must be the maximum MAX_READ_TRANSACTION_LIFE_VERSIONS
				for(int i = 0; i < results.size(); i++) {
					knownCommittedVersion = std::max(knownCommittedVersion, results[i].knownCommittedVersion);
				}
				if ((prevState.logSystemType == 2 && (!last_end.present() || ((safe_range_end > 0) && (safe_range_end-1 < results.size()) && results[safe_range_end-1].end < last_end.get())))) {
					TEST(last_end.present());  // Restarting recovery at an earlier point
					Reference<TagPartitionedLogSystem> logSystem( new TagPartitionedLogSystem(dbgid, locality) );

					TraceEvent("LogSystemRecovery", dbgid).detail("Cycles", cycles)
						.detail("TotalServers", logServers.size())
						.detail("Present", results.size())
						.detail("Available", availableItems.size())
						.detail("Absent", logServers.size() - results.size())
						.detail("ServerState", sServerState)
						.detail("ReplicationFactor", prevState.tLogReplicationFactor)
						.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
						.detail("Policy", prevState.tLogPolicy->info())
						.detail("TooManyFailures", bTooManyFailures)
						.detail("LastVersion", (last_end.present()) ? last_end.get() : -1L)
						.detail("RecoveryVersion", ((safe_range_end > 0) && (safe_range_end-1 < results.size())) ? results[ safe_range_end-1 ].end : -1)
						.detail("EndVersion", end)
						.detail("SafeBegin", safe_range_begin)
						.detail("SafeEnd", safe_range_end)
						.detail("NewSafeBegin", new_safe_range_begin)
						.detail("LogZones", ::describeZones(prevState.tLogLocalities))
						.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities))
						.detail("tLogs", (int)prevState.tLogs.size())
						.detail("oldTlogsSize", (int)prevState.oldTLogData.size())
						.detail("logSystemType", prevState.logSystemType)
						.detail("At", end).detail("AvailableServers", results.size())
						.detail("knownCommittedVersion", knownCommittedVersion);

					last_end = end;
					logSystem->logServers = logServers;
					logSystem->oldLogData = oldLogData;
					logSystem->tLogReplicationFactor = prevState.tLogReplicationFactor;
					logSystem->tLogWriteAntiQuorum = prevState.tLogWriteAntiQuorum;
					logSystem->tLogPolicy = prevState.tLogPolicy;
					logSystem->tLogLocalities = prevState.tLogLocalities;
					logSystem->logSystemType = prevState.logSystemType;
					logSystem->rejoins = rejoins;
					logSystem->epochEndVersion = end;
					logSystem->knownCommittedVersion = knownCommittedVersion;
					for(auto& r : results)
						logSystem->epochEndTags.insert( r.tags.begin(), r.tags.end() );
					outLogSystem->set(logSystem);
				}
				else {
					TraceEvent("LogSystemUnchangedRecovery", dbgid).detail("Cycles", cycles)
						.detail("TotalServers", logServers.size())
						.detail("Present", results.size())
						.detail("Available", availableItems.size())
						.detail("Absent", logServers.size() - results.size())
						.detail("ServerState", sServerState)
						.detail("ReplicationFactor", prevState.tLogReplicationFactor)
						.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
						.detail("Policy", prevState.tLogPolicy->info())
						.detail("TooManyFailures", bTooManyFailures)
						.detail("LastVersion", (last_end.present()) ? last_end.get() : -1L)
						.detail("RecoveryVersion", ((safe_range_end > 0) && (safe_range_end-1 < results.size())) ? results[ safe_range_end-1 ].end : -1)
						.detail("EndVersion", end)
						.detail("SafeBegin", safe_range_begin)
						.detail("SafeEnd", safe_range_end)
						.detail("NewSafeBegin", new_safe_range_begin)
						.detail("LogZones", ::describeZones(prevState.tLogLocalities))
						.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
				}
			}
			// Too many failures
			else {
				TraceEvent("LogSystemWaitingForRecovery", dbgid).detail("Cycles", cycles)
					.detail("AvailableServers", results.size())
					.detail("TotalServers", logServers.size())
					.detail("Present", results.size())
					.detail("Available", availableItems.size())
					.detail("Absent", logServers.size() - results.size())
					.detail("ServerState", sServerState)
					.detail("ReplicationFactor", prevState.tLogReplicationFactor)
					.detail("AntiQuorum", prevState.tLogWriteAntiQuorum)
					.detail("Policy", prevState.tLogPolicy->info())
					.detail("TooManyFailures", bTooManyFailures)
					.detail("LogZones", ::describeZones(prevState.tLogLocalities))
					.detail("LogDataHalls", ::describeDataHalls(prevState.tLogLocalities));
			}
			// Wait for anything relevant to change
			std::vector<Future<Void>> changes;
			for(int i = 0; i < logServers.size(); i++) {
				if (tLogReply[i].isValid() && !tLogReply[i].isReady()) {
					changes.push_back( ready(tLogReply[i]) );
					if (buggify_lock_minimal_tlogs) {
						changes.push_back( logFailed[i]->onChange() );
					}
				} else {
					changes.push_back( logFailed[i]->onChange() );
					changes.push_back( logServers[i]->onChange() );
				}
			}
			ASSERT(changes.size());
			Void _ = wait(waitForAny(changes));
		}
	}
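
	// Recruits the next generation of TLogs: the old generation is snapshotted into
	// oldLogData, each worker gets an InitializeTLogRequest describing where to recover
	// from, recovery tags are assigned via the locality-aware push locations, and
	// recoveryComplete becomes ready only after every new TLog acknowledges
	// TLogRecoveryFinishedRequest.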
	ACTOR static Future<Reference<ILogSystem>> newEpoch( Reference<TagPartitionedLogSystem> oldLogSystem, vector<WorkerInterface> workers, DatabaseConfiguration configuration, LogEpoch recoveryCount ) {
		state double startTime = now();
		state Reference<TagPartitionedLogSystem> logSystem( new TagPartitionedLogSystem(oldLogSystem->getDebugID(), oldLogSystem->locality) );
		state UID recruitmentID = g_random->randomUniqueID();
		logSystem->tLogWriteAntiQuorum = configuration.tLogWriteAntiQuorum;
		logSystem->tLogReplicationFactor = configuration.tLogReplicationFactor;
		logSystem->tLogPolicy = configuration.tLogPolicy;
		logSystem->logSystemType = 2;

		if(oldLogSystem->logServers.size()) {
			logSystem->oldLogData.push_back(OldLogData());
			logSystem->oldLogData[0].tLogWriteAntiQuorum = oldLogSystem->tLogWriteAntiQuorum;
			logSystem->oldLogData[0].tLogReplicationFactor = oldLogSystem->tLogReplicationFactor;
			logSystem->oldLogData[0].tLogPolicy = oldLogSystem->tLogPolicy;
			logSystem->oldLogData[0].tLogLocalities = oldLogSystem->tLogLocalities;
			logSystem->oldLogData[0].epochEnd = oldLogSystem->knownCommittedVersion + 1;
			logSystem->oldLogData[0].logServers = oldLogSystem->logServers;
		}

		for(int i = 0; i < oldLogSystem->oldLogData.size(); i++) {
			logSystem->oldLogData.push_back(oldLogSystem->oldLogData[i]);
		}

		state vector<Future<TLogInterface>> initializationReplies;
		vector<InitializeTLogRequest> reqs( workers.size() );
		for( int i = 0; i < workers.size(); i++ ) {
			InitializeTLogRequest &req = reqs[i];
			req.recruitmentID = recruitmentID;
			req.storeType = configuration.tLogDataStoreType;
			req.recoverFrom = oldLogSystem->getLogSystemConfig();
			req.recoverAt = oldLogSystem->epochEndVersion.get();
			req.knownCommittedVersion = oldLogSystem->knownCommittedVersion;
			req.epoch = recoveryCount;
			TraceEvent("TLogInitializeRequest").detail("address", workers[i].tLog.getEndpoint().address);
		}

		logSystem->tLogLocalities.resize( workers.size() );
		logSystem->logServers.resize( workers.size() );  // Dummy interfaces, so that logSystem->getPushLocations() below uses the correct size

		// Send requests array (reqs) also
		logSystem->UpdateLocalitySet( workers, reqs );

		std::vector<int> locations;
		for( Tag tag : oldLogSystem->getEpochEndTags() ) {
			locations.clear();
			logSystem->getPushLocations( vector<Tag>(1, tag), locations );
			for(int loc : locations)
				reqs[ loc ].recoverTags.push_back( tag );
		}

		for( int i = 0; i < workers.size(); i++ )
			initializationReplies.push_back( transformErrors( throwErrorOr( workers[i].tLog.getReplyUnlessFailedFor( reqs[i], SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) );

		Void _ = wait( waitForAll( initializationReplies ) );

		for( int i = 0; i < initializationReplies.size(); i++ ) {
			logSystem->logServers[i] = Reference<AsyncVar<OptionalInterface<TLogInterface>>>( new AsyncVar<OptionalInterface<TLogInterface>>( OptionalInterface<TLogInterface>(initializationReplies[i].get()) ) );
			logSystem->tLogLocalities[i] = workers[i].locality;
		}

		//Don't force failure of recovery if it took us a long time to recover. This avoids multiple long running recoveries causing tests to timeout
		if (BUGGIFY && now() - startTime < 300 && g_network->isSimulated() && g_simulator.speedUpSimulation) throw master_recovery_failed();

		std::vector<Future<Void>> recoveryComplete;
		for( int i = 0; i < logSystem->logServers.size(); i++ )
			recoveryComplete.push_back( transformErrors( throwErrorOr( logSystem->logServers[i]->get().interf().recoveryFinished.getReplyUnlessFailedFor( TLogRecoveryFinishedRequest(), SERVER_KNOBS->TLOG_TIMEOUT, SERVER_KNOBS->MASTER_FAILURE_SLOPE_DURING_RECOVERY ) ), master_recovery_failed() ) );
		logSystem->recoveryComplete = waitForAll( recoveryComplete );

		return logSystem;
	}
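
	// Listens for TLogRejoinRequests, updates the matching interface AsyncVar when a
	// known TLog reappears at a new endpoint, and answers the previously outstanding
	// rejoin promise so each TLog keeps exactly one rejoin request in flight.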
	ACTOR static Future<Void> trackRejoins( UID dbgid, std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>> logServers, FutureStream<struct TLogRejoinRequest> rejoinRequests ) {
		state std::map<UID, ReplyPromise<bool>> lastReply;

		try {
			loop {
				TLogRejoinRequest req = waitNext( rejoinRequests );
				int pos = -1;
				for( int i = 0; i < logServers.size(); i++ ) {
					if( logServers[i]->get().id() == req.myInterface.id() ) {
						pos = i;
						break;
					}
				}
				if ( pos != -1 ) {
					TraceEvent("TLogJoinedMe", dbgid).detail("TLog", req.myInterface.id()).detail("Address", req.myInterface.commit.getEndpoint().address.toString());
					if( !logServers[pos]->get().present() || req.myInterface.commit.getEndpoint() != logServers[pos]->get().interf().commit.getEndpoint() )
						logServers[pos]->setUnconditional( OptionalInterface<TLogInterface>(req.myInterface) );
					lastReply[req.myInterface.id()].send(false);
					lastReply[req.myInterface.id()] = req.reply;
				}
				else {
					TraceEvent("TLogJoinedMeUnknown", dbgid).detail("TLog", req.myInterface.id()).detail("Address", req.myInterface.commit.getEndpoint().address.toString());
					req.reply.send(true);
				}
			}
		} catch (...) {
			for( auto it = lastReply.begin(); it != lastReply.end(); ++it )
				it->second.send(true);
			throw;
		}
	}
	ACTOR static Future<TLogLockResult> lockTLog( UID myID, Reference<AsyncVar<OptionalInterface<TLogInterface>>> tlog ) {
		TraceEvent("TLogLockStarted", myID).detail("TLog", tlog->get().id());
		loop {
			choose {
				when( TLogLockResult data = wait( tlog->get().present() ? brokenPromiseToNever( tlog->get().interf().lock.getReply<TLogLockResult>() ) : Never() ) ) {
					TraceEvent("TLogLocked", myID).detail("TLog", tlog->get().id()).detail("end", data.end);
					return data;
				}
				when( Void _ = wait( tlog->onChange() ) ) { }
			}
		}
	}
	static void lockMinimalTLogSet( const UID& dbgid, const DBCoreState& prevState,
	                                const std::vector<Reference<AsyncVar<OptionalInterface<TLogInterface>>>>& logServers,
	                                const std::vector<Reference<AsyncVar<bool>>>& logFailed,
	                                vector<Future<TLogLockResult>>* tLogReply ) {
		// Invariant: tLogReply[i] must correspond to the tlog stored as logServers[i].
		ASSERT(tLogReply->size() == prevState.tLogLocalities.size());
		ASSERT(logFailed.size() == tLogReply->size());

		// For any given index, only one of the following will be true.
		auto locking_completed = [&logFailed, tLogReply](int index) {
			const auto& entry = tLogReply->at(index);
			return entry.isValid() && entry.isReady() && !entry.isError();
		};
		auto locking_failed = [&logFailed, tLogReply](int index) {
			const auto& entry = tLogReply->at(index);
			return (logFailed[index]->get() && !(entry.isValid() && entry.isReady() && !entry.isError())) ||
			       (entry.isValid() && entry.isReady() && entry.isError());
		};
		auto locking_pending = [&logFailed, tLogReply](int index) {
			const auto& entry = tLogReply->at(index);
			return !logFailed[index]->get() && (entry.isValid() && !entry.isReady());
		};
		auto locking_skipped = [&logFailed, tLogReply](int index) {
			const auto& entry = tLogReply->at(index);
			return !logFailed[index]->get() && !entry.isValid();
		};
		auto can_obtain_quorum = [&prevState](std::function<bool(int)> filter) {
			LocalityGroup filter_true;
			std::vector<LocalityData> filter_false, unused;
			for (int i = 0; i < prevState.tLogLocalities.size(); i++) {
				if (filter(i)) {
					filter_true.add(prevState.tLogLocalities[i]);
				} else {
					filter_false.push_back(prevState.tLogLocalities[i]);
				}
			}
			bool valid = filter_true.validate(prevState.tLogPolicy);
			if (!valid && prevState.tLogWriteAntiQuorum > 0) {
				valid = !validateAllCombinations(unused, filter_true, prevState.tLogPolicy, filter_false, prevState.tLogWriteAntiQuorum, false);
			}
			return valid;
		};
		// Step 1: Check that there aren't already enough locked TLogs.
		// This isn't needed for correctness, but prevents us from doing a lot of unnecessary work below.
		// TODO: Once toolchain supports C++17, use std::not_fn().
		auto not_locking_completed = [&locking_completed](int index) { return !locking_completed(index); };
		if (!can_obtain_quorum(not_locking_completed)) {
			TraceEvent(SevInfo, "MasterRecoveryTLogLockingAlreadySucceeded", dbgid);
			return;
		}

		// Step 2: Verify that if all the failed TLogs come back, they can't form a quorum.
		if (can_obtain_quorum(locking_failed)) {
			TraceEvent(SevInfo, "MasterRecoveryTLogLockingImpossible", dbgid);
			return;
		}

		// Step 3: It's possible for us to succeed, but we need to lock additional logs.
		//
		// First, we need an accurate picture of what TLogs we're capable of locking. We can't tell the
		// difference between a temporarily failed TLog and a permanently failed TLog. Thus, we assume
		// all failures are permanent, and manually re-issue lock requests if they rejoin.
		for (int i = 0; i < logFailed.size(); i++) {
			const auto& r = tLogReply->at(i);
			TEST(locking_failed(i) && (r.isValid() && !r.isReady()));  // A TLog failed with a pending request.
			// The reboot_a_tlog BUGGIFY below should cause the above case to be hit.
			if (locking_failed(i)) {
				tLogReply->at(i) = Future<TLogLockResult>();
			}
		}
		// We're trying to partition the set of old tlogs into two sets, L and R, such that:
		// (1). R does not validate the policy
		// (2). |R| is as large as possible
		// (3). L contains all the already-locked TLogs
		// and then we only issue lock requests to TLogs in L. This is safe, as R does not have quorum,
		// so no commits may occur.  It does not matter if L forms a quorum or not.
		//
		// We form these sets by starting with L as all machines and R as the empty set, and moving a
		// random machine from L to R until (1) or (2) no longer holds as true. Code-wise, L is
		// [0..end-can_omit), and R is [end-can_omit..end), and we move a random machine via randomizing
		// the order of the tlogs. Choosing a random machine was verified to generate a good-enough
		// result to be interesting in tests sufficiently frequently that we don't need to try to
		// calculate the exact optimal solution.
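
		// Illustrative walk-through (added): starting from L = all tlogs and R = {}, tlogs
		// are taken in shuffled order and moved into R for as long as R still fails to
		// validate the policy; everything remaining in L gets a lock request below, and
		// the tlogs in R are intentionally left unlocked.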
		std::vector<std::pair<LocalityData, int>> tlogs;
		for (int i = 0; i < prevState.tLogLocalities.size(); i++) {
			tlogs.emplace_back(prevState.tLogLocalities[i], i);
		}
		g_random->randomShuffle(tlogs);
		// Rearrange the array so that the left side holds logs that are closer to being locked,
		// and the right side holds logs that can't be locked.  This makes us prefer locking
		// already-locked TLogs, which is how we respect the decisions made in the previous execution.
		auto idx_to_order = [&locking_completed, &locking_failed, &locking_pending, &locking_skipped](int index) {
			if (locking_completed(index)) return 0;
			if (locking_pending(index)) return 1;
			if (locking_skipped(index)) return 2;
			if (locking_failed(index)) return 3;
			ASSERT(false);  // Programmer error.
			return -1;
		};
		std::sort(tlogs.begin(), tlogs.end(),
		          // TODO: Change long type to `auto` once toolchain supports C++17.
		          [&idx_to_order](const std::pair<LocalityData, int>& lhs, const std::pair<LocalityData, int>& rhs) {
			          return idx_to_order(lhs.second) < idx_to_order(rhs.second);
		          });
		// Indexes that aren't in the vector are the ones we're considering omitting. Remove indexes until
		// the removed set forms a quorum.
		int can_omit = 0;
		std::vector<int> to_lock_indexes;
		for (auto it = tlogs.cbegin(); it != tlogs.cend() - 1; it++) {
			to_lock_indexes.push_back(it->second);
		}
		auto filter = [&to_lock_indexes](int index) {
			return std::find(to_lock_indexes.cbegin(), to_lock_indexes.cend(), index) == to_lock_indexes.cend();
		};
		while (true) {
			if (can_obtain_quorum(filter)) {
				break;
			} else {
				can_omit++;
				ASSERT(can_omit < tlogs.size());
				to_lock_indexes.pop_back();
			}
		}
		if (prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum == 1) {
			ASSERT(can_omit == 0);
		}
		// Our previous check of making sure there aren't too many failed logs should have prevented this.
		ASSERT(!locking_failed(tlogs[tlogs.size() - can_omit - 1].second));

		// If we've managed to leave more tlogs unlocked than (RF-AQ), it means we've hit the case
		// where the policy engine has allowed us to have multiple logs in the same failure domain
		// with independent sets of data. This case validates that no code is relying on the old
		// quorum=(RF-AQ) logic, and now goes through the policy engine instead.
		TEST(can_omit >= prevState.tLogReplicationFactor - prevState.tLogWriteAntiQuorum);  // Locking a subset of the TLogs while ending an epoch.
		const bool reboot_a_tlog = g_simulator.enableConnectionFailures && BUGGIFY && g_random->random01() < 0.25;
		TraceEvent(SevInfo, "MasterRecoveryTLogLocking", dbgid)
			.detail("locks", tlogs.size() - can_omit)
			.detail("skipped", can_omit)
			.detail("replication", prevState.tLogReplicationFactor)
			.detail("antiquorum", prevState.tLogWriteAntiQuorum)
			.detail("reboot_buggify", reboot_a_tlog);
		for (int i = 0; i < tlogs.size() - can_omit; i++) {
			const int index = tlogs[i].second;
			Future<TLogLockResult>& entry = tLogReply->at(index);
			if (!entry.isValid()) {
				entry = lockTLog(dbgid, logServers[index]);
			}
		}
		if (reboot_a_tlog) {
			for (int i = 0; i < tlogs.size() - can_omit; i++) {
				const int index = tlogs[i].second;
				if (logServers[index]->get().present()) {
					g_simulator.rebootProcess(
					    g_simulator.getProcessByAddress(
					        logServers[index]->get().interf().address()),
					    ISimulator::RebootProcess);
					break;
				}
			}
		}
		// Intentionally leave `tlogs.size() - can_omit` .. `tlogs.size()` as !isValid() Futures.
	}
	template <class T>
	static vector<T> getReadyNonError( vector<Future<T>> const& futures ) {
		// Return the values of those futures which have (non-error) values ready
		std::vector<T> result;
		for(auto& f : futures)
			if (f.isReady() && !f.isError())
				result.push_back(f.get());
		return result;
	}

	struct sort_by_end {
		bool operator()(TLogLockResult const& a, TLogLockResult const& b) const { return a.end < b.end; }
	};
};
Future<Void> ILogSystem::recoverAndEndEpoch( Reference<AsyncVar<Reference<ILogSystem>>> const& outLogSystem, UID const& dbgid, DBCoreState const& oldState, FutureStream<TLogRejoinRequest> const& rejoins, LocalityData const& locality ) {
	return TagPartitionedLogSystem::recoverAndEndEpoch( outLogSystem, dbgid, oldState, rejoins, locality );
}

Reference<ILogSystem> ILogSystem::fromLogSystemConfig( UID const& dbgid, struct LocalityData const& locality, struct LogSystemConfig const& conf ) {
	if (conf.logSystemType == 0)
		return Reference<ILogSystem>();
	else if (conf.logSystemType == 2)
		return TagPartitionedLogSystem::fromLogSystemConfig( dbgid, locality, conf );
	else
		throw internal_error();
}

Reference<ILogSystem> ILogSystem::fromOldLogSystemConfig( UID const& dbgid, struct LocalityData const& locality, struct LogSystemConfig const& conf ) {
	if (conf.logSystemType == 0)
		return Reference<ILogSystem>();
	else if (conf.logSystemType == 2)
		return TagPartitionedLogSystem::fromOldLogSystemConfig( dbgid, locality, conf );
	else
		throw internal_error();
}

Reference<ILogSystem> ILogSystem::fromServerDBInfo( UID const& dbgid, ServerDBInfo const& dbInfo ) {
	return fromLogSystemConfig( dbgid, dbInfo.myLocality, dbInfo.logSystemConfig );
}