/*
 * sim2.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
2018-10-20 01:30:13 +08:00
# include "fdbrpc/simulator.h"
2017-05-26 04:48:44 +08:00
# include "flow/IThreadPool.h"
2018-08-02 09:09:54 +08:00
# include "flow/Util.h"
2018-10-20 01:30:13 +08:00
# include "fdbrpc/IAsyncFile.h"
# include "fdbrpc/AsyncFileCached.actor.h"
# include "fdbrpc/AsyncFileNonDurable.actor.h"
2017-05-26 04:48:44 +08:00
# include "flow/Hash3.h"
2018-10-20 01:30:13 +08:00
# include "fdbrpc/TraceFileIO.h"
2017-05-26 04:48:44 +08:00
# include "flow/FaultInjection.h"
# include "flow/network.h"
2018-10-20 01:30:13 +08:00
# include "fdbrpc/Net2FileSystem.h"
2017-05-26 04:48:44 +08:00
# include "fdbrpc/Replication.h"
# include "fdbrpc/ReplicationUtils.h"
2018-10-20 01:30:13 +08:00
# include "fdbrpc/AsyncFileWriteChecker.h"
2018-12-05 08:33:15 +08:00
# include "flow/actorcompiler.h" // This must be the last #include.
2017-05-26 04:48:44 +08:00
bool simulator_should_inject_fault ( const char * context , const char * file , int line , int error_code ) {
if ( ! g_network - > isSimulated ( ) ) return false ;
auto p = g_simulator . getCurrentProcess ( ) ;
if ( p - > fault_injection_p2 & & g_random - > random01 ( ) < p - > fault_injection_p2 & & ! g_simulator . speedUpSimulation ) {
2019-04-10 02:16:45 +08:00
uint32_t h1 = line + ( p - > fault_injection_r > > 32 ) ;
2017-05-26 04:48:44 +08:00
if ( h1 < p - > fault_injection_p1 * std : : numeric_limits < uint32_t > : : max ( ) ) {
2017-06-16 08:40:19 +08:00
TEST ( true ) ; // A fault was injected
TEST ( error_code = = error_code_io_timeout ) ; // An io timeout was injected
TEST ( error_code = = error_code_io_error ) ; // An io error was injected
TEST ( error_code = = error_code_platform_error ) ; // A platform error was injected.
2017-05-26 04:48:44 +08:00
TraceEvent ( SevWarn , " FaultInjected " ) . detail ( " Context " , context ) . detail ( " File " , file ) . detail ( " Line " , line ) . detail ( " ErrorCode " , error_code ) ;
2017-05-27 08:43:28 +08:00
if ( error_code = = error_code_io_timeout ) {
g_network - > setGlobal ( INetwork : : enASIOTimedOut , ( flowGlobalType ) true ) ;
}
2017-05-26 04:48:44 +08:00
return true ;
}
}
return false ;
}
2017-08-23 07:56:33 +08:00
void ISimulator : : displayWorkers ( ) const
{
2019-01-19 07:42:48 +08:00
std : : map < std : : string , std : : vector < ISimulator : : ProcessInfo * > > machineMap ;
2017-08-23 07:56:33 +08:00
2019-01-30 03:35:02 +08:00
// Create a map of machine Id
2017-08-23 07:56:33 +08:00
for ( auto processInfo : getAllProcesses ( ) ) {
std : : string dataHall = processInfo - > locality . dataHallId ( ) . present ( ) ? processInfo - > locality . dataHallId ( ) . get ( ) . printable ( ) : " [unset] " ;
2019-01-19 07:42:48 +08:00
std : : string machineId = processInfo - > locality . machineId ( ) . present ( ) ? processInfo - > locality . machineId ( ) . get ( ) . printable ( ) : " [unset] " ;
machineMap [ format ( " %-8s %s " , dataHall . c_str ( ) , machineId . c_str ( ) ) ] . push_back ( processInfo ) ;
2017-08-23 07:56:33 +08:00
}
2019-01-19 07:42:48 +08:00
printf ( " DataHall MachineId \n " ) ;
2017-10-04 01:48:16 +08:00
printf ( " Address Name Class Excluded Failed Rebooting Cleared Role DataFolder \n " ) ;
2019-01-19 07:42:48 +08:00
for ( auto & machineRecord : machineMap ) {
printf ( " \n %s \n " , machineRecord . first . c_str ( ) ) ;
for ( auto & processInfo : machineRecord . second ) {
2017-10-04 01:48:16 +08:00
printf ( " %9s %-10s%-13s%-8s %-6s %-9s %-8s %-48s %-40s \n " ,
processInfo - > address . toString ( ) . c_str ( ) , processInfo - > name , processInfo - > startingClass . toString ( ) . c_str ( ) , ( processInfo - > isExcluded ( ) ? " True " : " False " ) , ( processInfo - > failed ? " True " : " False " ) , ( processInfo - > rebooting ? " True " : " False " ) , ( processInfo - > isCleared ( ) ? " True " : " False " ) , getRoles ( processInfo - > address ) . c_str ( ) , processInfo - > dataFolder ) ;
2017-08-23 07:56:33 +08:00
}
}
return ;
}
2017-05-26 04:48:44 +08:00
// std::hash specialization so Endpoint can be used as a key in unordered
// containers.
// NOTE(review): this hashes the raw object representation (&s, sizeof(s)),
// which includes any padding bytes — assumes Endpoint's padding is
// deterministically initialized; confirm against Endpoint's definition.
namespace std {
template <>
class hash<Endpoint> {
public:
	size_t operator()(const Endpoint& s) const
	{
		return hashlittle(&s, sizeof(s), 0);
	}
};
}
// True only while running under simulation and before any simulated process
// has been created.
bool onlyBeforeSimulatorInit() {
	if (!g_network->isSimulated())
		return false;
	return g_simulator.getAllProcesses().empty();
}
// Sentinel UID — presumably returned when an endpoint token lookup fails
// (name-based; confirm against users of this constant).
const UID TOKEN_ENDPOINT_NOT_FOUND(-1, -1);
// The global simulator instance; nullptr until simulation is started.
ISimulator* g_pSimulator = nullptr;
// The simulated process currently executing on this thread.
thread_local ISimulator::ProcessInfo* ISimulator::currentProcess = nullptr;
// Count of SimpleFile opens so far; used below to detect file-handle leaks.
int openCount = 0;
struct SimClogging {
double getSendDelay ( NetworkAddress from , NetworkAddress to ) {
return halfLatency ( ) ;
double tnow = now ( ) ;
double t = tnow + halfLatency ( ) ;
if ( ! g_simulator . speedUpSimulation & & clogSendUntil . count ( to . ip ) )
t = std : : max ( t , clogSendUntil [ to . ip ] ) ;
return t - tnow ;
}
double getRecvDelay ( NetworkAddress from , NetworkAddress to ) {
2018-10-27 00:23:12 +08:00
auto pair = std : : make_pair ( from . ip , to . ip ) ;
2017-05-26 04:48:44 +08:00
double tnow = now ( ) ;
double t = tnow + halfLatency ( ) ;
if ( ! g_simulator . speedUpSimulation )
t + = clogPairLatency [ pair ] ;
if ( ! g_simulator . speedUpSimulation & & clogPairUntil . count ( pair ) )
t = std : : max ( t , clogPairUntil [ pair ] ) ;
if ( ! g_simulator . speedUpSimulation & & clogRecvUntil . count ( to . ip ) )
t = std : : max ( t , clogRecvUntil [ to . ip ] ) ;
return t - tnow ;
}
2019-02-27 10:04:03 +08:00
void clogPairFor ( const IPAddress & from , const IPAddress & to , double t ) {
2018-10-27 00:23:12 +08:00
auto & u = clogPairUntil [ std : : make_pair ( from , to ) ] ;
2017-05-26 04:48:44 +08:00
u = std : : max ( u , now ( ) + t ) ;
}
2019-02-27 10:04:03 +08:00
void clogSendFor ( const IPAddress & from , double t ) {
2017-05-26 04:48:44 +08:00
auto & u = clogSendUntil [ from ] ;
u = std : : max ( u , now ( ) + t ) ;
}
2019-02-27 10:04:03 +08:00
void clogRecvFor ( const IPAddress & from , double t ) {
2017-05-26 04:48:44 +08:00
auto & u = clogRecvUntil [ from ] ;
u = std : : max ( u , now ( ) + t ) ;
}
2019-02-27 10:04:03 +08:00
double setPairLatencyIfNotSet ( const IPAddress & from , const IPAddress & to , double t ) {
2018-10-27 00:23:12 +08:00
auto i = clogPairLatency . find ( std : : make_pair ( from , to ) ) ;
2017-05-26 04:48:44 +08:00
if ( i = = clogPairLatency . end ( ) )
2018-10-27 00:23:12 +08:00
i = clogPairLatency . insert ( std : : make_pair ( std : : make_pair ( from , to ) , t ) ) . first ;
2017-05-26 04:48:44 +08:00
return i - > second ;
}
2019-02-27 10:04:03 +08:00
2017-05-26 04:48:44 +08:00
private :
2019-02-27 10:04:03 +08:00
std : : map < IPAddress , double > clogSendUntil , clogRecvUntil ;
std : : map < std : : pair < IPAddress , IPAddress > , double > clogPairUntil ;
std : : map < std : : pair < IPAddress , IPAddress > , double > clogPairLatency ;
2017-05-26 04:48:44 +08:00
double halfLatency ( ) {
double a = g_random - > random01 ( ) ;
const double pFast = 0.999 ;
if ( a < = pFast ) {
a = a / pFast ;
return 0.5 * ( FLOW_KNOBS - > MIN_NETWORK_LATENCY * ( 1 - a ) + FLOW_KNOBS - > FAST_NETWORK_LATENCY / pFast * a ) ; // 0.5ms average
} else {
a = ( a - pFast ) / ( 1 - pFast ) ; // uniform 0-1 again
return 0.5 * ( FLOW_KNOBS - > MIN_NETWORK_LATENCY * ( 1 - a ) + FLOW_KNOBS - > SLOW_NETWORK_LATENCY * a ) ; // long tail up to X ms
}
}
} ;
SimClogging g_clogging ;
// An in-memory simulated connection between two simulated processes.
// Bytes written on one end are appended to the peer's recvBuf; progress is
// tracked by monotonically increasing byte counters (readBytes <=
// receivedBytes <= sentBytes <= writtenBytes) so the sender/receiver actors
// can model send buffering and network latency between the two ends.
struct Sim2Conn : IConnection, ReferenceCounted<Sim2Conn> {
	Sim2Conn( ISimulator::ProcessInfo* process )
		: process(process), dbgid( g_random->randomUniqueID() ), opened(false), closedByCaller(false)
	{
		// Both pumping actors run for the life of the connection.
		pipes = sender(this) && receiver(this);
	}

	// connect() is called on a pair of connections immediately after creation; logically it is part of the constructor and no other method may be called previously!
	void connect( Reference<Sim2Conn> peer, NetworkAddress peerEndpoint ) {
		this->peer = peer;
		this->peerProcess = peer->process;
		this->peerId = peer->dbgid;
		this->peerEndpoint = peerEndpoint;

		// Every one-way connection gets a random permanent latency and a random send buffer for the duration of the connection
		auto latency = g_clogging.setPairLatencyIfNotSet( peerProcess->address.ip, process->address.ip, FLOW_KNOBS->MAX_CLOGGING_LATENCY * g_random->random01() );
		sendBufSize = std::max<double>( g_random->randomInt(0, 5000000), 25e6 * (latency + .002) );
		TraceEvent("Sim2Connection").detail("SendBufSize", sendBufSize).detail("Latency", latency);
	}

	~Sim2Conn() {
		// A connection that was opened must have been close()d by its owner.
		ASSERT_ABORT( !opened || closedByCaller );
	}

	virtual void addref() { ReferenceCounted<Sim2Conn>::addref(); }
	virtual void delref() { ReferenceCounted<Sim2Conn>::delref(); }
	virtual void close() { closedByCaller = true; closeInternal(); }

	virtual Future<Void> onWritable() { return whenWritable(this); }
	virtual Future<Void> onReadable() { return whenReadable(this); }

	// True if the other end has been cleared or its process has failed.
	bool isPeerGone() {
		return !peer || peerProcess->failed;
	}

	// Called on this end when the peer closes: arms a timer that reports a
	// LeakedConnection error if our owner does not close() us promptly.
	void peerClosed() {
		leakedConnectionTracker = trackLeakedConnection(this);
	}

	// Reads as many bytes as possible from the read buffer into [begin,end) and returns the number of bytes read (might be 0)
	// (or may throw an error if the connection dies)
	virtual int read( uint8_t* begin, uint8_t* end ) {
		rollRandomClose();

		int64_t avail = receivedBytes.get() - readBytes.get(); // SOMEDAY: random?
		int toRead = std::min<int64_t>( end - begin, avail );
		ASSERT( toRead >= 0 && toRead <= recvBuf.size() && toRead <= end - begin );
		for (int i = 0; i < toRead; i++)
			begin[i] = recvBuf[i];
		recvBuf.erase( recvBuf.begin(), recvBuf.begin() + toRead );
		readBytes.set( readBytes.get() + toRead );
		return toRead;
	}

	// Writes as many bytes as possible from the given SendBuffer chain into the write buffer and returns the number of bytes written (might be 0)
	// (or may throw an error if the connection dies)
	virtual int write( SendBuffer const* buffer, int limit ) {
		rollRandomClose();
		ASSERT( limit > 0 );

		int toSend = 0;
		if (BUGGIFY) {
			// Simulate a short write covering only the first buffer in the chain.
			toSend = std::min( limit, buffer->bytes_written - buffer->bytes_sent );
		} else {
			for (auto p = buffer; p; p = p->next) {
				toSend += p->bytes_written - p->bytes_sent;
				if (toSend >= limit) {
					if (toSend > limit)
						toSend = limit;
					break;
				}
			}
		}

		ASSERT( toSend );
		if (BUGGIFY) toSend = std::min( toSend, g_random->randomInt(0, 1000) );

		if (!peer) return toSend;
		toSend = std::min( toSend, peer->availableSendBufferForPeer() );
		ASSERT( toSend >= 0 );

		int leftToSend = toSend;
		for (auto p = buffer; p && leftToSend > 0; p = p->next) {
			int ts = std::min( leftToSend, p->bytes_written - p->bytes_sent );
			peer->recvBuf.insert( peer->recvBuf.end(), p->data + p->bytes_sent, p->data + p->bytes_sent + ts );
			leftToSend -= ts;
		}
		ASSERT( leftToSend == 0 );
		peer->writtenBytes.set( peer->writtenBytes.get() + toSend );
		return toSend;
	}

	// Returns the network address and port of the other end of the connection. In the case of an incoming connection, this may not
	// be an address we can connect to!
	virtual NetworkAddress getPeerAddress() { return peerEndpoint; }
	virtual UID getDebugID() { return dbgid; }

	bool opened, closedByCaller;

private:
	ISimulator::ProcessInfo *process, *peerProcess;
	UID dbgid, peerId;
	NetworkAddress peerEndpoint;
	std::deque<uint8_t> recvBuf; // Includes bytes written but not yet received!
	AsyncVar<int64_t> readBytes, // bytes already pulled from recvBuf (location of the beginning of recvBuf)
		receivedBytes,
		sentBytes,
		writtenBytes; // location of the end of recvBuf ( == recvBuf.size() + readBytes.get() )
	Reference<Sim2Conn> peer;
	int sendBufSize;

	Future<Void> leakedConnectionTracker;

	Future<Void> pipes;

	// Free space remaining in this connection's simulated send buffer.
	int availableSendBufferForPeer() const { return sendBufSize - (writtenBytes.get() - receivedBytes.get()); } // SOMEDAY: acknowledgedBytes instead of receivedBytes

	void closeInternal() {
		if (peer) {
			peer->peerClosed();
		}
		leakedConnectionTracker.cancel();
		peer.clear();
	}

	// Advances sentBytes toward writtenBytes after a short random delay,
	// modeling the peer draining its send buffer. Runs on the peer process.
	ACTOR static Future<Void> sender( Sim2Conn* self ) {
		loop {
			wait( self->writtenBytes.onChange() ); // takes place on peer!
			ASSERT( g_simulator.getCurrentProcess() == self->peerProcess );
			wait( delay( .002 * g_random->random01() ) );
			self->sentBytes.set( self->writtenBytes.get() ); // or possibly just some sometimes...
		}
	}

	// Advances receivedBytes toward sentBytes after simulated network delays,
	// switching simulated execution from the peer process to this process.
	ACTOR static Future<Void> receiver( Sim2Conn* self ) {
		loop {
			if (self->sentBytes.get() != self->receivedBytes.get())
				wait( g_simulator.onProcess( self->peerProcess ) );
			while (self->sentBytes.get() == self->receivedBytes.get())
				wait( self->sentBytes.onChange() );
			ASSERT( g_simulator.getCurrentProcess() == self->peerProcess );
			// Sometimes deliver everything sent so far; sometimes only a random prefix.
			state int64_t pos = g_random->random01() < .5 ? self->sentBytes.get() : g_random->randomInt64( self->receivedBytes.get(), self->sentBytes.get() + 1 );
			wait( delay( g_clogging.getSendDelay( self->process->address, self->peerProcess->address ) ) );
			wait( g_simulator.onProcess( self->process ) );
			ASSERT( g_simulator.getCurrentProcess() == self->process );
			wait( delay( g_clogging.getRecvDelay( self->process->address, self->peerProcess->address ) ) );
			ASSERT( g_simulator.getCurrentProcess() == self->process );
			self->receivedBytes.set( pos );
			wait( Future<Void>(Void()) ); // Prior notification can delete self and cancel this actor
			ASSERT( g_simulator.getCurrentProcess() == self->process );
		}
	}

	// Completes when at least one byte is available to read() (or throws if
	// the connection dies while waiting).
	ACTOR static Future<Void> whenReadable( Sim2Conn* self ) {
		try {
			loop {
				if (self->readBytes.get() != self->receivedBytes.get()) {
					ASSERT( g_simulator.getCurrentProcess() == self->process );
					return Void();
				}
				wait( self->receivedBytes.onChange() );
				self->rollRandomClose();
			}
		} catch (Error& e) {
			ASSERT( g_simulator.getCurrentProcess() == self->process );
			throw;
		}
	}

	// Completes when the peer has send-buffer space (or immediately if the
	// peer is gone); always returns on this connection's process.
	ACTOR static Future<Void> whenWritable( Sim2Conn* self ) {
		try {
			loop {
				if (!self->peer) return Void();
				if (self->peer->availableSendBufferForPeer() > 0) {
					ASSERT( g_simulator.getCurrentProcess() == self->process );
					return Void();
				}
				try {
					wait( self->peer->receivedBytes.onChange() );
					ASSERT( g_simulator.getCurrentProcess() == self->peerProcess );
				} catch (Error& e) {
					// broken_promise here means the peer went away; loop will notice !peer.
					if (e.code() != error_code_broken_promise) throw;
				}
				wait( g_simulator.onProcess( self->process ) );
			}
		} catch (Error& e) {
			ASSERT( g_simulator.getCurrentProcess() == self->process );
			throw;
		}
	}

	// With small probability (and only while connection failures are enabled),
	// randomly close one or both ends and/or throw connection_failed().
	void rollRandomClose() {
		if (now() - g_simulator.lastConnectionFailure > g_simulator.connectionFailuresDisableDuration && g_random->random01() < .00001) {
			g_simulator.lastConnectionFailure = now();
			double a = g_random->random01(), b = g_random->random01();
			TEST(true); // Simulated connection failure
			TraceEvent("ConnectionFailure", dbgid).detail("MyAddr", process->address).detail("PeerAddr", peerProcess->address).detail("SendClosed", a > .33).detail("RecvClosed", a < .66).detail("Explicit", b < .3);
			if (a < .66 && peer) peer->closeInternal();
			if (a > .33) closeInternal();
			// At the moment, we occasionally notice the connection failed immediately. In principle, this could happen but only after a delay.
			if (b < .3)
				throw connection_failed();
		}
	}

	// Fires a SevError LeakedConnection trace if this connection is still not
	// closed 20 seconds after the peer closed it.
	ACTOR static Future<Void> trackLeakedConnection( Sim2Conn* self ) {
		wait( g_simulator.onProcess( self->process ) );
		// SOMEDAY: Make this value variable? Dependent on buggification status?
		wait( delay( 20.0 ) );
		TraceEvent( SevError, "LeakedConnection", self->dbgid ).error( connection_leaked() ).detail("MyAddr", self->process->address).detail("PeerAddr", self->peerEndpoint).detail("PeerId", self->peerId).detail("Opened", self->opened);
		return Void();
	}
};
#include <fcntl.h>
#include <sys/stat.h>

// Opens a real host file backing a simulated file. 'flags' are the IAsyncFile
// OPEN_* flags (presumably used by a Windows implementation not visible in
// this chunk); 'convFlags' are the already-converted platform open() flags.
int sf_open( const char* filename, int flags, int convFlags, int mode );

#if defined(_WIN32)
#include <io.h>
#elif defined(__unixish__)
// Map the Windows CRT low-level I/O names used by SimpleFile onto their POSIX
// equivalents so one set of calls works on both platforms.
#define _open ::open
#define _read ::read
#define _write ::write
#define _close ::close
#define _lseeki64 ::lseek
#define _commit ::fsync
#define _chsize ::ftruncate
#define O_BINARY 0

int sf_open( const char* filename, int flags, int convFlags, int mode ) {
	// 'flags' is unused here: POSIX open() takes the converted flags directly.
	return _open( filename, convFlags, mode );
}
#else
#error How do i open a file on a new platform?
#endif
// A simulated IAsyncFile backed by a real file on the host filesystem, with
// simulated disk latency (waitUntilDiskReady), fault injection, and support
// for OPEN_ATOMIC_WRITE_AND_CREATE via a ".part" temporary that is renamed
// into place on the first sync().
class SimpleFile : public IAsyncFile, public ReferenceCounted<SimpleFile> {
public:
	static void init() {}

	static bool should_poll() { return false; }

	// Opens (or atomically creates) a simulated file on the current machine.
	// Runs on the machine, then returns to the calling process/task.
	ACTOR static Future<Reference<IAsyncFile>> open( std::string filename, int flags, int mode,
			Reference<DiskParameters> diskParameters = Reference<DiskParameters>(new DiskParameters(25000, 150000000)), bool delayOnWrite = true ) {
		state ISimulator::ProcessInfo* currentProcess = g_simulator.getCurrentProcess();
		state int currentTaskID = g_network->getCurrentTask();

		// Guard against file-handle leaks in simulation: at 2000 open files,
		// warn and disable connection failures; at 3000, abort.
		if (++openCount >= 3000) {
			TraceEvent(SevError, "TooManyFiles");
			ASSERT(false);
		}

		if (openCount == 2000) {
			TraceEvent(SevWarnAlways, "DisableConnectionFailures_TooManyFiles");
			g_simulator.speedUpSimulation = true;
			g_simulator.connectionFailuresDisableDuration = 1e6;
		}

		// Filesystems on average these days seem to start to have limits of around 255 characters for a
		// filename. We add ".part" below, so we need to stay under 250.
		ASSERT( basename(filename).size() < 250 );

		wait( g_simulator.onMachine( currentProcess ) );
		try {
			wait( delay(FLOW_KNOBS->MIN_OPEN_TIME + g_random->random01() * (FLOW_KNOBS->MAX_OPEN_TIME - FLOW_KNOBS->MIN_OPEN_TIME)) );

			std::string open_filename = filename;
			if (flags & OPEN_ATOMIC_WRITE_AND_CREATE) {
				ASSERT( (flags & OPEN_CREATE) && (flags & OPEN_READWRITE) && !(flags & OPEN_EXCLUSIVE) );
				open_filename = filename + ".part";
			}

			int h = sf_open( open_filename.c_str(), flags, flagConversion(flags), mode );
			if (h == -1) {
				bool notFound = errno == ENOENT;
				Error e = notFound ? file_not_found() : io_error();
				TraceEvent(notFound ? SevWarn : SevWarnAlways, "FileOpenError").error(e).GetLastError().detail("File", filename).detail("Flags", flags);
				throw e;
			}

			platform::makeTemporary(open_filename.c_str());
			SimpleFile* simpleFile = new SimpleFile( h, diskParameters, delayOnWrite, filename, open_filename, flags );
			state Reference<IAsyncFile> file = Reference<IAsyncFile>( simpleFile );
			wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
			return file;
		} catch (Error& e) {
			state Error err = e;
			// Return to the opening process/task before rethrowing so the
			// error is delivered in the caller's simulated context.
			wait( g_simulator.onProcess( currentProcess, currentTaskID ) );
			throw err;
		}
	}

	virtual void addref() { ReferenceCounted<SimpleFile>::addref(); }
	virtual void delref() { ReferenceCounted<SimpleFile>::delref(); }

	virtual int64_t debugFD() { return (int64_t)h; }

	virtual Future<int> read( void* data, int length, int64_t offset ) {
		return read_impl( this, data, length, offset );
	}
	virtual Future<Void> write( void const* data, int length, int64_t offset ) {
		return write_impl( this, StringRef((const uint8_t*)data, length), offset );
	}
	virtual Future<Void> truncate( int64_t size ) {
		return truncate_impl( this, size );
	}
	virtual Future<Void> sync() {
		return sync_impl( this );
	}
	virtual Future<int64_t> size() {
		return size_impl( this );
	}
	virtual std::string getFilename() {
		return actualFilename;
	}

	~SimpleFile() {
		_close( h );
	}

private:
	int h; // host OS file descriptor

	//Performance parameters of simulated disk
	Reference<DiskParameters> diskParameters;

	// actualFilename is filename + ".part" until an atomic create is synced.
	std::string filename, actualFilename;
	int flags;
	UID dbgId; // correlates randLog entries for this file

	//If true, then writes/truncates will be preceded by a delay (like other operations). If false, then they will not
	//This is to support AsyncFileNonDurable, which issues its own delays for writes and truncates
	bool delayOnWrite;

	SimpleFile(int h, Reference<DiskParameters> diskParameters, bool delayOnWrite, const std::string& filename, const std::string& actualFilename, int flags)
		: h(h), diskParameters(diskParameters), delayOnWrite(delayOnWrite), filename(filename), actualFilename(actualFilename), dbgId(g_random->randomUniqueID()), flags(flags) {}

	// Translates IAsyncFile OPEN_* flags into POSIX/CRT open() flags.
	static int flagConversion( int flags ) {
		int outFlags = O_BINARY;
		if (flags & OPEN_READWRITE) outFlags |= O_RDWR;
		if (flags & OPEN_CREATE) outFlags |= O_CREAT;
		if (flags & OPEN_READONLY) outFlags |= O_RDONLY;
		if (flags & OPEN_EXCLUSIVE) outFlags |= O_EXCL;
		if (flags & OPEN_ATOMIC_WRITE_AND_CREATE) outFlags |= O_TRUNC;
		return outFlags;
	}

	// Reads 'length' bytes at 'offset' after a simulated disk delay.
	// Throws io_error on host I/O failure; may inject io_timeout/io_error.
	ACTOR static Future<int> read_impl( SimpleFile* self, void* data, int length, int64_t offset ) {
		ASSERT( (self->flags & IAsyncFile::OPEN_NO_AIO) != 0 ||
		        ((uintptr_t)data % 4096 == 0 && length % 4096 == 0 && offset % 4096 == 0) ); // Required by KAIO.
		state UID opId = g_random->randomUniqueID();
		if (randLog)
			fprintf( randLog, "SFR1 %s %s %s %d %lld\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), length, offset );

		wait( waitUntilDiskReady( self->diskParameters, length ) );

		if (_lseeki64( self->h, offset, SEEK_SET ) == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 1);
			throw io_error();
		}

		unsigned int read_bytes = 0;
		if ((read_bytes = _read( self->h, data, (unsigned int)length )) == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 2);
			throw io_error();
		}

		if (randLog) {
			uint32_t a = 0, b = 0;
			hashlittle2( data, read_bytes, &a, &b );
			fprintf( randLog, "SFR2 %s %s %s %d %d\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), read_bytes, a );
		}

		debugFileCheck("SimpleFileRead", self->filename, data, offset, length);

		INJECT_FAULT(io_timeout, "SimpleFile::read");
		INJECT_FAULT(io_error, "SimpleFile::read");

		return read_bytes;
	}

	// Writes all of 'data' at 'offset'; short writes are treated as errors.
	ACTOR static Future<Void> write_impl( SimpleFile* self, StringRef data, int64_t offset ) {
		state UID opId = g_random->randomUniqueID();
		if (randLog) {
			uint32_t a = 0, b = 0;
			hashlittle2( data.begin(), data.size(), &a, &b );
			fprintf( randLog, "SFW1 %s %s %s %d %d %lld\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), a, data.size(), offset );
		}

		if (self->delayOnWrite)
			wait( waitUntilDiskReady( self->diskParameters, data.size() ) );

		if (_lseeki64( self->h, offset, SEEK_SET ) == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 3);
			throw io_error();
		}

		unsigned int write_bytes = 0;
		if ((write_bytes = _write( self->h, (void*)data.begin(), data.size() )) == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 4);
			throw io_error();
		}

		if (write_bytes != data.size()) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 5);
			throw io_error();
		}

		if (randLog) {
			fprintf( randLog, "SFW2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str());
		}

		debugFileCheck("SimpleFileWrite", self->filename, (void*)data.begin(), offset, data.size());

		INJECT_FAULT(io_timeout, "SimpleFile::write");
		INJECT_FAULT(io_error, "SimpleFile::write");

		return Void();
	}

	// Truncates the file to 'size' after an optional simulated disk delay.
	ACTOR static Future<Void> truncate_impl( SimpleFile* self, int64_t size ) {
		state UID opId = g_random->randomUniqueID();
		if (randLog)
			fprintf( randLog, "SFT1 %s %s %s %lld\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), size );

		if (self->delayOnWrite)
			wait( waitUntilDiskReady( self->diskParameters, 0 ) );

		// NOTE(review): the (long) cast truncates sizes above 2GB where long
		// is 32 bits — confirm the intended range of 'size'.
		if (_chsize( self->h, (long)size ) == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 6);
			throw io_error();
		}

		if (randLog)
			fprintf( randLog, "SFT2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str());

		INJECT_FAULT( io_timeout, "SimpleFile::truncate" );
		INJECT_FAULT( io_error, "SimpleFile::truncate" );

		return Void();
	}

	// Simulated fsync. On the first sync of an atomically-created file,
	// renames the ".part" temporary into place and fixes up the machine's
	// open-file cache to match.
	ACTOR static Future<Void> sync_impl( SimpleFile* self ) {
		state UID opId = g_random->randomUniqueID();
		if (randLog)
			fprintf( randLog, "SFC1 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str());

		if (self->delayOnWrite)
			wait( waitUntilDiskReady( self->diskParameters, 0, true ) );

		if (self->flags & OPEN_ATOMIC_WRITE_AND_CREATE) {
			self->flags &= ~OPEN_ATOMIC_WRITE_AND_CREATE;
			auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles;
			std::string sourceFilename = self->filename + ".part";

			if (machineCache.count(sourceFilename)) {
				TraceEvent("SimpleFileRename").detail("From", sourceFilename).detail("To", self->filename).detail("SourceCount", machineCache.count(sourceFilename)).detail("FileCount", machineCache.count(self->filename));
				renameFile( sourceFilename.c_str(), self->filename.c_str() );

				ASSERT(!machineCache.count(self->filename));
				machineCache[self->filename] = machineCache[sourceFilename];
				machineCache.erase(sourceFilename);
				self->actualFilename = self->filename;
			}
		}

		if (randLog)
			fprintf( randLog, "SFC2 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str());

		INJECT_FAULT( io_timeout, "SimpleFile::sync" );
		INJECT_FAULT( io_error, "SimpleFile::sync" );

		return Void();
	}

	// Returns the current file size (seek to end) after a simulated delay.
	ACTOR static Future<int64_t> size_impl( SimpleFile* self ) {
		state UID opId = g_random->randomUniqueID();
		if (randLog)
			fprintf(randLog, "SFS1 %s %s %s\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str());

		wait( waitUntilDiskReady( self->diskParameters, 0 ) );

		int64_t pos = _lseeki64( self->h, 0L, SEEK_END );
		if (pos == -1) {
			TraceEvent(SevWarn, "SimpleFileIOError").detail("Location", 8);
			throw io_error();
		}

		if (randLog)
			fprintf(randLog, "SFS2 %s %s %s %lld\n", self->dbgId.shortString().c_str(), self->filename.c_str(), opId.shortString().c_str(), pos);

		INJECT_FAULT( io_error, "SimpleFile::size" );

		return pos;
	}
};
// Simulated free-space accounting for one machine's disk.
struct SimDiskSpace {
	int64_t totalSpace;
	int64_t baseFreeSpace; //The original free space of the disk + deltas from simulated external modifications
	double lastUpdate; // presumably the last simulation time the free space was recomputed — confirm against users of this struct
};

// Forward declaration: performs a simulated reboot/kill of a process
// (definition is not visible in this chunk).
void doReboot( ISimulator::ProcessInfo* const& p, ISimulator::KillType const& kt );
// A simulated listening socket for one simulated process. Other processes
// inject connections via incomingConnection(); they are handed out (after a
// simulated delay) through accept().
struct Sim2Listener : IListener, ReferenceCounted<Sim2Listener> {
	explicit Sim2Listener( ISimulator::ProcessInfo* process, const NetworkAddress& listenAddr )
		: process(process),
		  address(listenAddr) {}

	// Delivers conn to this listener after 'seconds' of simulated time.
	void incomingConnection( double seconds, Reference<IConnection> conn ) { // Called by another process!
		incoming( Reference<Sim2Listener>::addRef( this ), seconds, conn );
	}

	virtual void addref() { ReferenceCounted<Sim2Listener>::addref(); }
	virtual void delref() { ReferenceCounted<Sim2Listener>::delref(); }

	// Completes with the next incoming connection, marking it opened.
	virtual Future<Reference<IConnection>> accept() {
		return popOne( nextConnection.getFuture() );
	}

	virtual NetworkAddress getListenAddress() { return address; }

private:
	ISimulator::ProcessInfo* process;
	PromiseStream<Reference<IConnection>> nextConnection;

	ACTOR static void incoming( Reference<Sim2Listener> self, double seconds, Reference<IConnection> conn ) {
		wait( g_simulator.onProcess( self->process ) );
		wait( delay( seconds ) );
		// If the dialing end is already gone, sometimes drop the connection
		// silently (models a connect racing with a peer failure).
		if (((Sim2Conn*)conn.getPtr())->isPeerGone() && g_random->random01() < 0.5)
			return;
		TraceEvent("Sim2IncomingConn", conn->getDebugID())
			.detail("ListenAddress", self->getListenAddress())
			.detail("PeerAddress", conn->getPeerAddress());
		self->nextConnection.send( conn );
	}
	ACTOR static Future<Reference<IConnection>> popOne( FutureStream<Reference<IConnection>> conns ) {
		Reference<IConnection> c = waitNext( conns );
		((Sim2Conn*)c.getPtr())->opened = true;
		return c;
	}

	NetworkAddress address;
};
# define g_sim2 ((Sim2&)g_simulator)
class Sim2 : public ISimulator , public INetworkConnections {
public :
// Implement INetwork interface
// Everything actually network related is delegated to the Sim2Net class; Sim2 is only concerned with simulating machines and time
// Returns the current simulated time (not wall-clock time).
virtual double now() { return time; }

// Schedules a wakeup for the process that is currently executing, after
// 'seconds' of simulated time, at priority 'taskID'.
virtual Future<class Void> delay(double seconds, int taskID) {
	ASSERT(taskID >= TaskMinPriority && taskID <= TaskMaxPriority);
	return delay(seconds, taskID, currentProcess);
}
// Schedules a wakeup for 'machine' after 'seconds' of simulated time at
// priority 'taskID'.  The returned future becomes ready when runLoop()
// executes the queued Task.
Future < class Void > delay ( double seconds , int taskID , ProcessInfo * machine ) {
ASSERT ( seconds > = - 0.0001 ) ;
seconds = std : : max ( 0.0 , seconds ) ;
Future < Void > f ;
// Occasionally lengthen the delay (BUGGIFY-style) to shake out code that
// depends on exact timing; only when staying on the same running process.
if ( ! currentProcess - > rebooting & & machine = = currentProcess & & ! currentProcess - > shutdownSignal . isSet ( ) & & FLOW_KNOBS - > MAX_BUGGIFIED_DELAY > 0 & & g_random - > random01 ( ) < 0.25 ) { //FIXME: why doesnt this work when we are changing machines?
seconds + = FLOW_KNOBS - > MAX_BUGGIFIED_DELAY * pow ( g_random - > random01 ( ) , 1000.0 ) ;
}
// NOTE(review): Task's constructor presumably links 'f' to the promise that
// runLoop() fulfills -- the Task ctor is not visible in this chunk; confirm.
mutex . enter ( ) ;
tasks . push ( Task ( time + seconds , taskID , taskCount + + , machine , f ) ) ;
mutex . leave ( ) ;
return f ;
}
// Completes when the current process's shutdown signal fires, then restores
// the caller's task priority.  Raced against the delay in yield() so that
// "rebooted" processes can shut down instantly.
ACTOR static Future < Void > checkShutdown ( Sim2 * self , int taskID ) {
2019-04-17 05:22:31 +08:00
wait ( success ( self - > getCurrentProcess ( ) - > shutdownSignal . getFuture ( ) ) ) ;
2017-05-26 04:48:44 +08:00
self - > setCurrentTask ( taskID ) ;
return Void ( ) ;
}
// Yields to other tasks at the given priority.  Returns an immediately ready
// future when no yield is needed; otherwise a short delay raced against
// process shutdown.
virtual Future < class Void > yield ( int taskID ) {
if ( taskID = = TaskDefaultYield ) taskID = currentTaskID ;
if ( check_yield ( taskID ) ) {
// We want to check that yielders can handle actual time elapsing (it sometimes will outside simulation), but
// don't want to prevent instantaneous shutdown of "rebooted" machines.
return delay ( getCurrentProcess ( ) - > rebooting ? 0 : .001 , taskID ) | | checkShutdown ( this , taskID ) ;
}
setCurrentTask ( taskID ) ;
return Void ( ) ;
}
// Returns true when the caller should yield.  Deterministically forces a
// yield after a random number of negative answers, plus an occasional
// BUGGIFY-induced yield, since stack depth can't be measured in simulation.
virtual bool check_yield ( int taskID ) {
if ( yielded ) return true ;
if ( - - yield_limit < = 0 ) {
yield_limit = g_random - > randomInt ( 1 , 150 ) ; // If yield returns false *too* many times in a row, there could be a stack overflow, since we can't deterministically check stack size as the real network does
return yielded = true ;
}
return yielded = BUGGIFY_WITH_PROB ( 0.01 ) ;
}
// Returns the priority of the task that is currently executing.
virtual int getCurrentTask() { return currentTaskID; }

// Changes the priority of the currently executing task without yielding.
virtual void setCurrentTask(int taskID) { currentTaskID = taskID; }
// Sets the taskID/priority of the current task, without yielding
2018-05-09 07:28:13 +08:00
// Establishes a simulated TCP connection to 'toAddr': builds a paired
// Sim2Conn for each side, hands the peer's end to its listener after a
// random delay, and completes the client end via onConnect().
virtual Future < Reference < IConnection > > connect ( NetworkAddress toAddr , std : : string host ) {
ASSERT ( ! toAddr . isTLS ( ) & & host . empty ( ) ) ;
2017-05-26 04:48:44 +08:00
// The target process may not exist yet; poll until it appears.
if ( ! addressMap . count ( toAddr ) ) {
return waitForProcessAndConnect ( toAddr , this ) ;
}
auto peerp = getProcessByAddress ( toAddr ) ;
Reference < Sim2Conn > myc ( new Sim2Conn ( getCurrentProcess ( ) ) ) ;
Reference < Sim2Conn > peerc ( new Sim2Conn ( peerp ) ) ;
2018-12-07 03:48:50 +08:00
myc - > connect ( peerc , toAddr ) ;
2019-02-28 16:09:53 +08:00
// Fabricate a plausible ephemeral local address by randomly perturbing the
// low bits of this process's IP (works for both v4 and v6).
IPAddress localIp ;
if ( getCurrentProcess ( ) - > address . ip . isV6 ( ) ) {
IPAddress : : IPAddressStore store = getCurrentProcess ( ) - > address . ip . toV6 ( ) ;
uint16_t * ipParts = ( uint16_t * ) store . data ( ) ;
ipParts [ 7 ] + = g_random - > randomInt ( 0 , 256 ) ;
localIp = IPAddress ( store ) ;
} else {
localIp = IPAddress ( getCurrentProcess ( ) - > address . ip . toV4 ( ) + g_random - > randomInt ( 0 , 256 ) ) ;
}
2019-02-27 10:04:03 +08:00
peerc - > connect ( myc , NetworkAddress ( localIp , g_random - > randomInt ( 40000 , 60000 ) ) ) ;
2017-05-26 04:48:44 +08:00
2018-12-07 03:48:50 +08:00
// Deliver the peer's end to the listener after a random simulated latency.
( ( Sim2Listener * ) peerp - > getListener ( toAddr ) . getPtr ( ) ) - > incomingConnection ( 0.5 * g_random - > random01 ( ) , Reference < IConnection > ( peerc ) ) ;
2017-05-26 04:48:44 +08:00
return onConnect ( : : delay ( 0.5 * g_random - > random01 ( ) ) , myc ) ;
}
2017-10-16 12:51:11 +08:00
// DNS resolution is not modeled by the simulator; every lookup fails.
virtual Future<std::vector<NetworkAddress>> resolveTCPEndpoint(std::string host, std::string service) {
	throw lookup_failed();
}
2017-05-26 04:48:44 +08:00
// Completes the client side of connect() after the simulated handshake delay.
// If the peer has already vanished, sometimes hangs forever to model a
// connection attempt that never completes.
ACTOR static Future < Reference < IConnection > > onConnect ( Future < Void > ready , Reference < Sim2Conn > conn ) {
2018-08-11 04:57:10 +08:00
wait ( ready ) ;
2017-05-26 04:48:44 +08:00
if ( conn - > isPeerGone ( ) & & g_random - > random01 ( ) < 0.5 ) {
conn . clear ( ) ;
2018-08-11 04:57:10 +08:00
wait ( Never ( ) ) ;
2017-05-26 04:48:44 +08:00
}
conn - > opened = true ;
return conn ;
}
// Listeners are created up front in newProcess(); listening just returns the
// pre-built listener registered for this address.
virtual Reference<IListener> listen(NetworkAddress localAddr) {
	ASSERT(!localAddr.isTLS());
	Reference<IListener> existing(getCurrentProcess()->getListener(localAddr));
	ASSERT(existing);
	return existing;
}
// Polls until a process exists at 'toAddr', then retries the connect.
ACTOR static Future < Reference < IConnection > > waitForProcessAndConnect (
NetworkAddress toAddr , INetworkConnections * self ) {
// We have to be able to connect to processes that don't yet exist, so we do some silly polling
loop {
2018-08-11 04:57:10 +08:00
wait ( : : delay ( 0.1 * g_random - > random01 ( ) ) ) ;
2017-05-26 04:48:44 +08:00
if ( g_sim2 . addressMap . count ( toAddr ) ) {
Reference < IConnection > c = wait ( self - > connect ( toAddr ) ) ;
return c ;
}
}
}
// Asks runLoop() to exit once the task it is currently running finishes.
virtual void stop() { isStopped = true; }
virtual bool isSimulated() const { return true; }
// Bundle handed to simStartThread() so a newly spawned OS thread inherits the
// simulated process context of the thread that created it.
struct SimThreadArgs {
	THREAD_FUNC_RETURN (*func)(void*); // user thread entry point
	void* arg;                         // opaque argument for 'func'
	ISimulator::ProcessInfo* currentProcess; // process context to inherit

	SimThreadArgs(THREAD_FUNC_RETURN (*func)(void*), void* arg) : func(func), arg(arg) {
		ASSERT(g_network->isSimulated());
		currentProcess = g_simulator.getCurrentProcess();
	}
};
// Thread trampoline: installs the creator's simulated process as this
// thread's current process, runs the user function, and frees the args.
THREAD_FUNC simStartThread(void* arg) {
	SimThreadArgs* args = (SimThreadArgs*)arg;
	ISimulator::currentProcess = args->currentProcess;
	args->func(args->arg);
	delete args;
	THREAD_RETURN;
}
// Starts a real OS thread whose entry point runs inside the current
// simulated process (see simStartThread).
virtual THREAD_HANDLE startThread(THREAD_FUNC_RETURN (*func)(void*), void* arg) {
	SimThreadArgs* wrapped = new SimThreadArgs(func, arg); // freed by simStartThread
	return ::startThread(simStartThread, wrapped);
}
// Reports simulated free/total disk space for the current process's machine.
// Total capacity is chosen randomly on first use; free space drifts randomly
// over time and always accounts for the files the simulation has created.
// 'directory' is unused: space is tracked per machine IP, not per directory.
virtual void getDiskBytes ( std : : string const & directory , int64_t & free , int64_t & total ) {
ProcessInfo * proc = getCurrentProcess ( ) ;
SimDiskSpace & diskSpace = diskSpaceMap [ proc - > address . ip ] ;
int64_t totalFileSize = 0 ;
int numFiles = 0 ;
//Get the size of all files we've created on the server and subtract them from the free space
for ( auto file = proc - > machine - > openFiles . begin ( ) ; file ! = proc - > machine - > openFiles . end ( ) ; + + file ) {
if ( file - > second . isReady ( ) ) {
totalFileSize + = ( ( AsyncFileNonDurable * ) file - > second . get ( ) . getPtr ( ) ) - > approximateSize ;
}
numFiles + + ;
}
// First call for this machine: pick a random capacity and baseline free space.
if ( diskSpace . totalSpace = = 0 ) {
diskSpace . totalSpace = 5e9 + g_random - > random01 ( ) * 100e9 ; //Total space between 5GB and 105GB
diskSpace . baseFreeSpace = std : : min < int64_t > ( diskSpace . totalSpace , std : : max ( 5e9 , ( g_random - > random01 ( ) * ( 1 - .075 ) + .075 ) * diskSpace . totalSpace ) + totalFileSize ) ; //Minimum 5GB or 7.5% total disk space, whichever is higher
2018-06-09 02:11:08 +08:00
TraceEvent ( " Sim2DiskSpaceInitialization " ) . detail ( " TotalSpace " , diskSpace . totalSpace ) . detail ( " BaseFreeSpace " , diskSpace . baseFreeSpace ) . detail ( " TotalFileSize " , totalFileSize ) . detail ( " NumFiles " , numFiles ) ;
2017-05-26 04:48:44 +08:00
}
else {
// Randomly drift the baseline, bounded by elapsed time, but never below
// the space our own files occupy.
int64_t maxDelta = std : : min ( 5.0 , ( now ( ) - diskSpace . lastUpdate ) ) * ( BUGGIFY ? 10e6 : 1e6 ) ; //External processes modifying the disk
int64_t delta = - maxDelta + g_random - > random01 ( ) * maxDelta * 2 ;
diskSpace . baseFreeSpace = std : : min < int64_t > ( diskSpace . totalSpace , std : : max < int64_t > ( diskSpace . baseFreeSpace + delta , totalFileSize ) ) ;
}
diskSpace . lastUpdate = now ( ) ;
total = diskSpace . totalSpace ;
free = std : : max < int64_t > ( 0 , diskSpace . baseFreeSpace - totalFileSize ) ;
if ( free = = 0 )
2018-06-09 02:11:08 +08:00
TraceEvent ( SevWarnAlways , " Sim2NoFreeSpace " ) . detail ( " TotalSpace " , diskSpace . totalSpace ) . detail ( " BaseFreeSpace " , diskSpace . baseFreeSpace ) . detail ( " TotalFileSize " , totalFileSize ) . detail ( " NumFiles " , numFiles ) ;
2017-05-26 04:48:44 +08:00
}
// Two addresses are on the same simulated host exactly when their IPs match.
virtual bool isAddressOnThisHost(NetworkAddress const& addr) {
	return addr.ip == getCurrentProcess()->address.ip;
}
// Simulated file deletion.  Durable deletes (and a random half of the rest)
// really delete on the machine's filesystem after random delays; the
// remainder are "forgotten" without deleting, modeling non-durable deletes.
ACTOR static Future < Void > deleteFileImpl ( Sim2 * self , std : : string filename , bool mustBeDurable ) {
// This is a _rudimentary_ simulation of the untrustworthiness of non-durable deletes and the possibility of
// rebooting during a durable one. It isn't perfect: for example, on real filesystems testing
// for the existence of a non-durably deleted file BEFORE a reboot will show that it apparently doesn't exist.
2017-08-26 01:12:58 +08:00
// Stop tracking the file as open; record that it is being deleted.
if ( g_simulator . getCurrentProcess ( ) - > machine - > openFiles . count ( filename ) ) {
g_simulator . getCurrentProcess ( ) - > machine - > openFiles . erase ( filename ) ;
g_simulator . getCurrentProcess ( ) - > machine - > deletingFiles . insert ( filename ) ;
}
2017-05-26 04:48:44 +08:00
if ( mustBeDurable | | g_random - > random01 ( ) < 0.5 ) {
2017-08-26 01:12:58 +08:00
state ISimulator : : ProcessInfo * currentProcess = g_simulator . getCurrentProcess ( ) ;
state int currentTaskID = g_network - > getCurrentTask ( ) ;
2018-08-11 04:57:10 +08:00
// Hop to the machine context so the delete survives a process kill.
wait ( g_simulator . onMachine ( currentProcess ) ) ;
2017-08-26 01:12:58 +08:00
try {
2018-08-11 04:57:10 +08:00
wait ( : : delay ( 0.05 * g_random - > random01 ( ) ) ) ;
2017-08-26 01:12:58 +08:00
if ( ! currentProcess - > rebooting ) {
auto f = IAsyncFileSystem : : filesystem ( self - > net2 ) - > deleteFile ( filename , false ) ;
ASSERT ( f . isReady ( ) ) ;
2018-08-11 04:57:10 +08:00
wait ( : : delay ( 0.05 * g_random - > random01 ( ) ) ) ;
2017-08-26 01:12:58 +08:00
TEST ( true ) ; // Simulated durable delete
}
2018-08-11 04:57:10 +08:00
// Return to the caller's process and priority before completing.
wait ( g_simulator . onProcess ( currentProcess , currentTaskID ) ) ;
2017-08-26 01:12:58 +08:00
return Void ( ) ;
} catch ( Error & e ) {
state Error err = e ;
2018-08-11 04:57:10 +08:00
wait ( g_simulator . onProcess ( currentProcess , currentTaskID ) ) ;
2017-08-26 01:12:58 +08:00
throw err ;
2017-05-26 04:48:44 +08:00
}
} else {
TEST ( true ) ; // Simulated non-durable delete
return Void ( ) ;
}
}
// The core simulation scheduler: repeatedly pops the earliest Task from the
// priority queue and executes it, until stop() sets isStopped.
ACTOR static Future < Void > runLoop ( Sim2 * self ) {
state ISimulator : : ProcessInfo * callingMachine = self - > currentProcess ;
while ( ! self - > isStopped ) {
2018-08-11 04:57:10 +08:00
// Let the underlying Net2 reactor make progress between simulated tasks.
wait ( self - > net2 - > yield ( TaskDefaultYield ) ) ;
2017-05-26 04:48:44 +08:00
self - > mutex . enter ( ) ;
if ( self - > tasks . size ( ) = = 0 ) {
// The simulation must never run out of scheduled work.
self - > mutex . leave ( ) ;
ASSERT ( false ) ;
}
//if (!randLog/* && now() >= 32.0*/)
// randLog = fopen("randLog.txt", "wt");
Task t = std : : move ( self - > tasks . top ( ) ) ; // Unfortunately still a copy under gcc where .top() returns const&
self - > currentTaskID = t . taskID ;
self - > tasks . pop ( ) ;
self - > mutex . leave ( ) ;
self - > execTask ( t ) ;
self - > yielded = false ;
}
// Restore the caller's process context before shutting down the reactor.
self - > currentProcess = callingMachine ;
self - > net2 - > stop ( ) ;
return Void ( ) ;
}
// Drives the whole simulation: starts runLoop(), then runs the blocking
// Net2 reactor until runLoop() stops it.
ACTOR Future < Void > _run ( Sim2 * self ) {
Future < Void > loopFuture = self - > runLoop ( self ) ;
self - > net2 - > run ( ) ;
2018-08-11 04:57:10 +08:00
wait ( loopFuture ) ;
2017-05-26 04:48:44 +08:00
return Void ( ) ;
}
// Implement ISimulator interface
virtual void run() {
	// Fire-and-forget: the _run actor keeps itself alive until the
	// simulation is stopped.
	_run(this);
}
2019-02-28 16:09:53 +08:00
// Creates (or re-creates after reboot) a simulated process: registers it with
// its machine, builds its listeners and address-map entries, and initializes
// its per-process globals.  Returns the new ProcessInfo (owned by the sim).
virtual ProcessInfo * newProcess ( const char * name , IPAddress ip , uint16_t port , uint16_t listenPerProcess ,
LocalityData locality , ProcessClass startingClass , const char * dataFolder ,
const char * coordinationFolder ) {
2019-01-19 07:42:48 +08:00
ASSERT ( locality . machineId ( ) . present ( ) ) ;
// Find or create the MachineInfo this process belongs to.
MachineInfo & machine = machines [ locality . machineId ( ) . get ( ) ] ;
if ( ! machine . machineId . present ( ) )
machine . machineId = locality . machineId ( ) ;
2017-05-26 04:48:44 +08:00
// Sanity-check against the machine's existing processes: same machineId,
// and no port collisions.
for ( int i = 0 ; i < machine . processes . size ( ) ; i + + ) {
2019-01-19 07:42:48 +08:00
if ( machine . processes [ i ] - > locality . machineId ( ) ! = locality . machineId ( ) ) { // SOMEDAY: compute ip from locality to avoid this check
2019-02-28 16:09:53 +08:00
TraceEvent ( " Sim2Mismatch " )
. detail ( " IP " , format ( " %s " , ip . toString ( ) . c_str ( ) ) )
2019-03-15 09:40:28 +08:00
. detail ( " MachineId " , locality . machineId ( ) )
2019-02-28 16:09:53 +08:00
. detail ( " NewName " , name )
2019-03-15 09:40:28 +08:00
. detail ( " ExistingMachineId " , machine . processes [ i ] - > locality . machineId ( ) )
2019-02-28 16:09:53 +08:00
. detail ( " ExistingName " , machine . processes [ i ] - > name ) ;
2017-05-26 04:48:44 +08:00
ASSERT ( false ) ;
}
ASSERT ( machine . processes [ i ] - > address . port ! = port ) ;
}
// This is for async operations on non-durable files.
// These files must live on after process kills for sim purposes.
if ( machine . machineProcess = = 0 ) {
NetworkAddress machineAddress ( ip , 0 , false , false ) ;
2018-12-07 03:48:50 +08:00
machine . machineProcess = new ProcessInfo ( " Machine " , locality , startingClass , { machineAddress } , this , " " , " " ) ;
2017-05-26 04:48:44 +08:00
machine . machineProcess - > machine = & machine ;
}
2018-12-07 03:48:50 +08:00
NetworkAddressList addresses ;
2019-03-24 08:54:46 +08:00
addresses . address = NetworkAddress ( ip , port , true , false ) ;
if ( listenPerProcess = = 2 ) {
addresses . secondaryAddress = NetworkAddress ( ip , port + 1 , true , false ) ;
2018-12-07 03:48:50 +08:00
}
ProcessInfo * m = new ProcessInfo ( name , locality , startingClass , addresses , this , dataFolder , coordinationFolder ) ;
// Pre-create one listener per listening port and register the process in
// the global address map (see listen()).
for ( int processPort = port ; processPort < port + listenPerProcess ; + + processPort ) {
NetworkAddress address ( ip , processPort , true , false ) ; // SOMEDAY see above about becoming SSL!
m - > listenerMap [ address ] = Reference < IListener > ( new Sim2Listener ( m , address ) ) ;
addressMap [ address ] = m ;
}
2017-05-26 04:48:44 +08:00
m - > machine = & machine ;
machine . processes . push_back ( m ) ;
2019-03-24 08:54:46 +08:00
// The address is live again; carry over any exclusion/clear state.
currentlyRebootingProcesses . erase ( addresses . address ) ;
m - > excluded = g_simulator . isExcluded ( addresses . address ) ;
m - > cleared = g_simulator . isCleared ( addresses . address ) ;
2017-05-26 04:48:44 +08:00
m - > setGlobal ( enTDMetrics , ( flowGlobalType ) & m - > tdmetrics ) ;
m - > setGlobal ( enNetworkConnections , ( flowGlobalType ) m - > network ) ;
2017-06-16 08:40:19 +08:00
m - > setGlobal ( enASIOTimedOut , ( flowGlobalType ) false ) ;
2017-05-26 04:48:44 +08:00
2019-03-15 09:40:28 +08:00
TraceEvent ( " NewMachine " ) . detail ( " Name " , name ) . detail ( " Address " , m - > address ) . detail ( " MachineId " , m - > locality . machineId ( ) ) . detail ( " Excluded " , m - > excluded ) . detail ( " Cleared " , m - > cleared ) ;
2017-05-26 04:48:44 +08:00
// FIXME: Sometimes, connections to/from this process will explicitly close
return m ;
}
2017-05-27 05:20:11 +08:00
virtual bool isAvailable ( ) const
{
2018-07-17 06:56:43 +08:00
std : : vector < ProcessInfo * > processesLeft , processesDead ;
2017-05-27 05:20:11 +08:00
for ( auto processInfo : getAllProcesses ( ) ) {
2017-06-20 07:48:15 +08:00
if ( processInfo - > isAvailableClass ( ) ) {
2018-07-17 06:56:43 +08:00
if ( processInfo - > isExcluded ( ) | | processInfo - > isCleared ( ) | | ! processInfo - > isAvailable ( ) ) {
2017-09-13 04:33:13 +08:00
processesDead . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
} else {
2017-06-20 07:48:15 +08:00
processesLeft . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
}
2017-05-27 05:20:11 +08:00
}
}
return canKillProcesses ( processesLeft , processesDead , KillInstantly , NULL ) ;
}
2017-05-26 04:48:44 +08:00
2018-07-17 01:06:57 +08:00
// Returns true if the given datacenter's currently dead (excluded, cleared,
// or unavailable) processes are enough to break its TLog or storage
// replication policy.
virtual bool datacenterDead ( Optional < Standalone < StringRef > > dcId ) const
{
if ( ! dcId . present ( ) ) {
return false ;
}
LocalityGroup primaryProcessesLeft , primaryProcessesDead ;
std : : vector < LocalityData > primaryLocalitiesDead , primaryLocalitiesLeft ;
// Partition this DC's role-capable processes into dead and alive sets.
for ( auto processInfo : getAllProcesses ( ) ) {
if ( processInfo - > isAvailableClass ( ) & & processInfo - > locality . dcId ( ) = = dcId ) {
2018-07-17 06:56:43 +08:00
if ( processInfo - > isExcluded ( ) | | processInfo - > isCleared ( ) | | ! processInfo - > isAvailable ( ) ) {
primaryProcessesDead . add ( processInfo - > locality ) ;
primaryLocalitiesDead . push_back ( processInfo - > locality ) ;
} else {
primaryProcessesLeft . add ( processInfo - > locality ) ;
primaryLocalitiesLeft . push_back ( processInfo - > locality ) ;
}
2018-07-17 01:06:57 +08:00
}
}
std : : vector < LocalityData > badCombo ;
// NOTE(review): deadSet.validate(policy) is treated throughout this file as
// "the dead set is large enough to break 'policy'" -- same convention as in
// canKillProcesses below.
bool primaryTLogsDead = tLogWriteAntiQuorum ? ! validateAllCombinations ( badCombo , primaryProcessesDead , tLogPolicy , primaryLocalitiesLeft , tLogWriteAntiQuorum , false ) : primaryProcessesDead . validate ( tLogPolicy ) ;
if ( usableRegions > 1 & & remoteTLogPolicy & & ! primaryTLogsDead ) {
primaryTLogsDead = primaryProcessesDead . validate ( remoteTLogPolicy ) ;
}
return primaryTLogsDead | | primaryProcessesDead . validate ( storagePolicy ) ;
}
2017-05-26 04:48:44 +08:00
// The following function will determine if the specified configuration of available and dead processes can allow the cluster to survive
2017-05-27 05:20:11 +08:00
// Decides whether the cluster survives applying kill type 'kt' given the
// resulting 'availableProcesses' and 'deadProcesses'.  Returns true if it
// survives; otherwise downgrades the kill (reported via *newKillType) so the
// workload can proceed with a reboot instead of an unrecoverable kill.
virtual bool canKillProcesses ( std : : vector < ProcessInfo * > const & availableProcesses , std : : vector < ProcessInfo * > const & deadProcesses , KillType kt , KillType * newKillType ) const
2017-05-26 04:48:44 +08:00
{
2017-10-20 06:36:32 +08:00
bool canSurvive = true ;
int nQuorum = ( ( desiredCoordinators + 1 ) / 2 ) * 2 - 1 ;
2017-06-20 07:48:15 +08:00
2017-10-20 06:36:32 +08:00
KillType newKt = kt ;
2017-05-26 04:48:44 +08:00
// Only destructive kill types need policy analysis; plain reboots always survive.
if ( ( kt = = KillInstantly ) | | ( kt = = InjectFaults ) | | ( kt = = RebootAndDelete ) | | ( kt = = RebootProcessAndDelete ) )
{
2017-10-20 06:36:32 +08:00
LocalityGroup primaryProcessesLeft , primaryProcessesDead ;
LocalityGroup primarySatelliteProcessesLeft , primarySatelliteProcessesDead ;
LocalityGroup remoteProcessesLeft , remoteProcessesDead ;
LocalityGroup remoteSatelliteProcessesLeft , remoteSatelliteProcessesDead ;
std : : vector < LocalityData > primaryLocalitiesDead , primaryLocalitiesLeft ;
std : : vector < LocalityData > primarySatelliteLocalitiesDead , primarySatelliteLocalitiesLeft ;
std : : vector < LocalityData > remoteLocalitiesDead , remoteLocalitiesLeft ;
std : : vector < LocalityData > remoteSatelliteLocalitiesDead , remoteSatelliteLocalitiesLeft ;
std : : vector < LocalityData > badCombo ;
std : : set < Optional < Standalone < StringRef > > > uniqueMachines ;
2018-03-07 10:38:05 +08:00
// Bucket every process by region/satellite.  Single-region configurations
// (no primaryDcId) put everything in the "primary" buckets.
if ( ! primaryDcId . present ( ) ) {
2017-10-20 06:36:32 +08:00
for ( auto processInfo : availableProcesses ) {
primaryProcessesLeft . add ( processInfo - > locality ) ;
primaryLocalitiesLeft . push_back ( processInfo - > locality ) ;
2019-01-19 07:42:48 +08:00
uniqueMachines . insert ( processInfo - > locality . zoneId ( ) ) ;
2017-10-20 06:36:32 +08:00
}
for ( auto processInfo : deadProcesses ) {
primaryProcessesDead . add ( processInfo - > locality ) ;
primaryLocalitiesDead . push_back ( processInfo - > locality ) ;
}
} else {
for ( auto processInfo : availableProcesses ) {
2019-01-19 07:42:48 +08:00
uniqueMachines . insert ( processInfo - > locality . zoneId ( ) ) ;
2017-10-20 06:36:32 +08:00
if ( processInfo - > locality . dcId ( ) = = primaryDcId ) {
primaryProcessesLeft . add ( processInfo - > locality ) ;
primaryLocalitiesLeft . push_back ( processInfo - > locality ) ;
} else if ( processInfo - > locality . dcId ( ) = = remoteDcId ) {
remoteProcessesLeft . add ( processInfo - > locality ) ;
remoteLocalitiesLeft . push_back ( processInfo - > locality ) ;
} else if ( std : : find ( primarySatelliteDcIds . begin ( ) , primarySatelliteDcIds . end ( ) , processInfo - > locality . dcId ( ) ) ! = primarySatelliteDcIds . end ( ) ) {
primarySatelliteProcessesLeft . add ( processInfo - > locality ) ;
primarySatelliteLocalitiesLeft . push_back ( processInfo - > locality ) ;
} else if ( std : : find ( remoteSatelliteDcIds . begin ( ) , remoteSatelliteDcIds . end ( ) , processInfo - > locality . dcId ( ) ) ! = remoteSatelliteDcIds . end ( ) ) {
remoteSatelliteProcessesLeft . add ( processInfo - > locality ) ;
remoteSatelliteLocalitiesLeft . push_back ( processInfo - > locality ) ;
}
}
for ( auto processInfo : deadProcesses ) {
if ( processInfo - > locality . dcId ( ) = = primaryDcId ) {
primaryProcessesDead . add ( processInfo - > locality ) ;
primaryLocalitiesDead . push_back ( processInfo - > locality ) ;
} else if ( processInfo - > locality . dcId ( ) = = remoteDcId ) {
remoteProcessesDead . add ( processInfo - > locality ) ;
remoteLocalitiesDead . push_back ( processInfo - > locality ) ;
} else if ( std : : find ( primarySatelliteDcIds . begin ( ) , primarySatelliteDcIds . end ( ) , processInfo - > locality . dcId ( ) ) ! = primarySatelliteDcIds . end ( ) ) {
primarySatelliteProcessesDead . add ( processInfo - > locality ) ;
primarySatelliteLocalitiesDead . push_back ( processInfo - > locality ) ;
} else if ( std : : find ( remoteSatelliteDcIds . begin ( ) , remoteSatelliteDcIds . end ( ) , processInfo - > locality . dcId ( ) ) ! = remoteSatelliteDcIds . end ( ) ) {
remoteSatelliteProcessesDead . add ( processInfo - > locality ) ;
remoteSatelliteLocalitiesDead . push_back ( processInfo - > locality ) ;
}
}
2017-05-26 04:48:44 +08:00
}
2017-10-20 06:36:32 +08:00
// 'tooManyDead': the dead set alone can satisfy (and thus break) a policy.
// 'notEnoughLeft': the remaining set can no longer satisfy a policy.
bool tooManyDead = false ;
bool notEnoughLeft = false ;
bool primaryTLogsDead = tLogWriteAntiQuorum ? ! validateAllCombinations ( badCombo , primaryProcessesDead , tLogPolicy , primaryLocalitiesLeft , tLogWriteAntiQuorum , false ) : primaryProcessesDead . validate ( tLogPolicy ) ;
2018-06-18 10:31:15 +08:00
if ( usableRegions > 1 & & remoteTLogPolicy & & ! primaryTLogsDead ) {
2018-06-09 06:28:44 +08:00
primaryTLogsDead = primaryProcessesDead . validate ( remoteTLogPolicy ) ;
}
2017-10-20 06:36:32 +08:00
2018-03-07 10:38:05 +08:00
if ( ! primaryDcId . present ( ) ) {
2017-10-20 06:36:32 +08:00
// Single-region: only the primary buckets matter.
tooManyDead = primaryTLogsDead | | primaryProcessesDead . validate ( storagePolicy ) ;
notEnoughLeft = ! primaryProcessesLeft . validate ( tLogPolicy ) | | ! primaryProcessesLeft . validate ( storagePolicy ) ;
} else {
bool remoteTLogsDead = tLogWriteAntiQuorum ? ! validateAllCombinations ( badCombo , remoteProcessesDead , tLogPolicy , remoteLocalitiesLeft , tLogWriteAntiQuorum , false ) : remoteProcessesDead . validate ( tLogPolicy ) ;
2018-06-18 10:31:15 +08:00
if ( usableRegions > 1 & & remoteTLogPolicy & & ! remoteTLogsDead ) {
2018-06-09 06:28:44 +08:00
remoteTLogsDead = remoteProcessesDead . validate ( remoteTLogPolicy ) ;
}
2017-10-20 06:36:32 +08:00
if ( ! hasSatelliteReplication ) {
2018-06-18 10:31:15 +08:00
if ( usableRegions > 1 ) {
2018-03-07 10:38:05 +08:00
// With two usable regions, data survives if either region's storage survives.
tooManyDead = primaryTLogsDead | | remoteTLogsDead | | ( primaryProcessesDead . validate ( storagePolicy ) & & remoteProcessesDead . validate ( storagePolicy ) ) ;
2018-07-06 12:36:09 +08:00
notEnoughLeft = ! primaryProcessesLeft . validate ( tLogPolicy ) | | ! primaryProcessesLeft . validate ( remoteTLogPolicy ) | | ! primaryProcessesLeft . validate ( storagePolicy ) | | ! remoteProcessesLeft . validate ( tLogPolicy ) | | ! remoteProcessesLeft . validate ( remoteTLogPolicy ) | | ! remoteProcessesLeft . validate ( storagePolicy ) ;
2018-03-07 10:38:05 +08:00
} else {
tooManyDead = primaryTLogsDead | | remoteTLogsDead | | primaryProcessesDead . validate ( storagePolicy ) | | remoteProcessesDead . validate ( storagePolicy ) ;
2018-07-06 12:36:09 +08:00
notEnoughLeft = ! primaryProcessesLeft . validate ( tLogPolicy ) | | ! primaryProcessesLeft . validate ( storagePolicy ) | | ! remoteProcessesLeft . validate ( tLogPolicy ) | | ! remoteProcessesLeft . validate ( storagePolicy ) ;
2018-03-07 10:38:05 +08:00
}
2017-10-20 06:36:32 +08:00
} else {
2018-07-10 13:01:46 +08:00
// Satellite replication: satellite TLogs can cover for dead region TLogs.
bool primarySatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? ! validateAllCombinations ( badCombo , primarySatelliteProcessesDead , satelliteTLogPolicyFallback , primarySatelliteLocalitiesLeft , satelliteTLogWriteAntiQuorumFallback , false ) : primarySatelliteProcessesDead . validate ( satelliteTLogPolicyFallback ) ;
bool remoteSatelliteTLogsDead = satelliteTLogWriteAntiQuorumFallback ? ! validateAllCombinations ( badCombo , remoteSatelliteProcessesDead , satelliteTLogPolicyFallback , remoteSatelliteLocalitiesLeft , satelliteTLogWriteAntiQuorumFallback , false ) : remoteSatelliteProcessesDead . validate ( satelliteTLogPolicyFallback ) ;
2017-10-20 06:36:32 +08:00
2018-07-06 12:36:09 +08:00
if ( usableRegions > 1 ) {
notEnoughLeft = ! primaryProcessesLeft . validate ( tLogPolicy ) | | ! primaryProcessesLeft . validate ( remoteTLogPolicy ) | | ! primaryProcessesLeft . validate ( storagePolicy ) | | ! primarySatelliteProcessesLeft . validate ( satelliteTLogPolicy ) | | ! remoteProcessesLeft . validate ( tLogPolicy ) | | ! remoteProcessesLeft . validate ( remoteTLogPolicy ) | | ! remoteProcessesLeft . validate ( storagePolicy ) | | ! remoteSatelliteProcessesLeft . validate ( satelliteTLogPolicy ) ;
} else {
notEnoughLeft = ! primaryProcessesLeft . validate ( tLogPolicy ) | | ! primaryProcessesLeft . validate ( storagePolicy ) | | ! primarySatelliteProcessesLeft . validate ( satelliteTLogPolicy ) | | ! remoteProcessesLeft . validate ( tLogPolicy ) | | ! remoteProcessesLeft . validate ( storagePolicy ) | | ! remoteSatelliteProcessesLeft . validate ( satelliteTLogPolicy ) ;
}
2018-07-06 05:04:42 +08:00
if ( usableRegions > 1 & & allowLogSetKills ) {
2018-07-05 04:22:32 +08:00
tooManyDead = ( primaryTLogsDead & & primarySatelliteTLogsDead ) | | ( remoteTLogsDead & & remoteSatelliteTLogsDead ) | | ( primaryTLogsDead & & remoteTLogsDead ) | | ( primaryProcessesDead . validate ( storagePolicy ) & & remoteProcessesDead . validate ( storagePolicy ) ) ;
2018-03-07 10:38:05 +08:00
} else {
2018-07-05 04:22:32 +08:00
tooManyDead = primaryTLogsDead | | remoteTLogsDead | | primaryProcessesDead . validate ( storagePolicy ) | | remoteProcessesDead . validate ( storagePolicy ) ;
2018-03-07 10:38:05 +08:00
}
2017-10-20 06:36:32 +08:00
}
2017-05-26 04:48:44 +08:00
}
2017-10-20 06:36:32 +08:00
2017-05-26 04:48:44 +08:00
// Reboot if dead machines do fulfill policies
2017-10-20 06:36:32 +08:00
if ( tooManyDead ) {
2017-05-26 04:48:44 +08:00
newKt = Reboot ;
canSurvive = false ;
2018-06-09 02:11:08 +08:00
TraceEvent ( " KillChanged " ) . detail ( " KillType " , kt ) . detail ( " NewKillType " , newKt ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " Reason " , " tLogPolicy validates against dead processes. " ) ;
2017-05-26 04:48:44 +08:00
}
// Reboot and Delete if remaining machines do NOT fulfill policies
2018-07-05 04:22:32 +08:00
else if ( ( kt < RebootAndDelete ) & & notEnoughLeft ) {
newKt = RebootAndDelete ;
2017-05-26 04:48:44 +08:00
canSurvive = false ;
2018-06-09 02:11:08 +08:00
TraceEvent ( " KillChanged " ) . detail ( " KillType " , kt ) . detail ( " NewKillType " , newKt ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " Reason " , " tLogPolicy does not validates against remaining processes. " ) ;
2017-05-26 04:48:44 +08:00
}
// Also downgrade if too few machines would remain to re-elect coordinators.
2018-07-05 04:22:32 +08:00
else if ( ( kt < RebootAndDelete ) & & ( nQuorum > uniqueMachines . size ( ) ) ) {
newKt = RebootAndDelete ;
2017-06-20 07:48:15 +08:00
canSurvive = false ;
2018-06-09 02:11:08 +08:00
TraceEvent ( " KillChanged " ) . detail ( " KillType " , kt ) . detail ( " NewKillType " , newKt ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) . detail ( " Quorum " , nQuorum ) . detail ( " Machines " , uniqueMachines . size ( ) ) . detail ( " Reason " , " Not enough unique machines to perform auto configuration of coordinators. " ) ;
2017-06-20 07:48:15 +08:00
}
2017-05-26 04:48:44 +08:00
else {
2018-06-09 02:11:08 +08:00
TraceEvent ( " CanSurviveKills " ) . detail ( " KillType " , kt ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) . detail ( " Quorum " , nQuorum ) . detail ( " Machines " , uniqueMachines . size ( ) ) ;
2017-05-26 04:48:44 +08:00
}
}
if ( newKillType ) * newKillType = newKt ;
return canSurvive ;
}
// Permanently removes a process from the simulation: records it as rebooting
// (so its address can later be reused by newProcess), unlinks it from its
// machine's process list, and fails it instantly.
virtual void destroyProcess ( ISimulator : : ProcessInfo * p ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " ProcessDestroyed " ) . detail ( " Name " , p - > name ) . detail ( " Address " , p - > address ) . detail ( " MachineId " , p - > locality . machineId ( ) ) ;
2017-05-26 04:48:44 +08:00
currentlyRebootingProcesses . insert ( std : : pair < NetworkAddress , ProcessInfo * > ( p - > address , p ) ) ;
2019-01-19 07:42:48 +08:00
std : : vector < ProcessInfo * > & processes = machines [ p - > locality . machineId ( ) . get ( ) ] . processes ;
2017-05-26 04:48:44 +08:00
// Swap-and-pop: move p to the back of its machine's list, then drop it.
if ( p ! = processes . back ( ) ) {
auto it = std : : find ( processes . begin ( ) , processes . end ( ) , p ) ;
std : : swap ( * it , processes . back ( ) ) ;
}
processes . pop_back ( ) ;
killProcess_internal ( p , KillInstantly ) ;
}
// Applies a kill to one process ('machine' here is a process, despite the
// name).  KillInstantly marks it failed; InjectFaults leaves it running but
// arms random fault injection.  Any other kill type is invalid here.
void killProcess_internal ( ProcessInfo * machine , KillType kt ) {
2017-05-27 08:43:28 +08:00
TEST ( true ) ; // Simulated machine was killed with any kill type
2017-05-26 04:48:44 +08:00
TEST ( kt = = KillInstantly ) ; // Simulated machine was killed instantly
TEST ( kt = = InjectFaults ) ; // Simulated machine was killed with faults
if ( kt = = KillInstantly ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " FailMachine " ) . detail ( " Name " , machine - > name ) . detail ( " Address " , machine - > address ) . detail ( " ZoneId " , machine - > locality . zoneId ( ) ) . detail ( " Process " , machine - > toString ( ) ) . detail ( " Rebooting " , machine - > rebooting ) . detail ( " Protected " , protectedAddresses . count ( machine - > address ) ) . backtrace ( ) ;
2017-05-26 04:48:44 +08:00
// This will remove all the "tracked" messages that came from the machine being killed
latestEventCache . clear ( ) ;
machine - > failed = true ;
} else if ( kt = = InjectFaults ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " FaultMachine " ) . detail ( " Name " , machine - > name ) . detail ( " Address " , machine - > address ) . detail ( " ZoneId " , machine - > locality . zoneId ( ) ) . detail ( " Process " , machine - > toString ( ) ) . detail ( " Rebooting " , machine - > rebooting ) . detail ( " Protected " , protectedAddresses . count ( machine - > address ) ) . backtrace ( ) ;
2017-05-26 04:48:44 +08:00
// Enable the global fault-injection hook and give this process its own
// random seed and injection probabilities.
should_inject_fault = simulator_should_inject_fault ;
machine - > fault_injection_r = g_random - > randomUniqueID ( ) . first ( ) ;
machine - > fault_injection_p1 = 0.1 ;
machine - > fault_injection_p2 = g_random - > random01 ( ) ;
} else {
ASSERT ( false ) ;
}
2017-05-27 05:20:11 +08:00
// A protected process may only be killed while it is rebooting.
ASSERT ( ! protectedAddresses . count ( machine - > address ) | | machine - > rebooting ) ;
2017-05-26 04:48:44 +08:00
}
// Reboots one process.  A protected process is never rebooted with deletion;
// that kill is downgraded to a plain reboot first.
virtual void rebootProcess(ProcessInfo* process, KillType kt) {
	bool isProtected = protectedAddresses.count(process->address) != 0;
	if (isProtected && kt == RebootProcessAndDelete) {
		TraceEvent("RebootChanged").detail("ZoneId", process->locality.describeZone()).detail("KillType", RebootProcess).detail("OrigKillType", kt).detail("Reason", "Protected process");
		kt = RebootProcess;
	}
	doReboot(process, kt);
}
// Reboots processes in the given zone: every non-rebooting process there if
// 'allProcesses', otherwise one process chosen at random from that zone.
virtual void rebootProcess ( Optional < Standalone < StringRef > > zoneId , bool allProcesses ) {
if ( allProcesses ) {
auto processes = getAllProcesses ( ) ;
for ( int i = 0 ; i < processes . size ( ) ; i + + )
if ( processes [ i ] - > locality . zoneId ( ) = = zoneId & & ! processes [ i ] - > rebooting )
doReboot ( processes [ i ] , RebootProcess ) ;
} else {
auto processes = getAllProcesses ( ) ;
// Filter down to rebootable processes in this zone, then pick one.
for ( int i = 0 ; i < processes . size ( ) ; i + + ) {
if ( processes [ i ] - > locality . zoneId ( ) ! = zoneId | | processes [ i ] - > rebooting ) {
2018-08-02 09:09:54 +08:00
swapAndPop ( & processes , i - - ) ;
2017-05-26 04:48:44 +08:00
}
}
if ( processes . size ( ) )
doReboot ( g_random - > randomChoice ( processes ) , RebootProcess ) ;
}
}
// Kill one process. Reboot-style kill types (kt >= RebootAndDelete) are
// ignored here; only destructive kill types reach killProcess_internal.
virtual void killProcess( ProcessInfo* machine, KillType kt ) {
	TraceEvent("AttemptingKillProcess");
	if( kt >= RebootAndDelete )
		return;
	killProcess_internal( machine, kt );
}
// Kill every process on the machine hosting the given network address.
// As with killProcess, reboot-style kill types are ignored.
virtual void killInterface( NetworkAddress address, KillType kt ) {
	if( kt >= RebootAndDelete )
		return;
	// Map address -> process -> owning machine, then kill that machine's processes.
	auto& victims = machines[ addressMap[address]->locality.machineId() ].processes;
	for( auto victim : victims )
		killProcess_internal( victim, kt );
}
2019-01-19 07:42:48 +08:00
// Kill every machine that hosts a process in the given zone.
// Returns true if at least one machine kill succeeded; ktFinal (optional)
// receives the kill type actually applied by the last killMachine call.
virtual bool killZone( Optional<Standalone<StringRef>> zoneId, KillType kt, bool forceKill, KillType* ktFinal ) {
	// Collect the distinct machine ids hosting any process in this zone.
	std::set<Optional<Standalone<StringRef>>> zoneMachines;
	auto processes = getAllProcesses();
	for( auto& process : processes ) {
		if( process->locality.zoneId() == zoneId )
			zoneMachines.insert( process->locality.machineId() );
	}
	bool anyKilled = false;
	for( auto& machineId : zoneMachines ) {
		if( killMachine( machineId, kt, forceKill, ktFinal ) )
			anyKilled = true;
	}
	return anyKilled;
}
virtual bool killMachine ( Optional < Standalone < StringRef > > machineId , KillType kt , bool forceKill , KillType * ktFinal ) {
2017-05-26 04:48:44 +08:00
auto ktOrig = kt ;
2017-06-20 07:48:15 +08:00
TEST ( true ) ; // Trying to killing a machine
2017-05-27 08:43:28 +08:00
TEST ( kt = = KillInstantly ) ; // Trying to kill instantly
TEST ( kt = = InjectFaults ) ; // Trying to kill by injecting faults
2017-05-26 04:48:44 +08:00
if ( speedUpSimulation & & ! forceKill ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " AbortedKill " ) . detail ( " MachineId " , machineId ) . detail ( " Reason " , " Unforced kill within speedy simulation. " ) . backtrace ( ) ;
2017-10-05 18:07:20 +08:00
if ( ktFinal ) * ktFinal = None ;
2017-05-26 04:48:44 +08:00
return false ;
}
int processesOnMachine = 0 ;
2017-05-27 08:43:28 +08:00
KillType originalKt = kt ;
2017-05-26 04:48:44 +08:00
// Reboot if any of the processes are protected and count the number of processes not rebooting
2019-01-19 07:42:48 +08:00
for ( auto & process : machines [ machineId ] . processes ) {
2017-05-26 04:48:44 +08:00
if ( protectedAddresses . count ( process - > address ) )
kt = Reboot ;
if ( ! process - > rebooting )
processesOnMachine + + ;
}
2017-06-20 07:48:15 +08:00
// Do nothing, if no processes to kill
if ( processesOnMachine = = 0 ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " AbortedKill " ) . detail ( " MachineId " , machineId ) . detail ( " Reason " , " The target had no processes running. " ) . detail ( " Processes " , processesOnMachine ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . backtrace ( ) ;
2017-10-05 18:07:20 +08:00
if ( ktFinal ) * ktFinal = None ;
2017-06-20 07:48:15 +08:00
return false ;
}
2017-05-26 04:48:44 +08:00
// Check if machine can be removed, if requested
2018-09-18 09:32:39 +08:00
if ( ! forceKill & & ( ( kt = = KillInstantly ) | | ( kt = = InjectFaults ) | | ( kt = = RebootAndDelete ) | | ( kt = = RebootProcessAndDelete ) ) )
2017-05-26 04:48:44 +08:00
{
2018-02-27 05:13:37 +08:00
std : : vector < ProcessInfo * > processesLeft , processesDead ;
2017-10-04 01:48:16 +08:00
int protectedWorker = 0 , unavailable = 0 , excluded = 0 , cleared = 0 ;
2017-05-26 04:48:44 +08:00
2018-07-06 12:13:56 +08:00
for ( auto processInfo : getAllProcesses ( ) ) {
if ( processInfo - > isAvailableClass ( ) ) {
if ( processInfo - > isExcluded ( ) ) {
processesDead . push_back ( processInfo ) ;
excluded + + ;
2017-05-26 04:48:44 +08:00
}
2018-07-06 12:13:56 +08:00
else if ( processInfo - > isCleared ( ) ) {
processesDead . push_back ( processInfo ) ;
cleared + + ;
}
else if ( ! processInfo - > isAvailable ( ) ) {
processesDead . push_back ( processInfo ) ;
unavailable + + ;
}
else if ( protectedAddresses . count ( processInfo - > address ) ) {
processesLeft . push_back ( processInfo ) ;
protectedWorker + + ;
}
2019-01-19 07:42:48 +08:00
else if ( processInfo - > locality . machineId ( ) ! = machineId ) {
2018-07-06 12:13:56 +08:00
processesLeft . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
} else {
2018-07-06 12:13:56 +08:00
processesDead . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
}
2017-05-26 04:48:44 +08:00
}
}
2017-05-27 05:20:11 +08:00
if ( ! canKillProcesses ( processesLeft , processesDead , kt , & kt ) ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " ChangedKillMachine " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " OrigKillType " , ktOrig ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " TotalProcesses " , machines . size ( ) ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . detail ( " Protected " , protectedWorker ) . detail ( " Unavailable " , unavailable ) . detail ( " Excluded " , excluded ) . detail ( " Cleared " , cleared ) . detail ( " ProtectedTotal " , protectedAddresses . size ( ) ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) ;
2017-05-26 04:48:44 +08:00
}
else if ( ( kt = = KillInstantly ) | | ( kt = = InjectFaults ) ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadMachine " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " TotalProcesses " , machines . size ( ) ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) ;
2017-05-27 05:20:11 +08:00
for ( auto process : processesLeft ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadMachineSurvivors " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " SurvivingProcess " , process - > toString ( ) ) ;
2017-05-27 05:20:11 +08:00
}
2017-06-20 07:48:15 +08:00
for ( auto process : processesDead ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadMachineVictims " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " VictimProcess " , process - > toString ( ) ) ;
2017-06-20 07:48:15 +08:00
}
2017-05-26 04:48:44 +08:00
}
else {
2019-03-15 09:40:28 +08:00
TraceEvent ( " ClearMachine " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " TotalProcesses " , machines . size ( ) ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) ;
2017-06-20 07:48:15 +08:00
for ( auto process : processesLeft ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " ClearMachineSurvivors " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " SurvivingProcess " , process - > toString ( ) ) ;
2017-06-20 07:48:15 +08:00
}
for ( auto process : processesDead ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " ClearMachineVictims " ) . detail ( " MachineId " , machineId ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " VictimProcess " , process - > toString ( ) ) ;
2017-06-20 07:48:15 +08:00
}
2017-05-26 04:48:44 +08:00
}
}
2017-05-27 08:43:28 +08:00
TEST ( originalKt ! = kt ) ; // Kill type was changed from requested to reboot.
2017-05-26 04:48:44 +08:00
// Check if any processes on machine are rebooting
if ( processesOnMachine ! = processesPerMachine & & kt > = RebootAndDelete ) {
TEST ( true ) ; //Attempted reboot, but the target did not have all of its processes running
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " AbortedKill " ) . detail ( " KillType " , kt ) . detail ( " MachineId " , machineId ) . detail ( " Reason " , " Machine processes does not match number of processes per machine " ) . detail ( " Processes " , processesOnMachine ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . backtrace ( ) ;
2017-10-05 18:07:20 +08:00
if ( ktFinal ) * ktFinal = None ;
2017-05-26 04:48:44 +08:00
return false ;
}
2017-06-20 07:48:15 +08:00
// Check if any processes on machine are rebooting
2018-07-05 04:22:32 +08:00
if ( processesOnMachine ! = processesPerMachine ) {
2017-06-20 07:48:15 +08:00
TEST ( true ) ; //Attempted reboot, but the target did not have all of its processes running
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " AbortedKill " ) . detail ( " KillType " , kt ) . detail ( " MachineId " , machineId ) . detail ( " Reason " , " Machine processes does not match number of processes per machine " ) . detail ( " Processes " , processesOnMachine ) . detail ( " ProcessesPerMachine " , processesPerMachine ) . backtrace ( ) ;
2017-10-05 18:07:20 +08:00
if ( ktFinal ) * ktFinal = None ;
2017-06-20 07:48:15 +08:00
return false ;
}
2019-03-15 09:40:28 +08:00
TraceEvent ( " KillMachine " ) . detail ( " MachineId " , machineId ) . detail ( " Kt " , kt ) . detail ( " KtOrig " , ktOrig ) . detail ( " KillableMachines " , processesOnMachine ) . detail ( " ProcessPerMachine " , processesPerMachine ) . detail ( " KillChanged " , kt ! = ktOrig ) ;
2018-07-05 04:22:32 +08:00
if ( kt < RebootAndDelete ) {
2019-01-19 07:42:48 +08:00
if ( kt = = InjectFaults & & machines [ machineId ] . machineProcess ! = nullptr )
killProcess_internal ( machines [ machineId ] . machineProcess , kt ) ;
for ( auto & process : machines [ machineId ] . processes ) {
2018-08-02 05:30:57 +08:00
TraceEvent ( " KillMachineProcess " ) . detail ( " KillType " , kt ) . detail ( " Process " , process - > toString ( ) ) . detail ( " StartingClass " , process - > startingClass . toString ( ) ) . detail ( " Failed " , process - > failed ) . detail ( " Excluded " , process - > excluded ) . detail ( " Cleared " , process - > cleared ) . detail ( " Rebooting " , process - > rebooting ) ;
2017-05-26 04:48:44 +08:00
if ( process - > startingClass ! = ProcessClass : : TesterClass )
killProcess_internal ( process , kt ) ;
}
}
2018-07-05 04:22:32 +08:00
else if ( kt = = Reboot | | kt = = RebootAndDelete ) {
2019-01-19 07:42:48 +08:00
for ( auto & process : machines [ machineId ] . processes ) {
2018-08-02 05:30:57 +08:00
TraceEvent ( " KillMachineProcess " ) . detail ( " KillType " , kt ) . detail ( " Process " , process - > toString ( ) ) . detail ( " StartingClass " , process - > startingClass . toString ( ) ) . detail ( " Failed " , process - > failed ) . detail ( " Excluded " , process - > excluded ) . detail ( " Cleared " , process - > cleared ) . detail ( " Rebooting " , process - > rebooting ) ;
2017-05-26 04:48:44 +08:00
if ( process - > startingClass ! = ProcessClass : : TesterClass )
doReboot ( process , kt ) ;
}
}
2017-09-27 02:15:39 +08:00
TEST ( kt = = RebootAndDelete ) ; // Resulted in a reboot and delete
TEST ( kt = = Reboot ) ; // Resulted in a reboot
TEST ( kt = = KillInstantly ) ; // Resulted in an instant kill
TEST ( kt = = InjectFaults ) ; // Resulted in a kill by injecting faults
2017-10-05 18:07:20 +08:00
if ( ktFinal ) * ktFinal = kt ;
2017-05-26 04:48:44 +08:00
return true ;
}
2018-09-18 09:32:39 +08:00
virtual bool killDataCenter ( Optional < Standalone < StringRef > > dcId , KillType kt , bool forceKill , KillType * ktFinal ) {
2017-05-26 04:48:44 +08:00
auto ktOrig = kt ;
auto processes = getAllProcesses ( ) ;
2019-01-19 07:42:48 +08:00
std : : map < Optional < Standalone < StringRef > > , int > datacenterMachines ;
2017-05-26 04:48:44 +08:00
int dcProcesses = 0 ;
// Switch to a reboot, if anything protected on machine
2017-08-29 02:25:37 +08:00
for ( auto & procRecord : processes ) {
auto processDcId = procRecord - > locality . dcId ( ) ;
2019-01-19 07:42:48 +08:00
auto processMachineId = procRecord - > locality . machineId ( ) ;
ASSERT ( processMachineId . present ( ) ) ;
2017-05-26 04:48:44 +08:00
if ( processDcId . present ( ) & & ( processDcId = = dcId ) ) {
2017-08-29 02:25:37 +08:00
if ( ( kt ! = Reboot ) & & ( protectedAddresses . count ( procRecord - > address ) ) ) {
2017-05-26 04:48:44 +08:00
kt = Reboot ;
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " DcKillChanged " ) . detail ( " DataCenter " , dcId ) . detail ( " KillType " , kt ) . detail ( " OrigKillType " , ktOrig )
2018-10-27 00:23:12 +08:00
. detail ( " Reason " , " Datacenter has protected process " ) . detail ( " ProcessAddress " , procRecord - > address ) . detail ( " Failed " , procRecord - > failed ) . detail ( " Rebooting " , procRecord - > rebooting ) . detail ( " Excluded " , procRecord - > excluded ) . detail ( " Cleared " , procRecord - > cleared ) . detail ( " Process " , procRecord - > toString ( ) ) ;
2017-08-29 02:25:37 +08:00
}
2019-01-19 07:42:48 +08:00
datacenterMachines [ processMachineId . get ( ) ] + + ;
2017-05-26 04:48:44 +08:00
dcProcesses + + ;
}
}
// Check if machine can be removed, if requested
2018-09-18 09:32:39 +08:00
if ( ! forceKill & & ( ( kt = = KillInstantly ) | | ( kt = = InjectFaults ) | | ( kt = = RebootAndDelete ) | | ( kt = = RebootProcessAndDelete ) ) )
2017-05-26 04:48:44 +08:00
{
std : : vector < ProcessInfo * > processesLeft , processesDead ;
2018-07-06 12:13:56 +08:00
for ( auto processInfo : getAllProcesses ( ) ) {
if ( processInfo - > isAvailableClass ( ) ) {
2018-07-17 06:56:43 +08:00
if ( processInfo - > isExcluded ( ) | | processInfo - > isCleared ( ) | | ! processInfo - > isAvailable ( ) ) {
2018-07-06 12:13:56 +08:00
processesDead . push_back ( processInfo ) ;
2019-01-19 07:42:48 +08:00
} else if ( protectedAddresses . count ( processInfo - > address ) | | datacenterMachines . find ( processInfo - > locality . machineId ( ) ) = = datacenterMachines . end ( ) ) {
2018-07-06 12:13:56 +08:00
processesLeft . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
} else {
2018-07-06 12:13:56 +08:00
processesDead . push_back ( processInfo ) ;
2018-07-17 06:56:43 +08:00
}
2017-05-26 04:48:44 +08:00
}
}
2017-05-27 05:20:11 +08:00
if ( ! canKillProcesses ( processesLeft , processesDead , kt , & kt ) ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( SevWarn , " DcKillChanged " ) . detail ( " DataCenter " , dcId ) . detail ( " KillType " , kt ) . detail ( " OrigKillType " , ktOrig ) ;
2017-05-26 04:48:44 +08:00
}
else {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadDataCenter " ) . detail ( " DataCenter " , dcId ) . detail ( " KillType " , kt ) . detail ( " DcZones " , datacenterMachines . size ( ) ) . detail ( " DcProcesses " , dcProcesses ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " TLogPolicy " , tLogPolicy - > info ( ) ) . detail ( " StoragePolicy " , storagePolicy - > info ( ) ) ;
2017-09-13 04:33:13 +08:00
for ( auto process : processesLeft ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadDcSurvivors " ) . detail ( " MachineId " , process - > locality . machineId ( ) ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " SurvivingProcess " , process - > toString ( ) ) ;
2017-09-13 04:33:13 +08:00
}
for ( auto process : processesDead ) {
2019-03-15 09:40:28 +08:00
TraceEvent ( " DeadDcVictims " ) . detail ( " MachineId " , process - > locality . machineId ( ) ) . detail ( " KillType " , kt ) . detail ( " ProcessesLeft " , processesLeft . size ( ) ) . detail ( " ProcessesDead " , processesDead . size ( ) ) . detail ( " VictimProcess " , process - > toString ( ) ) ;
2017-09-13 04:33:13 +08:00
}
2017-05-26 04:48:44 +08:00
}
}
2017-10-05 18:07:20 +08:00
KillType ktResult , ktMin = kt ;
2019-01-19 07:42:48 +08:00
for ( auto & datacenterMachine : datacenterMachines ) {
2019-02-19 07:32:51 +08:00
if ( g_random - > random01 ( ) < 0.99 ) {
killMachine ( datacenterMachine . first , kt , true , & ktResult ) ;
if ( ktResult ! = kt ) {
TraceEvent ( SevWarn , " KillDCFail " )
2019-03-15 09:40:28 +08:00
. detail ( " Zone " , datacenterMachine . first )
2019-02-19 07:32:51 +08:00
. detail ( " KillType " , kt )
. detail ( " KillTypeResult " , ktResult )
. detail ( " KillTypeOrig " , ktOrig ) ;
ASSERT ( ktResult = = None ) ;
}
ktMin = std : : min < KillType > ( ktResult , ktMin ) ;
2017-10-05 18:07:20 +08:00
}
}
2018-06-09 02:11:08 +08:00
TraceEvent ( " KillDataCenter " )
2019-01-19 07:42:48 +08:00
. detail ( " DcZones " , datacenterMachines . size ( ) )
2017-05-26 04:48:44 +08:00
. detail ( " DcProcesses " , dcProcesses )
2019-03-15 09:40:28 +08:00
. detail ( " DCID " , dcId )
2017-08-29 02:25:37 +08:00
. detail ( " KillType " , kt )
2017-10-05 18:07:20 +08:00
. detail ( " KillTypeOrig " , ktOrig )
. detail ( " KillTypeMin " , ktMin )
. detail ( " KilledDC " , kt = = ktMin ) ;
2017-05-26 04:48:44 +08:00
2017-10-05 18:07:20 +08:00
TEST ( kt ! = ktMin ) ; // DataCenter kill was rejected by killMachine
TEST ( ( kt = = ktMin ) & & ( kt = = RebootAndDelete ) ) ; // Resulted in a reboot and delete
TEST ( ( kt = = ktMin ) & & ( kt = = Reboot ) ) ; // Resulted in a reboot
TEST ( ( kt = = ktMin ) & & ( kt = = KillInstantly ) ) ; // Resulted in an instant kill
TEST ( ( kt = = ktMin ) & & ( kt = = InjectFaults ) ) ; // Resulted in a kill by injecting faults
TEST ( ( kt = = ktMin ) & & ( kt ! = ktOrig ) ) ; // Kill request was downgraded
TEST ( ( kt = = ktMin ) & & ( kt = = ktOrig ) ) ; // Requested kill was done
if ( ktFinal ) * ktFinal = ktMin ;
2017-10-03 03:04:28 +08:00
2017-10-05 18:07:20 +08:00
return ( kt = = ktMin ) ;
2017-05-26 04:48:44 +08:00
}
2019-02-27 10:04:03 +08:00
virtual void clogInterface ( const IPAddress & ip , double seconds , ClogMode mode = ClogDefault ) {
2017-05-26 04:48:44 +08:00
if ( mode = = ClogDefault ) {
double a = g_random - > random01 ( ) ;
if ( a < 0.3 ) mode = ClogSend ;
else if ( a < 0.6 ) mode = ClogReceive ;
else mode = ClogAll ;
}
2019-02-27 10:04:03 +08:00
TraceEvent ( " ClogInterface " )
. detail ( " IP " , ip . toString ( ) )
. detail ( " Delay " , seconds )
. detail ( " Queue " , mode = = ClogSend ? " Send " : mode = = ClogReceive ? " Receive " : " All " ) ;
2017-05-26 04:48:44 +08:00
if ( mode = = ClogSend | | mode = = ClogAll )
g_clogging . clogSendFor ( ip , seconds ) ;
if ( mode = = ClogReceive | | mode = = ClogAll )
g_clogging . clogRecvFor ( ip , seconds ) ;
}
2019-02-27 10:04:03 +08:00
virtual void clogPair ( const IPAddress & from , const IPAddress & to , double seconds ) {
2017-05-26 04:48:44 +08:00
g_clogging . clogPairFor ( from , to , seconds ) ;
}
2017-05-27 05:20:11 +08:00
// Snapshot of every simulated process: those attached to live machines plus
// those currently mid-reboot.
virtual std::vector<ProcessInfo*> getAllProcesses() const {
	std::vector<ProcessInfo*> result;
	for( auto& machineEntry : machines ) {
		auto& machineProcs = machineEntry.second.processes;
		result.insert( result.end(), machineProcs.begin(), machineProcs.end() );
	}
	for( auto& rebootEntry : currentlyRebootingProcesses )
		result.push_back( rebootEntry.second );
	return result;
}
// Look up the process listening on `address`. Asserts the address is known.
virtual ProcessInfo* getProcessByAddress( NetworkAddress const& address ) {
	// Normalize to the canonical address form used as the addressMap key.
	// NOTE(review): the (true, false) flags presumably select public/non-TLS — confirm against NetworkAddress.
	NetworkAddress normalizedAddress( address.ip, address.port, true, false );
	auto it = addressMap.find( normalizedAddress );
	ASSERT( it != addressMap.end() );
	return it->second;
}
// Resolve an address to its process, then to that process's machine record.
virtual MachineInfo* getMachineByNetworkAddress( NetworkAddress const& address ) {
	ProcessInfo* proc = addressMap[ address ];
	auto& machine = machines[ proc->locality.machineId() ];
	return &machine;
}
2019-01-19 07:42:48 +08:00
// Fetch (or, via map::operator[], default-create) the machine record for machineId.
virtual MachineInfo* getMachineById( Optional<Standalone<StringRef>> const& machineId ) {
	auto& entry = machines[ machineId ];
	return &entry;
}
2019-01-19 07:42:48 +08:00
// Remove a machine record entirely. All of its processes must already have
// failed; the machine-level process, if any, is killed here first.
virtual void destroyMachine( Optional<Standalone<StringRef>> const& machineId ) {
	auto& machine = machines[ machineId ];
	for( auto process : machine.processes )
		ASSERT( process->failed );
	if( machine.machineProcess )
		killProcess_internal( machine.machineProcess, KillInstantly );
	machines.erase( machineId );
}
Sim2 ( ) : time ( 0.0 ) , taskCount ( 0 ) , yielded ( false ) , yield_limit ( 0 ) , currentTaskID ( - 1 ) {
// Not letting currentProcess be NULL eliminates some annoying special cases
2018-12-07 03:48:50 +08:00
currentProcess = new ProcessInfo ( " NoMachine " , LocalityData ( Optional < Standalone < StringRef > > ( ) , StringRef ( ) , StringRef ( ) , StringRef ( ) ) , ProcessClass ( ) , { NetworkAddress ( ) } , this , " " , " " ) ;
2018-10-30 06:26:28 +08:00
g_network = net2 = newNet2 ( false , true ) ;
2017-05-26 04:48:44 +08:00
Net2FileSystem : : newFileSystem ( ) ;
check_yield ( 0 ) ;
}
// Implementation
struct Task {
int taskID ;
double time ;
uint64_t stable ;
ProcessInfo * machine ;
Promise < Void > action ;
Task ( double time , int taskID , uint64_t stable , ProcessInfo * machine , Promise < Void > & & action ) : time ( time ) , taskID ( taskID ) , stable ( stable ) , machine ( machine ) , action ( std : : move ( action ) ) { }
Task ( double time , int taskID , uint64_t stable , ProcessInfo * machine , Future < Void > & future ) : time ( time ) , taskID ( taskID ) , stable ( stable ) , machine ( machine ) { future = action . getFuture ( ) ; }
2019-01-26 08:49:59 +08:00
Task ( Task & & rhs ) BOOST_NOEXCEPT : time ( rhs . time ) , taskID ( rhs . taskID ) , stable ( rhs . stable ) , machine ( rhs . machine ) , action ( std : : move ( rhs . action ) ) { }
2017-05-26 04:48:44 +08:00
void operator = ( Task const & rhs ) { taskID = rhs . taskID ; time = rhs . time ; stable = rhs . stable ; machine = rhs . machine ; action = rhs . action ; }
Task ( Task const & rhs ) : taskID ( rhs . taskID ) , time ( rhs . time ) , stable ( rhs . stable ) , machine ( rhs . machine ) , action ( rhs . action ) { }
2019-01-26 08:49:59 +08:00
void operator = ( Task & & rhs ) BOOST_NOEXCEPT { time = rhs . time ; taskID = rhs . taskID ; stable = rhs . stable ; machine = rhs . machine ; action = std : : move ( rhs . action ) ; }
2017-05-26 04:48:44 +08:00
bool operator < ( Task const & rhs ) const {
// Ordering is reversed for priority_queue
if ( time ! = rhs . time ) return time > rhs . time ;
return stable > rhs . stable ;
}
} ;
// Run one queued simulator event: advance simulated time, switch the current
// process context to the task's machine, and fire the task's promise.
void execTask( struct Task& task ) {
	if( task.machine->failed ) {
		// Events destined for dead machines never fire.
		task.action.send( Never() );
		return;
	}

	// `time` may be read from other (fake) threads, so writes take the mutex.
	mutex.enter();
	this->time = task.time;
	mutex.leave();

	this->currentProcess = task.machine;
	try {
		task.action.send( Void() );
		// The event must not have left us on a different process context.
		ASSERT( this->currentProcess == task.machine );
	} catch( Error& e ) {
		TraceEvent(SevError, "UnhandledSimulationEventError").error(e, true);
		killProcess( task.machine, KillInstantly );
	}

	// Determinism debugging log: simulated time, a peek at the RNG stream,
	// the machine name, and the task's stable sequence number.
	if( randLog )
		fprintf( randLog, "T %f %d %s %lld\n", this->time, int(g_random->peek() % 10000), task.machine ? task.machine->name : "none", task.stable );
}
// Schedule `signal` to fire on the simulator's main loop at the current
// simulated time with the given priority.
virtual void onMainThread( Promise<Void>&& signal, int taskID ) {
	// This is presumably coming from either a "fake" thread pool thread, i.e. it is actually on this thread
	// or a thread created with g_network->startThread
	auto issuer = getCurrentProcess();
	ASSERT( issuer );
	mutex.enter();
	ASSERT( taskID >= TaskMinPriority && taskID <= TaskMaxPriority );
	tasks.push( Task( time, taskID, taskCount++, issuer, std::move(signal) ) );
	mutex.leave();
}
// Yield onto the given process's context via a zero-length delay.
virtual Future<Void> onProcess( ISimulator::ProcessInfo* process, int taskID ) {
	return delay( 0, taskID, process );
}
// Yield onto the machine-level process of the given process's machine.
// A process with no machine record completes immediately.
virtual Future<Void> onMachine( ISimulator::ProcessInfo* process, int taskID ) {
	if( !process->machine )
		return Void();
	return delay( 0, taskID, process->machine->machineProcess );
}
//time is guarded by ISimulator::mutex. It is not necessary to guard reads on the main thread because
//time should only be modified from the main thread.
double time;
// Priority of the task currently being executed by the run loop.
int currentTaskID;
//taskCount is guarded by ISimulator::mutex
// Monotone sequence number handed to each new Task to break time ties FIFO.
uint64_t taskCount;
// Machine records keyed by machineId (see getMachineById / killMachine).
std::map<Optional<Standalone<StringRef>>, MachineInfo> machines;
// Listen address -> owning process (see getProcessByAddress).
std::map<NetworkAddress, ProcessInfo*> addressMap;
std::map<ProcessInfo*, Promise<Void>> filesDeadMap;
//tasks is guarded by ISimulator::mutex
std::priority_queue<Task, std::vector<Task>> tasks;
//Sim2Net network;
// The real Net2 event loop underlying the simulation (installed in Sim2()).
INetwork* net2;
//Map from machine IP -> machine disk space info
std::map<IPAddress, SimDiskSpace> diskSpaceMap;
//Whether or not yield has returned true during the current iteration of the run loop
bool yielded;
int yield_limit;  // how many more times yield may return false before next returning true
} ;
void startNewSimulator ( ) {
ASSERT ( ! g_network ) ;
g_network = g_pSimulator = new Sim2 ( ) ;
2017-09-19 03:46:29 +08:00
g_simulator . connectionFailuresDisableDuration = g_random - > random01 ( ) < 0.5 ? 0 : 1e6 ;
2017-05-26 04:48:44 +08:00
}
// Reboot (optionally with data deletion) a single simulated process.
// ACTOR: runs asynchronously; it first hops onto the target process so the
// shutdown signal is delivered from that process's own context.
ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
	TraceEvent("RebootingProcessAttempt").detail("ZoneId", p->locality.zoneId()).detail("KillType", kt).detail("Process", p->toString()).detail("StartingClass", p->startingClass.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).detail("Rebooting", p->rebooting).detail("TaskDefaultDelay", TaskDefaultDelay);

	wait( g_sim2.delay( 0, TaskDefaultDelay, p ) ); // Switch to the machine in question

	try {
		// Only reboot-family kill types are valid here; destructive kills go
		// through killProcess_internal instead.
		ASSERT( kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || kt == ISimulator::RebootProcessAndDelete );

		TEST( kt == ISimulator::RebootProcess ); // Simulated process rebooted
		TEST( kt == ISimulator::Reboot ); // Simulated machine rebooted
		TEST( kt == ISimulator::RebootAndDelete ); // Simulated machine rebooted with data and coordination state deletion
		TEST( kt == ISimulator::RebootProcessAndDelete ); // Simulated process rebooted with data and coordination state deletion

		// A process already rebooting must not be rebooted again.
		if( p->rebooting )
			return;
		TraceEvent("RebootingProcess").detail("KillType", kt).detail("Address", p->address).detail("ZoneId", p->locality.zoneId()).detail("DataHall", p->locality.dataHallId()).detail("Locality", p->locality.toString()).detail("Failed", p->failed).detail("Excluded", p->excluded).detail("Cleared", p->cleared).backtrace();
		p->rebooting = true;
		// Delete-variants additionally mark the process cleared and wipe its address.
		if ( (kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete) ) {
			p->cleared = true;
			g_simulator.clearAddress(p->address);
		}
		// The process's run loop observes this signal and performs the reboot.
		p->shutdownSignal.send( kt );
	} catch (Error& e) {
		TraceEvent(SevError, "RebootError").error(e);
		p->shutdownSignal.sendError(e); // ?
		throw; // goes nowhere!
	}
}
// Simulates delays for performing operations on disk: charges each operation
// against the disk's iops/bandwidth budget and adds random latency.
Future<Void> waitUntilDiskReady( Reference<DiskParameters> diskParameters, int64_t size, bool sync ) {
	// With connection failures disabled the disk is treated as near-instant.
	if( g_simulator.connectionFailuresDisableDuration > 1e4 )
		return delay( 0.0001 );

	// Advance the disk's next-available time by this operation's cost.
	if( diskParameters->nextOperation < now() )
		diskParameters->nextOperation = now();
	diskParameters->nextOperation += ( 1.0 / diskParameters->iops ) + ( size / diskParameters->bandwidth );

	// Sync ops get a fixed floor plus (buggified) jitter; async latency scales with iops.
	double jitter;
	if( sync )
		jitter = .005 + g_random->random01() * (BUGGIFY ? 1.0 : .010);
	else
		jitter = 10 * g_random->random01() / diskParameters->iops;

	return delayUntil( diskParameters->nextOperation + jitter );
}
# if defined(_WIN32)
/* Opening with FILE_SHARE_DELETE lets simulation actually work on windows - previously renames were always failing.
FIXME : Use an actual platform abstraction for this stuff ! Is there any reason we can ' t use underlying net2 for example ? */
# include <Windows.h>
// Windows-only open() replacement that passes FILE_SHARE_DELETE so simulated
// renames of open files succeed (see comment above). Returns a CRT file
// descriptor, or -1 with errno set (ENOENT / EFAULT) on failure.
int sf_open( const char* filename, int flags, int convFlags, int mode ) {
	HANDLE wh = CreateFile( filename, GENERIC_READ | ((flags&IAsyncFile::OPEN_READWRITE) ? GENERIC_WRITE : 0),
		FILE_SHARE_READ|FILE_SHARE_WRITE|FILE_SHARE_DELETE, NULL,
		// Map IAsyncFile open flags onto Win32 creation dispositions.
		(flags&IAsyncFile::OPEN_EXCLUSIVE) ? CREATE_NEW :
		(flags&IAsyncFile::OPEN_CREATE) ? OPEN_ALWAYS :
		OPEN_EXISTING,
		FILE_ATTRIBUTE_NORMAL,
		NULL );
	int h = -1;
	if( wh != INVALID_HANDLE_VALUE )
		// Wrap the Win32 HANDLE in a CRT descriptor so callers can use POSIX-style I/O.
		h = _open_osfhandle( (intptr_t)wh, convFlags );
	else
		errno = GetLastError() == ERROR_FILE_NOT_FOUND ? ENOENT : EFAULT;
	return h;
}
# endif
// Opens a file for asynchronous I/O.
// Uncached opens are backed by a per-machine cache of AsyncFileNonDurable
// wrappers (so re-opens share fault-injection state); cached opens go through
// AsyncFileCached. Atomic-write-and-create opens operate on a ".part" file.
Future< Reference<class IAsyncFile> > Sim2FileSystem::open( std::string filename, int64_t flags, int64_t mode )
{
	ASSERT( (flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) ||
			!(flags & IAsyncFile::OPEN_CREATE) ||
			StringRef(filename).endsWith(LiteralStringRef(".fdb-lock")) );  // We don't use "ordinary" non-atomic file creation right now except for folder locking, and we don't have code to simulate its unsafeness.

	if ( (flags & IAsyncFile::OPEN_EXCLUSIVE) ) ASSERT( flags & IAsyncFile::OPEN_CREATE );
	if (flags & IAsyncFile::OPEN_UNCACHED) {
		// Cache of simulated files, keyed by filename, shared by all processes on this machine.
		auto& machineCache = g_simulator.getCurrentProcess()->machine->openFiles;
		std::string actualFilename = filename;
		if ( machineCache.find(filename) == machineCache.end() ) {
			if(flags & IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE) {
				// Atomic creation writes to "<filename>.part"; reuse an in-flight part file if present.
				actualFilename = filename + ".part";
				auto partFile = machineCache.find(actualFilename);
				if(partFile != machineCache.end()) {
					Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open(partFile->second);
					// Optionally layer write-checksumming on top.
					if(FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
						f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
					return f;
				}
			}
			//Simulated disk parameters are shared by the AsyncFileNonDurable and the underlying SimpleFile. This way, they can both keep up with the time to start the next operation
			Reference<DiskParameters> diskParameters(new DiskParameters(FLOW_KNOBS->SIM_DISK_IOPS, FLOW_KNOBS->SIM_DISK_BANDWIDTH));
			machineCache[actualFilename] = AsyncFileNonDurable::open(filename, actualFilename, SimpleFile::open(filename, flags, mode, diskParameters, false), diskParameters);
		}
		// Hand out a detachable view of the cached file so kills can sever it.
		Future<Reference<IAsyncFile>> f = AsyncFileDetachable::open( machineCache[actualFilename] );
		if(FLOW_KNOBS->PAGE_WRITE_CHECKSUM_HISTORY > 0)
			f = map(f, [=](Reference<IAsyncFile> r) { return Reference<IAsyncFile>(new AsyncFileWriteChecker(r)); });
		return f;
	}
	else
		return AsyncFileCached::open(filename, flags, mode);
}
// Deletes the given file. If mustBeDurable, returns only when the file is guaranteed to be deleted even after a power failure.
Future < Void > Sim2FileSystem : : deleteFile ( std : : string filename , bool mustBeDurable )
{
2019-01-08 05:07:19 +08:00
return Sim2 : : deleteFileImpl ( & g_sim2 , filename , mustBeDurable ) ;
2017-05-26 04:48:44 +08:00
}
2018-07-07 09:51:36 +08:00
// Simulated last-write-time lookup. Files are never really tracked here;
// occasionally (under BUGGIFY) a file is pretended to have just been written.
Future<std::time_t> Sim2FileSystem::lastWriteTime( std::string filename ) {
	// TODO: update this map upon file writes.
	static std::map<std::string, double> fileWrites;
	const bool pretendJustWritten = BUGGIFY && g_random->random01() < 0.01;
	if( pretendJustWritten )
		fileWrites[filename] = now();
	// NOTE: operator[] default-inserts 0 for files never recorded.
	return fileWrites[filename];
}
2017-05-26 04:48:44 +08:00
void Sim2FileSystem : : newFileSystem ( )
{
g_network - > setGlobal ( INetwork : : enFileSystem , ( flowGlobalType ) new Sim2FileSystem ( ) ) ;
}