2017-05-26 04:48:44 +08:00
/*
* SimulatedCluster . actor . cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013 - 2018 Apple Inc . and the FoundationDB project authors
*
* Licensed under the Apache License , Version 2.0 ( the " License " ) ;
* you may not use this file except in compliance with the License .
* You may obtain a copy of the License at
*
* http : //www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing , software
* distributed under the License is distributed on an " AS IS " BASIS ,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
* See the License for the specific language governing permissions and
* limitations under the License .
*/
# include <fstream>
# include "flow/actorcompiler.h"
# include "fdbrpc/simulator.h"
# include "fdbclient/FailureMonitorClient.h"
# include "fdbclient/DatabaseContext.h"
# include "TesterInterface.h"
# include "WorkerInterface.h"
# include "fdbclient/ClusterInterface.h"
# include "Knobs.h"
# include "ClusterRecruitmentInterface.h"
# include "fdbserver/CoordinationInterface.h"
# include "fdbmonitor/SimpleIni.h"
# include "fdbrpc/AsyncFileNonDurable.actor.h"
# include "fdbrpc/TLSConnection.h"
# include "fdbclient/ManagementAPI.h"
# include "fdbclient/NativeAPI.h"
# include "fdbclient/BackupAgent.h"
# ifndef WIN32
# include "versions.h"
# endif
# undef max
# undef min
extern bool buggifyActivated ;
extern " C " int g_expect_full_pointermap ;
extern const char * getHGVersion ( ) ;
const int PROCESS_START_TIME = 4 ;
const int MACHINE_REBOOT_TIME = 10 ;
bool destructed = false ;
static const char * certBytes =
" -----BEGIN CERTIFICATE----- \n "
" MIIEGzCCAwOgAwIBAgIJANUQj1rRA2XMMA0GCSqGSIb3DQEBBQUAMIGjMQswCQYD \n "
" VQQGEwJVUzELMAkGA1UECAwCVkExDzANBgNVBAcMBlZpZW5uYTEaMBgGA1UECgwR \n "
" Rm91bmRhdGlvbkRCLCBMTEMxGTAXBgNVBAsMEFRlc3QgZW5naW5lZXJpbmcxFTAT \n "
" BgNVBAMMDE1yLiBCaWcgVHVuYTEoMCYGCSqGSIb3DQEJARYZYmlnLnR1bmFAZm91 \n "
" bmRhdGlvbmRiLmNvbTAeFw0xNDEyMDUxNTEyMjFaFw0yNDEyMDIxNTEyMjFaMIGj \n "
" MQswCQYDVQQGEwJVUzELMAkGA1UECAwCVkExDzANBgNVBAcMBlZpZW5uYTEaMBgG \n "
" A1UECgwRRm91bmRhdGlvbkRCLCBMTEMxGTAXBgNVBAsMEFRlc3QgZW5naW5lZXJp \n "
" bmcxFTATBgNVBAMMDE1yLiBCaWcgVHVuYTEoMCYGCSqGSIb3DQEJARYZYmlnLnR1 \n "
" bmFAZm91bmRhdGlvbmRiLmNvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC \n "
" ggEBAKZTL2edDkiet4HBTZnjysn6gOVZH2MP02KVBIv/H7e+3w7ZOIRvcPzhZe9M \n "
" 3cGH1t/pkr9DSXvzIb42EffMVlpLD2VQn2H8VC2QSdJCIQcf802u+Taf+XtW6K1h \n "
" p/YPL1uhdopUs3c1oon8ykKwnOfrQYgv5pUa7jQdMkltI2MQJU3uFq3Z/LHTvIKe \n "
" FN+bqK0iYhZthwMG7Rld4+RgKZoT4u1B6w/duEWk9KLjgs7fTf3Oe6JHCYNqwBJi \n "
" 78sJalwXz9Wf8wmMaYSG0XNA7vBOdpTFhVPSsh6e3rkydf5HydMade/II98MWpMe \n "
" hFg7FFMaJP6ig8p5iL+9QP2VMCkCAwEAAaNQME4wHQYDVR0OBBYEFIXGmIcKptBP \n "
" v3i9WS/mK78o5E/MMB8GA1UdIwQYMBaAFIXGmIcKptBPv3i9WS/mK78o5E/MMAwG \n "
" A1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEBAJkVgNGOXT+ZHCNEYLjr/6OM \n "
" UCHvwlMeaEyqxaOmK26J2kAADPhjBZ7lZOHWb2Wzb+BiQUIFGwNIMoRvsg8skpJa \n "
" OCqpVciHVXY/U8BiYY70DKozRza93Ab9om3pySGDJ/akdCjqbMT1Cb7Kloyw+hNh \n "
" XD4MML0lYiUE9KK35xyK6FgTx4A7IXl4b3lWBgglqTh4+P5J1+xy8AYJ0VfPoP7y \n "
" OoZgwAmkpkMnalReNkN7LALHGqMzv/qH04ODlkU/HUGgExtnINMxK9VEDIe/yLGm \n "
" DHy7gcQMj5Hyymack/d4ZF8CSrYpGZQeZGXoxOmTDwWcXgnYA+2o7lOYPb5Uu08= \n "
" -----END CERTIFICATE----- \n "
" -----BEGIN PRIVATE KEY----- \n "
" MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQCmUy9nnQ5InreB \n "
" wU2Z48rJ+oDlWR9jD9NilQSL/x+3vt8O2TiEb3D84WXvTN3Bh9bf6ZK/Q0l78yG+ \n "
" NhH3zFZaSw9lUJ9h/FQtkEnSQiEHH/NNrvk2n/l7VuitYaf2Dy9boXaKVLN3NaKJ \n "
" /MpCsJzn60GIL+aVGu40HTJJbSNjECVN7hat2fyx07yCnhTfm6itImIWbYcDBu0Z \n "
" XePkYCmaE+LtQesP3bhFpPSi44LO3039znuiRwmDasASYu/LCWpcF8/Vn/MJjGmE \n "
" htFzQO7wTnaUxYVT0rIent65MnX+R8nTGnXvyCPfDFqTHoRYOxRTGiT+ooPKeYi/ \n "
" vUD9lTApAgMBAAECggEBAIYCmDtfq9aPK0P8v82yX/4FPD2OZV+nrKXNc3BpCuE9 \n "
" hPOtyX/LWrol0b/Rqwr3rAWVaIt6Z4bbCuD7J9cEaL8voyP6pbCJYjmj/BbQ+VOI \n "
" Rrzcsid1Fcpu5+JqwK3c5kdp/NzQChmOuXt8lmrNal7iilZ0YdDZdfu/WnkW2mBB \n "
" oQHkujlnWr4PNYdwMOnBU6TwdOuz+inPVMLohOO0Vr585OxPsGzG2Ud3yQ/t34Cq \n "
" F9nmOXQoszftGKsL1yuh/3fGj/O86g/CRsUy05qZhDDBEYQD6qZCvD5+yp8oOWIR \n "
" SljM3GXDBnJqRPhP+Nyf6e6/GoQtfVZ9MPRzDDPzIBECgYEA2kX/zAs6taOiNqCb \n "
" 6nVGe7/3uQJz/CkmOSKIFKUu7lCEUjmMYpK3Xzp26RTUR9cT+g9y+cnJO1Vbaxtf \n "
" Qidje6K+Oi1pQyUGQ6W+U8cPJHz43PVa7IB5Az5i/sS2tu0BGhvGo9G6iYQjxXeD \n "
" 1197DRACgnm5AORQMum616XvSPMCgYEAwxKbkAzJzfZF6A3Ys+/0kycNfDP8xZoC \n "
" 1zV3d1b2JncsdAPCHYSKtpniRrQN9ASa3RMdkh+wrMN/KlbtU9Ddoc4NHxSTFV7F \n "
" wypFMzLZslqkQ6uHnVVewHV7prfoKsMci2c9iHO7W8TEv4aqW8XDd8OozP3/q2j4 \n "
" hvL7VIAVqXMCgYEAwAFnfOQ75uBkp00tGlfDgsRhc5vWz3CbMRNRRWfxGq41V+dL \n "
" uMJ7EAfr5ijue6uU5RmF+HkqzUjOvC894oGnn3CPibm8qNX+5q7799JZXa2ZdTVX \n "
" oEd7LAFLL/V3DP77Qy4/1Id/Ycydcu0pSuGw6tK0gnX06fXtHnxAYcaT8UUCgYAE \n "
" MytcP5o8r/ezVlD7Fsh6PpYAvZHMo1M6VPFchWfJTjmLyeTtA8SEx+1iPlAql8rJ \n "
" xbaWRc5k+dSMEdEMQ+vxpuELcUL1a9PwLsHMp2SefWsZ9eB2l7bxh9YAsebyvL6p \n "
" lbBydqNrB2KBCSIz1Z8uveytdS6C/0CSjzqwCA3vVwKBgQDAXqjo3xrzMlHeXm5o \n "
" qH/OjajjqbnPXHolHDitbLubyQ4E6KhMBMxfChBe/8VptB/Gs0efVbMVGuabxY7Q \n "
" iastGId8HyONy3UPGPxCn4b95cIxKvdpt+hvWtYHIBCfHXluQK7zsDMgvtXjYNiz \n "
" peZRikYlwmu1K2YRTf7oLE2Ogw== \n "
" -----END PRIVATE KEY----- \n " ;
template < class T >
T simulate ( const T & in ) {
BinaryWriter writer ( AssumeVersion ( currentProtocolVersion ) ) ;
writer < < in ;
BinaryReader reader ( writer . getData ( ) , writer . getLength ( ) , AssumeVersion ( currentProtocolVersion ) ) ;
T out ;
reader > > out ;
return out ;
}
static void simInitTLS ( ) {
Reference < TLSOptions > options ( new TLSOptions ) ;
options - > set_cert_data ( certBytes ) ;
options - > set_key_data ( certBytes ) ;
options - > register_network ( ) ;
}
ACTOR Future < Void > runBackup ( Reference < ClusterConnectionFile > connFile ) {
state std : : vector < Future < Void > > agentFutures ;
while ( g_simulator . backupAgents = = ISimulator : : WaitForType ) {
Void _ = wait ( delay ( 1.0 ) ) ;
}
if ( g_simulator . backupAgents = = ISimulator : : BackupToFile ) {
Reference < Cluster > cluster = Cluster : : createCluster ( connFile , - 1 ) ;
Database cx = cluster - > createDatabase ( LiteralStringRef ( " DB " ) ) . get ( ) ;
state FileBackupAgent fileAgent ;
state double backupPollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
agentFutures . push_back ( fileAgent . run ( cx , & backupPollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
while ( g_simulator . backupAgents = = ISimulator : : BackupToFile ) {
Void _ = wait ( delay ( 1.0 ) ) ;
}
for ( auto it : agentFutures ) {
it . cancel ( ) ;
}
}
else if ( g_simulator . backupAgents = = ISimulator : : BackupToDB ) {
Reference < Cluster > cluster = Cluster : : createCluster ( connFile , - 1 ) ;
Database cx = cluster - > createDatabase ( LiteralStringRef ( " DB " ) ) . get ( ) ;
Reference < ClusterConnectionFile > extraFile ( new ClusterConnectionFile ( * g_simulator . extraDB ) ) ;
Reference < Cluster > extraCluster = Cluster : : createCluster ( extraFile , - 1 ) ;
state Database extraDB = extraCluster - > createDatabase ( LiteralStringRef ( " DB " ) ) . get ( ) ;
TraceEvent ( " StartingBackupAgents " ) . detail ( " connFile " , connFile - > getConnectionString ( ) . toString ( ) ) . detail ( " extraString " , extraFile - > getConnectionString ( ) . toString ( ) ) ;
state DatabaseBackupAgent dbAgent = DatabaseBackupAgent ( cx ) ;
state DatabaseBackupAgent extraAgent = DatabaseBackupAgent ( extraDB ) ;
state double dr1PollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
state double dr2PollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
agentFutures . push_back ( extraAgent . run ( cx , & dr1PollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
agentFutures . push_back ( dbAgent . run ( extraDB , & dr2PollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
while ( g_simulator . backupAgents = = ISimulator : : BackupToDB ) {
Void _ = wait ( delay ( 1.0 ) ) ;
}
TraceEvent ( " StoppingBackupAgents " ) ;
for ( auto it : agentFutures ) {
it . cancel ( ) ;
}
}
Void _ = wait ( Future < Void > ( Never ( ) ) ) ;
throw internal_error ( ) ;
}
// SOMEDAY: when a process can be rebooted in isolation from the other on that machine,
// a loop{} will be needed around the waiting on simulatedFDBD(). For now this simply
// takes care of house-keeping such as context switching and file closing.
ACTOR Future < ISimulator : : KillType > simulatedFDBDRebooter (
Reference < ClusterConnectionFile > connFile ,
uint32_t ip ,
bool useSSL ,
uint16_t port ,
LocalityData localities ,
ProcessClass processClass ,
std : : string * dataFolder ,
std : : string * coordFolder ,
std : : string baseFolder ,
ClusterConnectionString connStr ,
bool useSeedFile ,
bool runBackupAgents )
{
state ISimulator : : ProcessInfo * simProcess = g_simulator . getCurrentProcess ( ) ;
2017-06-20 07:48:15 +08:00
state UID randomId = g_nondeterministic_random - > randomUniqueID ( ) ;
state int cycles = 0 ;
2017-05-26 04:48:44 +08:00
loop {
auto waitTime = SERVER_KNOBS - > MIN_REBOOT_TIME + ( SERVER_KNOBS - > MAX_REBOOT_TIME - SERVER_KNOBS - > MIN_REBOOT_TIME ) * g_random - > random01 ( ) ;
cycles + + ;
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDWait " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , NetworkAddress ( ip , port , true , false ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
2017-06-20 07:48:15 +08:00
. detail ( " waitTime " , waitTime ) . detail ( " Port " , port ) ;
2017-05-26 04:48:44 +08:00
Void _ = wait ( delay ( waitTime ) ) ;
state ISimulator : : ProcessInfo * process = g_simulator . newProcess ( " Server " , ip , port , localities , processClass , dataFolder - > c_str ( ) , coordFolder - > c_str ( ) ) ;
Void _ = wait ( g_simulator . onProcess ( process , TaskDefaultYield ) ) ; // Now switch execution to the process on which we will run
state Future < ISimulator : : KillType > onShutdown = process - > onShutdown ( ) ;
try {
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedRebooterStarting " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
. detail ( " ProcessAddress " , process - > address . toString ( ) )
2017-06-20 07:48:15 +08:00
. detail ( " ProcessExcluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detail ( " UsingSSL " , useSSL ) ;
TraceEvent ( " ProgramStart " ) . detail ( " Cycles " , cycles )
. detail ( " SourceVersion " , getHGVersion ( ) )
. detail ( " Version " , FDB_VT_VERSION )
. detail ( " PackageName " , FDB_VT_PACKAGE_NAME )
. detail ( " DataFolder " , * dataFolder )
. detail ( " ConnectionString " , connFile ? connFile - > getConnectionString ( ) . toString ( ) : " " )
. detailf ( " ActualTime " , " %lld " , DEBUG_DETERMINISM ? 0 : time ( NULL ) )
. detail ( " CommandLine " , " fdbserver -r simulation " )
. detail ( " BuggifyEnabled " , buggifyActivated )
. detail ( " Simulated " , true )
. trackLatest ( " ProgramStart " ) ;
try {
//SOMEDAY: test lower memory limits, without making them too small and causing the database to stop making progress
FlowTransport : : createInstance ( 1 ) ;
Sim2FileSystem : : newFileSystem ( ) ;
simInitTLS ( ) ;
NetworkAddress n ( ip , port , true , useSSL ) ;
Future < Void > listen = FlowTransport : : transport ( ) . bind ( n , n ) ;
Future < Void > fd = fdbd ( connFile , localities , processClass , * dataFolder , * coordFolder , 500e6 , " " , " " ) ;
Future < Void > backup = runBackupAgents ? runBackup ( connFile ) : Future < Void > ( Never ( ) ) ;
Void _ = wait ( listen | | fd | | success ( onShutdown ) | | backup ) ;
} catch ( Error & e ) {
2017-05-27 08:43:28 +08:00
// If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set then somewhere an io_timeout was converted to a different error.
if ( g_network - > isSimulated ( ) & & e . code ( ) ! = error_code_io_timeout & & ( bool ) g_network - > global ( INetwork : : enASIOTimedOut ) )
TraceEvent ( SevError , " IOTimeoutErrorSuppressed " ) . detail ( " ErrorCode " , e . code ( ) ) . backtrace ( ) ;
2017-05-26 04:48:44 +08:00
if ( onShutdown . isReady ( ) & & onShutdown . isError ( ) ) throw onShutdown . getError ( ) ;
if ( e . code ( ) ! = error_code_actor_cancelled )
printf ( " SimulatedFDBDTerminated: %s \n " , e . what ( ) ) ;
ASSERT ( destructed | | g_simulator . getCurrentProcess ( ) = = process ) ; // simulatedFDBD catch called on different process
2017-05-27 05:20:11 +08:00
TraceEvent ( e . code ( ) = = error_code_actor_cancelled | | e . code ( ) = = error_code_file_not_found | | destructed ? SevInfo : SevError , " SimulatedFDBDTerminated " , localities . zoneId ( ) ) . error ( e , true ) ;
2017-05-26 04:48:44 +08:00
}
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDDone " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , process - > address )
2017-06-20 07:48:15 +08:00
. detail ( " ProcessExcluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , onShutdown . isReady ( ) ? onShutdown . get ( ) : ISimulator : : None ) ;
if ( ! onShutdown . isReady ( ) )
onShutdown = ISimulator : : InjectFaults ;
} catch ( Error & e ) {
2017-05-27 05:20:11 +08:00
TraceEvent ( destructed ? SevInfo : SevError , " SimulatedFDBDRebooterError " , localities . zoneId ( ) ) . error ( e , true ) ;
2017-05-26 04:48:44 +08:00
onShutdown = e ;
}
ASSERT ( destructed | | g_simulator . getCurrentProcess ( ) = = process ) ;
if ( ! process - > shutdownSignal . isSet ( ) & & ! destructed ) {
process - > rebooting = true ;
process - > shutdownSignal . send ( ISimulator : : None ) ;
}
Void _ = wait ( g_simulator . onProcess ( simProcess ) ) ;
Void _ = wait ( delay ( 0.00001 + FLOW_KNOBS - > MAX_BUGGIFIED_DELAY ) ) ; // One last chance for the process to clean up?
g_simulator . destroyProcess ( process ) ; // Leak memory here; the process may be used in other parts of the simulation
auto shutdownResult = onShutdown . get ( ) ;
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDShutdown " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , process - > address )
2017-06-20 07:48:15 +08:00
. detail ( " ProcessExcluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
if ( shutdownResult < ISimulator : : RebootProcessAndDelete ) {
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDLowerReboot " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , process - > address )
2017-06-20 07:48:15 +08:00
. detail ( " ProcessExcluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
return onShutdown . get ( ) ;
}
if ( onShutdown . get ( ) = = ISimulator : : RebootProcessAndDelete ) {
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDRebootAndDelete " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , process - > address )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
* coordFolder = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
* dataFolder = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
platform : : createDirectory ( * dataFolder ) ;
if ( ! useSeedFile ) {
writeFile ( joinPath ( * dataFolder , " fdb.cluster " ) , connStr . toString ( ) ) ;
connFile = Reference < ClusterConnectionFile > ( new ClusterConnectionFile ( joinPath ( * dataFolder , " fdb.cluster " ) ) ) ;
}
else {
connFile = Reference < ClusterConnectionFile > ( new ClusterConnectionFile ( joinPath ( * dataFolder , " fdb.cluster " ) , connStr . toString ( ) ) ) ;
}
}
else {
2017-06-20 07:48:15 +08:00
TraceEvent ( " SimulatedFDBDJustRepeat " , localities . zoneId ( ) ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessAddress " , process - > address )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
}
}
}
template < >
std : : string describe ( bool const & val ) {
return val ? " true " : " false " ;
}
template < >
std : : string describe ( int const & val ) {
return format ( " %d " , val ) ;
}
// Since a datacenter kill is considered to be the same as killing a machine, files cannot be swapped across datacenters
std : : map < Optional < Standalone < StringRef > > , std : : vector < std : : vector < std : : string > > > availableFolders ;
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per process
ACTOR Future < Void > simulatedMachine (
ClusterConnectionString connStr ,
std : : vector < uint32_t > ips ,
bool sslEnabled ,
LocalityData localities ,
ProcessClass processClass ,
std : : string baseFolder ,
bool restarting ,
bool useSeedFile ,
bool runBackupAgents )
{
state int bootCount = 0 ;
state std : : vector < std : : string > myFolders ;
state std : : vector < std : : string > coordFolders ;
try {
CSimpleIni ini ;
ini . SetUnicode ( ) ;
ini . LoadFile ( joinPath ( baseFolder , " restartInfo.ini " ) . c_str ( ) ) ;
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
if ( restarting ) {
myFolders . push_back ( ini . GetValue ( printable ( localities . zoneId ( ) ) . c_str ( ) , format ( " %d " , i ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ) ;
if ( i = = 0 ) {
std : : string coordinationFolder = ini . GetValue ( printable ( localities . zoneId ( ) ) . c_str ( ) , " coordinationFolder " , " " ) ;
if ( ! coordinationFolder . size ( ) )
coordinationFolder = ini . GetValue ( printable ( localities . zoneId ( ) ) . c_str ( ) , format ( " c%d " , i ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ;
coordFolders . push_back ( coordinationFolder ) ;
} else {
coordFolders . push_back ( ini . GetValue ( printable ( localities . zoneId ( ) ) . c_str ( ) , format ( " c%d " , i ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ) ;
}
}
else {
coordFolders . push_back ( joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ) ;
std : : string thisFolder = g_random - > randomUniqueID ( ) . toString ( ) ;
myFolders . push_back ( joinPath ( baseFolder , thisFolder ) ) ;
platform : : createDirectory ( myFolders [ i ] ) ;
if ( ! useSeedFile )
writeFile ( joinPath ( myFolders [ i ] , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
loop {
state std : : vector < Future < ISimulator : : KillType > > processes ;
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
std : : string path = joinPath ( myFolders [ i ] , " fdb.cluster " ) ;
Reference < ClusterConnectionFile > clusterFile ( useSeedFile ? new ClusterConnectionFile ( path , connStr . toString ( ) ) : new ClusterConnectionFile ( path ) ) ;
processes . push_back ( simulatedFDBDRebooter ( clusterFile , ips [ i ] , sslEnabled , i + 1 , localities , processClass , & myFolders [ i ] , & coordFolders [ i ] , baseFolder , connStr , useSeedFile , runBackupAgents ) ) ;
}
TEST ( bootCount > = 1 ) ; // Simulated machine rebooted
TEST ( bootCount > = 2 ) ; // Simulated machine rebooted twice
TEST ( bootCount > = 3 ) ; // Simulated machine rebooted three times
+ + bootCount ;
TraceEvent ( " SimulatedMachineStart " )
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detail ( " SSL " , sslEnabled )
. detail ( " processes " , processes . size ( ) )
. detail ( " bootCount " , bootCount )
. detail ( " ProcessClass " , processClass . toString ( ) )
. detail ( " Restarting " , restarting )
. detail ( " UseSeedFile " , useSeedFile )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
. detail ( " Locality " , localities . toString ( ) ) ;
Void _ = wait ( waitForAll ( processes ) ) ;
TraceEvent ( " SimulatedMachineRebootStart " )
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
//Kill all open files, which may cause them to write invalid data.
auto & machineCache = g_simulator . getMachineById ( localities . zoneId ( ) ) - > openFiles ;
//Copy the file pointers to a vector because the map may be modified while we are killing files
std : : vector < AsyncFileNonDurable * > files ;
for ( auto fileItr = machineCache . begin ( ) ; fileItr ! = machineCache . end ( ) ; + + fileItr ) {
ASSERT ( fileItr - > second . isReady ( ) ) ;
files . push_back ( ( AsyncFileNonDurable * ) fileItr - > second . get ( ) . getPtr ( ) ) ;
}
std : : vector < Future < Void > > killFutures ;
for ( auto fileItr = files . begin ( ) ; fileItr ! = files . end ( ) ; + + fileItr )
killFutures . push_back ( ( * fileItr ) - > kill ( ) ) ;
Void _ = wait ( waitForAll ( killFutures ) ) ;
state std : : set < std : : string > filenames ;
state std : : string closingStr ;
auto & machineCache = g_simulator . getMachineById ( localities . zoneId ( ) ) - > openFiles ;
for ( auto it : machineCache ) {
filenames . insert ( it . first ) ;
closingStr + = it . first + " , " ;
ASSERT ( it . second . isReady ( ) & & ! it . second . isError ( ) ) ;
}
TraceEvent ( " SimulatedMachineRebootAfterKills " )
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detail ( " Closing " , closingStr )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
ISimulator : : MachineInfo * machine = g_simulator . getMachineById ( localities . zoneId ( ) ) ;
machine - > closingFiles = filenames ;
g_simulator . getMachineById ( localities . zoneId ( ) ) - > openFiles . clear ( ) ;
// During a reboot:
// The process is expected to close all files and be inactive in zero time, but not necessarily
// without delay(0)-equivalents, so delay(0) a few times waiting for it to achieve that goal.
// After an injected fault:
// The process is expected to shut down eventually, but not necessarily instantly. Wait up to 60 seconds.
state int shutdownDelayCount = 0 ;
state double backoff = 0 ;
loop {
auto & machineCache = g_simulator . getMachineById ( localities . zoneId ( ) ) - > closingFiles ;
if ( ! machineCache . empty ( ) ) {
std : : string openFiles ;
int i = 0 ;
for ( auto it = machineCache . begin ( ) ; it ! = machineCache . end ( ) & & i < 5 ; + + it ) {
openFiles + = * it + " , " ;
i + + ;
}
TraceEvent ( " MachineFilesOpen " ) . detail ( " PAddr " , toIPVectorString ( ips ) ) . detail ( " OpenFiles " , openFiles ) ;
} else
break ;
if ( shutdownDelayCount + + > = 50 ) { // Worker doesn't shut down instantly on reboot
TraceEvent ( SevError , " SimulatedFDBDFilesCheck " )
. detail ( " PAddrs " , toIPVectorString ( ips ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
ASSERT ( false ) ;
}
Void _ = wait ( delay ( backoff ) ) ;
backoff = std : : min ( backoff + 1.0 , 6.0 ) ;
}
TraceEvent ( " SimulatedFDBDFilesClosed " )
. detail ( " ProcessAddress " , toIPVectorString ( ips ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
g_simulator . destroyMachine ( localities . zoneId ( ) ) ;
// SOMEDAY: when processes can be rebooted, this check will be needed
//ASSERT( this machine is rebooting );
// Since processes can end with different codes, take the highest (least severe) to detmine what to do
state ISimulator : : KillType killType = processes [ 0 ] . get ( ) ;
for ( int i = 1 ; i < ips . size ( ) ; i + + )
killType = std : : max ( processes [ i ] . get ( ) , killType ) ;
TEST ( true ) ; // Simulated machine has been rebooted
state bool swap = killType = = ISimulator : : Reboot & & BUGGIFY_WITH_PROB ( 0.75 ) & & g_simulator . canSwapToMachine ( localities . zoneId ( ) ) ;
if ( swap )
availableFolders [ localities . dcId ( ) ] . push_back ( myFolders ) ;
auto rebootTime = g_random - > random01 ( ) * MACHINE_REBOOT_TIME ;
TraceEvent ( " SimulatedMachineShutdown " )
. detail ( " Swap " , swap )
. detail ( " KillType " , killType )
. detail ( " RebootTime " , rebootTime )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
. detail ( " MachineIPs " , toIPVectorString ( ips ) ) ;
Void _ = wait ( delay ( rebootTime ) ) ;
if ( swap ) {
auto & avail = availableFolders [ localities . dcId ( ) ] ;
int i = g_random - > randomInt ( 0 , avail . size ( ) ) ;
if ( i ! = avail . size ( ) - 1 )
std : : swap ( avail [ i ] , avail . back ( ) ) ;
auto toRebootFrom = avail . back ( ) ;
avail . pop_back ( ) ;
if ( myFolders ! = toRebootFrom ) {
TEST ( true ) ; // Simulated machine swapped data folders
TraceEvent ( " SimulatedMachineFolderSwap " )
. detail ( " OldFolder0 " , myFolders [ 0 ] ) . detail ( " NewFolder0 " , toRebootFrom [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) ) ;
}
myFolders = toRebootFrom ;
if ( ! useSeedFile ) {
for ( auto f : toRebootFrom ) {
if ( ! fileExists ( joinPath ( f , " fdb.cluster " ) ) ) {
writeFile ( joinPath ( f , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
}
} else if ( killType = = ISimulator : : RebootAndDelete ) {
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
coordFolders [ i ] = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
myFolders [ i ] = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
platform : : createDirectory ( myFolders [ i ] ) ;
if ( ! useSeedFile ) {
writeFile ( joinPath ( myFolders [ i ] , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
TEST ( true ) ; // Simulated machine rebooted with data loss
}
//this machine is rebooting = false;
}
} catch ( Error & e ) {
g_simulator . getMachineById ( localities . zoneId ( ) ) - > openFiles . clear ( ) ;
throw ;
}
}
# include "fdbclient/MonitorLeader.h"
ACTOR Future < Void > restartSimulatedSystem ( vector < Future < Void > > * systemActors , std : : string baseFolder ,
int * pTesterCount , Optional < ClusterConnectionString > * pConnString ) {
CSimpleIni ini ;
ini . SetUnicode ( ) ;
ini . LoadFile ( joinPath ( baseFolder , " restartInfo.ini " ) . c_str ( ) ) ;
// allows multiple ipAddr entries
ini . SetMultiKey ( ) ;
try {
int dataCenters = atoi ( ini . GetValue ( " META " , " dataCenters " ) ) ;
int killableMachines = atoi ( ini . GetValue ( " META " , " killableMachines " ) ) ;
int machineCount = atoi ( ini . GetValue ( " META " , " machineCount " ) ) ;
int machinesNeededForProgress = atoi ( ini . GetValue ( " META " , " machinesNeededForProgress " ) ) ;
int processesPerMachine = atoi ( ini . GetValue ( " META " , " processesPerMachine " ) ) ;
int desiredCoordinators = atoi ( ini . GetValue ( " META " , " desiredCoordinators " ) ) ;
int testerCount = atoi ( ini . GetValue ( " META " , " testerCount " ) ) ;
ClusterConnectionString conn ( ini . GetValue ( " META " , " connectionString " ) ) ;
* pConnString = conn ;
* pTesterCount = testerCount ;
bool usingSSL = conn . toString ( ) . find ( " :tls " ) ! = std : : string : : npos ;
int useSeedForMachine = g_random - > randomInt ( 0 , machineCount ) ;
for ( int i = 0 ; i < machineCount ; i + + ) {
Optional < Standalone < StringRef > > dcUID ;
std : : string zoneIdString = ini . GetValue ( " META " , format ( " %d " , i ) . c_str ( ) ) ;
Standalone < StringRef > zoneId = StringRef ( zoneIdString ) ;
std : : string dcUIDini = ini . GetValue ( zoneIdString . c_str ( ) , " dcUID " ) ;
if ( ! dcUIDini . empty ( ) ) dcUID = StringRef ( dcUIDini ) ;
ProcessClass processClass = ProcessClass ( ( ProcessClass : : ClassType ) atoi ( ini . GetValue ( zoneIdString . c_str ( ) , " mClass " ) ) , ProcessClass : : CommandLineSource ) ;
std : : vector < uint32_t > ipAddrs ;
int processes = atoi ( ini . GetValue ( zoneIdString . c_str ( ) , " processes " ) ) ;
auto ip = ini . GetValue ( zoneIdString . c_str ( ) , " ipAddr " ) ;
if ( ip = = NULL ) {
for ( int i = 0 ; i < processes ; i + + ) {
ipAddrs . push_back ( strtoul ( ini . GetValue ( zoneIdString . c_str ( ) , format ( " ipAddr%d " , i ) . c_str ( ) ) , NULL , 10 ) ) ;
}
}
else {
// old way
ipAddrs . push_back ( strtoul ( ip , NULL , 10 ) ) ;
for ( int i = 1 ; i < processes ; i + + ) {
ipAddrs . push_back ( ipAddrs . back ( ) + 1 ) ;
}
}
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , zoneId , zoneId , dcUID ) ;
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
systemActors - > push_back ( reportErrors ( simulatedMachine (
conn , ipAddrs , usingSSL , localities , processClass , baseFolder , true , i = = useSeedForMachine , false ) ,
processClass = = ProcessClass : : TesterClass ? " SimulatedTesterMachine " : " SimulatedMachine " ) ) ;
}
g_simulator . killableMachines = killableMachines ;
g_simulator . neededDatacenters = dataCenters ;
g_simulator . maxCoordinatorsInDatacenter = ( ( desiredCoordinators - 1 ) / dataCenters ) + 1 ;
g_simulator . killableDatacenters = 0 ;
g_simulator . machinesNeededForProgress = machinesNeededForProgress ;
g_simulator . desiredCoordinators = desiredCoordinators ;
g_simulator . processesPerMachine = processesPerMachine ;
}
catch ( Error & e ) {
TraceEvent ( SevError , " restartSimulationError " ) . error ( e ) ;
}
TraceEvent ( " RestartSimulatorSettings " )
. detail ( " killableMachines " , g_simulator . killableMachines )
. detail ( " neededDatacenters " , g_simulator . neededDatacenters )
. detail ( " killableDatacenters " , g_simulator . killableDatacenters )
. detail ( " machinesNeededForProgress " , g_simulator . machinesNeededForProgress )
. detail ( " maxCoordinatorsInDatacenter " , g_simulator . maxCoordinatorsInDatacenter )
. detail ( " desiredCoordinators " , g_simulator . desiredCoordinators )
. detail ( " processesPerMachine " , g_simulator . processesPerMachine ) ;
Void _ = wait ( delay ( 1.0 ) ) ;
return Void ( ) ;
}
2017-08-12 06:06:36 +08:00
struct SimulationConfig {
explicit SimulationConfig ( int extraDB ) ;
int extraDB ;
const int NOT_SPECIFIED = - 1 ;
int logs ;
int proxies ;
int resolvers ;
// Predefined redundancy mode name
const char * config_name ;
// Custom redundancy mode
// Only used if config_name == nullptr
int log_replicas = NOT_SPECIFIED ;
int log_anti_quorum = NOT_SPECIFIED ;
int storage_replicas = NOT_SPECIFIED ;
int storage_quorum = NOT_SPECIFIED ;
int replica_datacenters = NOT_SPECIFIED ;
int min_replica_datacenters = NOT_SPECIFIED ;
// Simulation layout
int datacenters ;
int machine_count ; // Total, not per DC.
int processes_per_machine ;
int coordinators ;
const char * storage_engine ;
int32_t minMachinesRequired ( ) const { return std : : max ( log_replicas , storage_replicas ) ; }
int32_t maxMachineFailuresTolerated ( ) const { return std : : min ( log_replicas - 1 - log_anti_quorum , storage_quorum - 1 ) ; }
std : : string toString ( ) ;
private :
void generateNormalConfig ( ) ;
} ;
SimulationConfig : : SimulationConfig ( int extraDB ) : extraDB ( extraDB ) {
// Set up the simplest most default config.
logs = CLIENT_KNOBS - > DEFAULT_AUTO_LOGS ;
proxies = CLIENT_KNOBS - > DEFAULT_AUTO_PROXIES ;
resolvers = CLIENT_KNOBS - > DEFAULT_AUTO_RESOLVERS ;
log_replicas = logs ;
storage_replicas = storage_quorum = logs ;
log_anti_quorum = 0 ;
config_name = " single " ;
storage_engine = " memory " ;
datacenters = 1 ;
machine_count = 1 ;
processes_per_machine = 1 ;
coordinators = 1 ;
generateNormalConfig ( ) ;
}
2017-05-26 04:48:44 +08:00
2017-08-12 06:06:36 +08:00
void SimulationConfig : : generateNormalConfig ( ) {
datacenters = g_random - > randomInt ( 1 , 4 ) ;
if ( g_random - > random01 ( ) < 0.25 ) logs = g_random - > randomInt ( 1 , 7 ) ;
if ( g_random - > random01 ( ) < 0.25 ) proxies = g_random - > randomInt ( 1 , 7 ) ;
if ( g_random - > random01 ( ) < 0.25 ) resolvers = g_random - > randomInt ( 1 , 7 ) ;
storage_engine = g_random - > random01 ( ) < 0.5 ? " ssd " : " memory " ;
int replication_type = g_random - > randomInt ( 1 , 6 ) ;
//replication_type = 1; //ahm
switch ( replication_type ) {
case 0 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster using custom redundancy mode
2017-08-12 06:06:36 +08:00
config_name = nullptr ;
storage_replicas = g_random - > randomInt ( 1 , 5 ) ;
storage_quorum = storage_replicas ;
log_replicas = g_random - > randomInt ( 1 , 5 ) ;
log_anti_quorum = g_random - > randomInt ( 0 , log_replicas ) ;
replica_datacenters = 1 ;
min_replica_datacenters = 1 ;
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 1 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in single redundancy mode
2017-08-12 06:06:36 +08:00
config_name = " single " ;
storage_replicas = storage_quorum = 1 ;
log_replicas = 1 ;
log_anti_quorum = 0 ;
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 2 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in double redundancy mode
2017-08-12 06:06:36 +08:00
config_name = " double " ;
storage_replicas = storage_quorum = 2 ;
log_replicas = 2 ;
log_anti_quorum = 0 ;
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 3 : {
if ( datacenters = = 1 ) {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in triple redundancy mode
2017-08-12 06:06:36 +08:00
config_name = " triple " ;
storage_replicas = storage_quorum = 3 ;
log_replicas = 3 ;
log_anti_quorum = 0 ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
else if ( datacenters = = 2 ) {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in 2 datacenter mode
2017-08-12 06:06:36 +08:00
config_name = " two_datacenter " ;
storage_replicas = storage_quorum = 3 ;
log_replicas = 3 ;
log_anti_quorum = 0 ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
else if ( datacenters = = 3 ) {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in 3 data-hall mode
2017-08-12 06:06:36 +08:00
config_name = " three_data_hall " ;
storage_replicas = storage_quorum = 4 ;
log_replicas = 4 ;
log_anti_quorum = 0 ;
2017-05-26 04:48:44 +08:00
}
else {
ASSERT ( false ) ;
}
2017-08-12 06:06:36 +08:00
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 4 :
case 5 :
// Leave all settings at their default
break ;
default :
ASSERT ( false ) ; // Programmer forgot to adjust cases.
}
machine_count = g_random - > randomInt ( std : : max ( 2 + datacenters , minMachinesRequired ( ) ) , extraDB ? 6 : 10 ) ;
processes_per_machine = g_random - > randomInt ( 1 , ( extraDB ? 14 : 28 ) / machine_count + 2 ) ;
coordinators = BUGGIFY ? g_random - > randomInt ( 1 , machine_count + 1 ) : std : : min ( machine_count , maxMachineFailuresTolerated ( ) * 2 + 1 ) ;
}
2017-05-26 04:48:44 +08:00
2017-08-12 06:06:36 +08:00
std : : string SimulationConfig : : toString ( ) {
std : : stringstream config ;
config < < " new " ;
# define COPY_INT(x) do { ASSERT(x != NOT_SPECIFIED); config << " " #x ":=" << x; } while(0)
# define COPY_STR(x) do { ASSERT(x != nullptr); config << " " << x; } while(0)
if ( config_name ! = nullptr ) {
COPY_STR ( config_name ) ;
} else {
COPY_INT ( log_replicas ) ;
COPY_INT ( log_anti_quorum ) ;
COPY_INT ( storage_replicas ) ;
COPY_INT ( storage_quorum ) ;
COPY_INT ( replica_datacenters ) ;
COPY_INT ( min_replica_datacenters ) ;
}
COPY_INT ( logs ) ;
COPY_INT ( proxies ) ;
COPY_INT ( resolvers ) ;
COPY_STR ( storage_engine ) ;
# undef COPY_INT
# undef COPY_STR
return config . str ( ) ;
2017-05-26 04:48:44 +08:00
}
void setupSimulatedSystem ( vector < Future < Void > > * systemActors , std : : string baseFolder ,
int * pTesterCount , Optional < ClusterConnectionString > * pConnString ,
Standalone < StringRef > * pStartingConfiguration , int extraDB )
{
// SOMEDAY: this does not test multi-interface configurations
2017-08-12 06:06:36 +08:00
SimulationConfig simconfig ( extraDB ) ;
std : : string startingConfigString = simconfig . toString ( ) ;
2017-05-26 04:48:44 +08:00
std : : map < std : : string , std : : string > startingConfigMap ;
ASSERT ( buildConfiguration ( startingConfigString , startingConfigMap ) = = ConfigurationResult : : SUCCESS ) ;
DatabaseConfiguration startingConfig ;
for ( auto kv : startingConfigMap ) startingConfig . set ( kv . first , kv . second ) ;
g_simulator . storagePolicy = startingConfig . storagePolicy ;
g_simulator . tLogPolicy = startingConfig . tLogPolicy ;
g_simulator . tLogWriteAntiQuorum = startingConfig . tLogWriteAntiQuorum ;
ASSERT ( g_simulator . storagePolicy ) ;
ASSERT ( g_simulator . tLogPolicy ) ;
TraceEvent ( " simulatorConfig " ) . detail ( " tLogPolicy " , g_simulator . tLogPolicy - > info ( ) ) . detail ( " storagePolicy " , g_simulator . storagePolicy - > info ( ) ) . detail ( " tLogWriteAntiQuorum " , g_simulator . tLogWriteAntiQuorum ) . detail ( " ConfigString " , startingConfigString ) ;
2017-08-12 06:06:36 +08:00
int dataCenters = simconfig . datacenters ;
int machineCount = simconfig . machine_count ;
2017-05-26 04:48:44 +08:00
// half the time, when we have more than 4 machines that are not the first in their dataCenter, assign classes
bool assignClasses = machineCount - dataCenters > 4 & & g_random - > random01 ( ) < 0.5 ;
2017-08-12 06:06:36 +08:00
int processesPerMachine = simconfig . processes_per_machine ;
2017-05-26 04:48:44 +08:00
// Use SSL half the time
bool sslEnabled = g_random - > random01 ( ) < 0.05 ;
TEST ( sslEnabled ) ; // SSL enabled
TEST ( ! sslEnabled ) ; // SSL disabled
// Pick coordination processes.
2017-08-12 06:06:36 +08:00
int coordinatorCount = simconfig . coordinators ;
2017-05-26 04:48:44 +08:00
vector < NetworkAddress > coordinatorAddresses ;
for ( int dc = 0 ; dc < dataCenters ; dc + + ) {
int machines = machineCount / dataCenters + ( dc < machineCount % dataCenters ) ; // add remainder of machines to first datacenter
int dcCoordinators = coordinatorCount / dataCenters + ( dc < coordinatorCount % dataCenters ) ;
for ( int m = 0 ; m < dcCoordinators ; m + + ) {
uint32_t ip = 2 < < 24 | dc < < 16 | 1 < < 8 | m ;
coordinatorAddresses . push_back ( NetworkAddress ( ip , 1 , true , sslEnabled ) ) ;
TraceEvent ( " SelectedCoordinator " ) . detail ( " Address " , coordinatorAddresses . back ( ) ) ;
}
}
2017-05-27 05:20:11 +08:00
2017-05-26 04:48:44 +08:00
g_random - > randomShuffle ( coordinatorAddresses ) ;
for ( int i = 0 ; i < ( coordinatorAddresses . size ( ) / 2 ) + 1 ; i + + ) {
2017-06-20 07:48:15 +08:00
TraceEvent ( " ProtectCoordinator " ) . detail ( " Address " , coordinatorAddresses [ i ] ) . detail ( " Coordinators " , describe ( coordinatorAddresses ) ) . backtrace ( ) ;
2017-06-03 04:52:21 +08:00
g_simulator . protectedAddresses . insert ( NetworkAddress ( coordinatorAddresses [ i ] . ip , coordinatorAddresses [ i ] . port , true , false ) ) ;
2017-05-26 04:48:44 +08:00
}
g_random - > randomShuffle ( coordinatorAddresses ) ;
2017-05-27 05:20:11 +08:00
2017-05-26 04:48:44 +08:00
ASSERT ( coordinatorAddresses . size ( ) = = coordinatorCount ) ;
ClusterConnectionString conn ( coordinatorAddresses , LiteralStringRef ( " TestCluster:0 " ) ) ;
g_simulator . extraDB = extraDB ? new ClusterConnectionString ( coordinatorAddresses , ( ( extraDB = = 1 & & BUGGIFY ) ? LiteralStringRef ( " TestCluster:0 " ) : LiteralStringRef ( " ExtraCluster:0 " ) ) ) : NULL ;
* pConnString = conn ;
TraceEvent ( " SimulatedConnectionString " ) . detail ( " String " , conn . toString ( ) ) . detail ( " ConfigString " , startingConfigString ) ;
int assignedMachines = 0 , nonVersatileMachines = 0 ;
for ( int dc = 0 ; dc < dataCenters ; dc + + ) {
Optional < Standalone < StringRef > > dcUID ;
if ( ( dc > 0 ) | | ( g_random - > random01 ( ) > 0.01 ) ) dcUID = StringRef ( format ( " %d " , dc ) ) ;
std : : vector < UID > machineIdentities ;
int machines = machineCount / dataCenters + ( dc < machineCount % dataCenters ) ; // add remainder of machines to first datacenter
int dcCoordinators = coordinatorCount / dataCenters + ( dc < coordinatorCount % dataCenters ) ;
printf ( " Datacenter %d: %d/%d machines, %d/%d coordinators \n " , dc , machines , machineCount , dcCoordinators , coordinatorCount ) ;
ASSERT ( dcCoordinators < = machines ) ;
int useSeedForMachine = g_random - > randomInt ( 0 , machines ) ;
for ( int machine = 0 ; machine < machines ; machine + + ) {
Standalone < StringRef > zoneId ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
//Choose a machine class
ProcessClass processClass = ProcessClass ( ProcessClass : : UnsetClass , ProcessClass : : CommandLineSource ) ;
if ( assignClasses ) {
if ( assignedMachines < 4 )
processClass = ProcessClass ( ( ProcessClass : : ClassType ) g_random - > randomInt ( 0 , 2 ) , ProcessClass : : CommandLineSource ) ; //Unset or Storage
else if ( assignedMachines = = 4 )
processClass = ProcessClass ( ( ProcessClass : : ClassType ) ( g_random - > randomInt ( 0 , 2 ) * ProcessClass : : ResolutionClass ) , ProcessClass : : CommandLineSource ) ; //Unset or Resolution
else
processClass = ProcessClass ( ( ProcessClass : : ClassType ) g_random - > randomInt ( 0 , 3 ) , ProcessClass : : CommandLineSource ) ; //Unset, Storage, or Transaction
if ( processClass = = ProcessClass : : ResolutionClass ) // *can't* be assigned to other roles, even in an emergency
nonVersatileMachines + + ;
}
std : : vector < uint32_t > ips ;
for ( int i = 0 ; i < processesPerMachine ; i + + ) {
ips . push_back ( 2 < < 24 | dc < < 16 | g_random - > randomInt ( 1 , i + 2 ) < < 8 | machine ) ;
}
// check the sslEnablementMap using only one ip(
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , zoneId , zoneId , dcUID ) ;
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
systemActors - > push_back ( reportErrors ( simulatedMachine ( conn , ips , sslEnabled ,
localities , processClass , baseFolder , false , machine = = useSeedForMachine , true ) , " SimulatedMachine " ) ) ;
if ( extraDB ) {
std : : vector < uint32_t > extraIps ;
for ( int i = 0 ; i < processesPerMachine ; i + + ) {
extraIps . push_back ( 4 < < 24 | dc < < 16 | g_random - > randomInt ( 1 , i + 2 ) < < 8 | machine ) ;
}
Standalone < StringRef > newZoneId = Standalone < StringRef > ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , newZoneId , newZoneId , dcUID ) ;
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
systemActors - > push_back ( reportErrors ( simulatedMachine ( * g_simulator . extraDB , extraIps , sslEnabled ,
localities ,
processClass , baseFolder , false , machine = = useSeedForMachine , false ) , " SimulatedMachine " ) ) ;
}
assignedMachines + + ;
}
}
g_simulator . desiredCoordinators = coordinatorCount ;
g_simulator . killableMachines = startingConfig . maxMachineFailuresTolerated ( ) ;
2017-06-27 07:27:10 +08:00
g_simulator . neededDatacenters = 1 ;
g_simulator . killableDatacenters = 0 ;
2017-05-26 04:48:44 +08:00
g_simulator . physicalDatacenters = dataCenters ;
g_simulator . maxCoordinatorsInDatacenter = ( ( coordinatorCount - 1 ) / dataCenters ) + 1 ;
g_simulator . machinesNeededForProgress = startingConfig . minMachinesRequired ( ) + nonVersatileMachines ;
g_simulator . processesPerMachine = processesPerMachine ;
TraceEvent ( " SetupSimulatorSettings " )
. detail ( " killableMachines " , g_simulator . killableMachines )
. detail ( " neededDatacenters " , g_simulator . neededDatacenters )
. detail ( " killableDatacenters " , g_simulator . killableDatacenters )
. detail ( " machinesNeededForProgress " , g_simulator . machinesNeededForProgress )
. detail ( " maxCoordinatorsInDatacenter " , g_simulator . maxCoordinatorsInDatacenter )
. detail ( " desiredCoordinators " , g_simulator . desiredCoordinators )
. detail ( " processesPerMachine " , g_simulator . processesPerMachine ) ;
// SOMEDAY: add locality for testers to simulate network topology
// FIXME: Start workers with tester class instead, at least sometimes run tests with the testers-only flag
int testerCount = * pTesterCount = g_random - > randomInt ( 4 , 9 ) ;
int useSeedForMachine = g_random - > randomInt ( 0 , testerCount ) ;
for ( int i = 0 ; i < testerCount ; i + + ) {
std : : vector < uint32_t > ips ;
ips . push_back ( 0x03040301 + i ) ;
Standalone < StringRef > newZoneId = Standalone < StringRef > ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , newZoneId , newZoneId , Optional < Standalone < StringRef > > ( ) ) ;
systemActors - > push_back ( reportErrors ( simulatedMachine (
conn , ips , sslEnabled ,
localities , ProcessClass ( ProcessClass : : TesterClass , ProcessClass : : CommandLineSource ) ,
baseFolder , false , i = = useSeedForMachine , false ) ,
" SimulatedTesterMachine " ) ) ;
}
/*int testerCount = g_random->randomInt(4, 9);
for ( int i = 0 ; i < testerCount ; i + + )
g_simulator . asNewProcess ( " TestWorker " , 0x03040301 + i , LocalityData ( g_random - > randomUniqueID ( ) . toString ( ) , Optional < Standalone < StringRef > > ( ) ) , [ & ] {
vector < Future < Void > > v ;
Reference < AsyncVar < ClusterControllerFullInterface > > cc ( new AsyncVar < ClusterControllerFullInterface > ) ;
Reference < AsyncVar < ClusterInterface > > ci ( new AsyncVar < ClusterInterface > ) ;
v . push_back ( monitorLeader ( coordinators , cc ) ) ;
v . push_back ( extractClusterInterface ( cc , ci ) ) ;
v . push_back ( failureMonitorClient ( ci ) ) ;
v . push_back ( testerServer ( cc ) ) ;
systemActors - > push_back ( waitForAll ( v ) ) ;
} ) ; */
* pStartingConfiguration = startingConfigString ;
// save some state that we only need when restarting the simulator.
g_simulator . connectionString = conn . toString ( ) ;
g_simulator . testerCount = testerCount ;
TraceEvent ( " SimulatedClusterStarted " )
. detail ( " KillableMachines " , g_simulator . killableMachines )
. detail ( " DataCenters " , dataCenters )
. detail ( " NeededDataCenters " , g_simulator . neededDatacenters )
. detail ( " ServerMachineCount " , machineCount )
. detail ( " ServersNeededForProgress " , g_simulator . machinesNeededForProgress )
. detail ( " ProcessesPerServer " , processesPerMachine )
. detail ( " SSLEnabled " , sslEnabled )
. detail ( " ClassesAssigned " , assignClasses )
. detail ( " StartingConfiguration " , pStartingConfiguration - > toString ( ) )
//.detail("TesterCount", testerCount)
;
// FIXME
/*Void _ = wait( DatabaseContext::configureDatabase( *pZookeeper, ClusterInterface::DEFAULT, mode ) );
Void _ = wait ( DatabaseContext : : configureDatabase ( * pZookeeper , ClusterInterface : : ALL , mode ) ) ; */
}
int checkExtraDB ( const char * testFile ) {
std : : ifstream ifs ;
ifs . open ( testFile , std : : ifstream : : in ) ;
if ( ! ifs . good ( ) )
return 0 ;
std : : string cline ;
while ( ifs . good ( ) ) {
getline ( ifs , cline ) ;
std : : string line = removeWhitespace ( std : : string ( cline ) ) ;
if ( ! line . size ( ) | | line . find ( ' ; ' ) = = 0 )
continue ;
size_t found = line . find ( ' = ' ) ;
if ( found = = std : : string : : npos )
// hmmm, not good
continue ;
std : : string attrib = removeWhitespace ( line . substr ( 0 , found ) ) ;
std : : string value = removeWhitespace ( line . substr ( found + 1 ) ) ;
if ( attrib = = " extraDB " ) {
int v = 0 ;
sscanf ( value . c_str ( ) , " %d " , & v ) ;
ifs . close ( ) ;
return v ;
}
}
ifs . close ( ) ;
return 0 ;
}
ACTOR void setupAndRun ( std : : string dataFolder , const char * testFile , bool rebooting ) {
state vector < Future < Void > > systemActors ;
state Optional < ClusterConnectionString > connFile ;
state Standalone < StringRef > startingConfiguration ;
state int testerCount = 1 ;
state int extraDB = checkExtraDB ( testFile ) ;
Void _ = wait ( g_simulator . onProcess ( g_simulator . newProcess (
2017-05-27 05:20:11 +08:00
" TestSystem " , 0x01010101 , 1 , LocalityData ( Optional < Standalone < StringRef > > ( ) , Standalone < StringRef > ( g_random - > randomUniqueID ( ) . toString ( ) ) , Optional < Standalone < StringRef > > ( ) , Optional < Standalone < StringRef > > ( ) ) , ProcessClass ( ProcessClass : : TesterClass , ProcessClass : : CommandLineSource ) , " " , " " ) , TaskDefaultYield ) ) ;
2017-05-26 04:48:44 +08:00
Sim2FileSystem : : newFileSystem ( ) ;
FlowTransport : : createInstance ( 1 ) ;
simInitTLS ( ) ;
TEST ( true ) ; // Simulation start
try {
//systemActors.push_back( startSystemMonitor(dataFolder) );
if ( rebooting ) {
Void _ = wait ( timeoutError ( restartSimulatedSystem ( & systemActors , dataFolder , & testerCount , & connFile ) , 100.0 ) ) ;
}
else {
g_expect_full_pointermap = 1 ;
setupSimulatedSystem ( & systemActors , dataFolder , & testerCount , & connFile , & startingConfiguration , extraDB ) ;
Void _ = wait ( delay ( 1.0 ) ) ; // FIXME: WHY!!! //wait for machines to boot
}
std : : string clusterFileDir = joinPath ( dataFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
platform : : createDirectory ( clusterFileDir ) ;
writeFile ( joinPath ( clusterFileDir , " fdb.cluster " ) , connFile . get ( ) . toString ( ) ) ;
Void _ = wait ( timeoutError ( runTests ( Reference < ClusterConnectionFile > ( new ClusterConnectionFile ( joinPath ( clusterFileDir , " fdb.cluster " ) ) ) , TEST_TYPE_FROM_FILE , TEST_ON_TESTERS , testerCount , testFile , startingConfiguration ) , buggifyActivated ? 36000.0 : 5400.0 ) ) ;
} catch ( Error & e ) {
TraceEvent ( SevError , " setupAndRunError " ) . error ( e ) ;
}
TraceEvent ( " SimulatedSystemDestruct " ) ;
destructed = true ;
systemActors . clear ( ) ;
g_simulator . stop ( ) ;
}