/*
 * SimulatedCluster.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
# include <fstream>
# include "fdbrpc/simulator.h"
# include "fdbclient/FailureMonitorClient.h"
# include "fdbclient/DatabaseContext.h"
2019-02-18 11:25:16 +08:00
# include "fdbserver/TesterInterface.actor.h"
2019-02-18 11:13:26 +08:00
# include "fdbserver/WorkerInterface.actor.h"
2017-05-26 04:48:44 +08:00
# include "fdbclient/ClusterInterface.h"
2018-10-20 01:30:13 +08:00
# include "fdbserver/Knobs.h"
# include "fdbserver/ClusterRecruitmentInterface.h"
2017-05-26 04:48:44 +08:00
# include "fdbserver/CoordinationInterface.h"
# include "fdbmonitor/SimpleIni.h"
# include "fdbrpc/AsyncFileNonDurable.actor.h"
# include "fdbrpc/TLSConnection.h"
2019-02-18 09:38:13 +08:00
# include "fdbclient/ManagementAPI.actor.h"
2019-02-18 07:41:16 +08:00
# include "fdbclient/NativeAPI.actor.h"
2019-02-18 07:19:05 +08:00
# include "fdbclient/BackupAgent.actor.h"
2019-02-09 04:24:32 +08:00
# if defined(CMAKE_BUILD) || !defined(WIN32)
2017-05-26 04:48:44 +08:00
# include "versions.h"
# endif
2018-08-11 06:18:24 +08:00
# include "flow/actorcompiler.h" // This must be the last #include.
2017-05-26 04:48:44 +08:00
# undef max
# undef min
extern bool buggifyActivated ;
extern " C " int g_expect_full_pointermap ;
extern const char * getHGVersion ( ) ;
const int PROCESS_START_TIME = 4 ;
const int MACHINE_REBOOT_TIME = 10 ;
bool destructed = false ;
static const char * certBytes =
" -----BEGIN CERTIFICATE----- \n "
" MIIEGzCCAwOgAwIBAgIJANUQj1rRA2XMMA0GCSqGSIb3DQEBBQUAMIGjMQswCQYD \n "
" VQQGEwJVUzELMAkGA1UECAwCVkExDzANBgNVBAcMBlZpZW5uYTEaMBgGA1UECgwR \n "
" Rm91bmRhdGlvbkRCLCBMTEMxGTAXBgNVBAsMEFRlc3QgZW5naW5lZXJpbmcxFTAT \n "
" BgNVBAMMDE1yLiBCaWcgVHVuYTEoMCYGCSqGSIb3DQEJARYZYmlnLnR1bmFAZm91 \n "
" bmRhdGlvbmRiLmNvbTAeFw0xNDEyMDUxNTEyMjFaFw0yNDEyMDIxNTEyMjFaMIGj \n "
" MQswCQYDVQQGEwJVUzELMAkGA1UECAwCVkExDzANBgNVBAcMBlZpZW5uYTEaMBgG \n "
" A1UECgwRRm91bmRhdGlvbkRCLCBMTEMxGTAXBgNVBAsMEFRlc3QgZW5naW5lZXJp \n "
" bmcxFTATBgNVBAMMDE1yLiBCaWcgVHVuYTEoMCYGCSqGSIb3DQEJARYZYmlnLnR1 \n "
" bmFAZm91bmRhdGlvbmRiLmNvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoC \n "
" ggEBAKZTL2edDkiet4HBTZnjysn6gOVZH2MP02KVBIv/H7e+3w7ZOIRvcPzhZe9M \n "
" 3cGH1t/pkr9DSXvzIb42EffMVlpLD2VQn2H8VC2QSdJCIQcf802u+Taf+XtW6K1h \n "
" p/YPL1uhdopUs3c1oon8ykKwnOfrQYgv5pUa7jQdMkltI2MQJU3uFq3Z/LHTvIKe \n "
" FN+bqK0iYhZthwMG7Rld4+RgKZoT4u1B6w/duEWk9KLjgs7fTf3Oe6JHCYNqwBJi \n "
" 78sJalwXz9Wf8wmMaYSG0XNA7vBOdpTFhVPSsh6e3rkydf5HydMade/II98MWpMe \n "
" hFg7FFMaJP6ig8p5iL+9QP2VMCkCAwEAAaNQME4wHQYDVR0OBBYEFIXGmIcKptBP \n "
" v3i9WS/mK78o5E/MMB8GA1UdIwQYMBaAFIXGmIcKptBPv3i9WS/mK78o5E/MMAwG \n "
" A1UdEwQFMAMBAf8wDQYJKoZIhvcNAQEFBQADggEBAJkVgNGOXT+ZHCNEYLjr/6OM \n "
" UCHvwlMeaEyqxaOmK26J2kAADPhjBZ7lZOHWb2Wzb+BiQUIFGwNIMoRvsg8skpJa \n "
" OCqpVciHVXY/U8BiYY70DKozRza93Ab9om3pySGDJ/akdCjqbMT1Cb7Kloyw+hNh \n "
" XD4MML0lYiUE9KK35xyK6FgTx4A7IXl4b3lWBgglqTh4+P5J1+xy8AYJ0VfPoP7y \n "
" OoZgwAmkpkMnalReNkN7LALHGqMzv/qH04ODlkU/HUGgExtnINMxK9VEDIe/yLGm \n "
" DHy7gcQMj5Hyymack/d4ZF8CSrYpGZQeZGXoxOmTDwWcXgnYA+2o7lOYPb5Uu08= \n "
" -----END CERTIFICATE----- \n "
" -----BEGIN PRIVATE KEY----- \n "
" MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQCmUy9nnQ5InreB \n "
" wU2Z48rJ+oDlWR9jD9NilQSL/x+3vt8O2TiEb3D84WXvTN3Bh9bf6ZK/Q0l78yG+ \n "
" NhH3zFZaSw9lUJ9h/FQtkEnSQiEHH/NNrvk2n/l7VuitYaf2Dy9boXaKVLN3NaKJ \n "
" /MpCsJzn60GIL+aVGu40HTJJbSNjECVN7hat2fyx07yCnhTfm6itImIWbYcDBu0Z \n "
" XePkYCmaE+LtQesP3bhFpPSi44LO3039znuiRwmDasASYu/LCWpcF8/Vn/MJjGmE \n "
" htFzQO7wTnaUxYVT0rIent65MnX+R8nTGnXvyCPfDFqTHoRYOxRTGiT+ooPKeYi/ \n "
" vUD9lTApAgMBAAECggEBAIYCmDtfq9aPK0P8v82yX/4FPD2OZV+nrKXNc3BpCuE9 \n "
" hPOtyX/LWrol0b/Rqwr3rAWVaIt6Z4bbCuD7J9cEaL8voyP6pbCJYjmj/BbQ+VOI \n "
" Rrzcsid1Fcpu5+JqwK3c5kdp/NzQChmOuXt8lmrNal7iilZ0YdDZdfu/WnkW2mBB \n "
" oQHkujlnWr4PNYdwMOnBU6TwdOuz+inPVMLohOO0Vr585OxPsGzG2Ud3yQ/t34Cq \n "
" F9nmOXQoszftGKsL1yuh/3fGj/O86g/CRsUy05qZhDDBEYQD6qZCvD5+yp8oOWIR \n "
" SljM3GXDBnJqRPhP+Nyf6e6/GoQtfVZ9MPRzDDPzIBECgYEA2kX/zAs6taOiNqCb \n "
" 6nVGe7/3uQJz/CkmOSKIFKUu7lCEUjmMYpK3Xzp26RTUR9cT+g9y+cnJO1Vbaxtf \n "
" Qidje6K+Oi1pQyUGQ6W+U8cPJHz43PVa7IB5Az5i/sS2tu0BGhvGo9G6iYQjxXeD \n "
" 1197DRACgnm5AORQMum616XvSPMCgYEAwxKbkAzJzfZF6A3Ys+/0kycNfDP8xZoC \n "
" 1zV3d1b2JncsdAPCHYSKtpniRrQN9ASa3RMdkh+wrMN/KlbtU9Ddoc4NHxSTFV7F \n "
" wypFMzLZslqkQ6uHnVVewHV7prfoKsMci2c9iHO7W8TEv4aqW8XDd8OozP3/q2j4 \n "
" hvL7VIAVqXMCgYEAwAFnfOQ75uBkp00tGlfDgsRhc5vWz3CbMRNRRWfxGq41V+dL \n "
" uMJ7EAfr5ijue6uU5RmF+HkqzUjOvC894oGnn3CPibm8qNX+5q7799JZXa2ZdTVX \n "
" oEd7LAFLL/V3DP77Qy4/1Id/Ycydcu0pSuGw6tK0gnX06fXtHnxAYcaT8UUCgYAE \n "
" MytcP5o8r/ezVlD7Fsh6PpYAvZHMo1M6VPFchWfJTjmLyeTtA8SEx+1iPlAql8rJ \n "
" xbaWRc5k+dSMEdEMQ+vxpuELcUL1a9PwLsHMp2SefWsZ9eB2l7bxh9YAsebyvL6p \n "
" lbBydqNrB2KBCSIz1Z8uveytdS6C/0CSjzqwCA3vVwKBgQDAXqjo3xrzMlHeXm5o \n "
" qH/OjajjqbnPXHolHDitbLubyQ4E6KhMBMxfChBe/8VptB/Gs0efVbMVGuabxY7Q \n "
" iastGId8HyONy3UPGPxCn4b95cIxKvdpt+hvWtYHIBCfHXluQK7zsDMgvtXjYNiz \n "
" peZRikYlwmu1K2YRTf7oLE2Ogw== \n "
" -----END PRIVATE KEY----- \n " ;
template < class T >
T simulate ( const T & in ) {
BinaryWriter writer ( AssumeVersion ( currentProtocolVersion ) ) ;
writer < < in ;
BinaryReader reader ( writer . getData ( ) , writer . getLength ( ) , AssumeVersion ( currentProtocolVersion ) ) ;
T out ;
reader > > out ;
return out ;
}
2018-04-26 09:29:29 +08:00
static void simInitTLS ( Reference < TLSOptions > tlsOptions ) {
tlsOptions - > set_cert_data ( certBytes ) ;
tlsOptions - > set_key_data ( certBytes ) ;
2018-06-22 08:05:11 +08:00
tlsOptions - > set_verify_peers ( std : : vector < std : : string > ( 1 , " Check.Valid=0 " ) ) ;
2018-04-26 09:29:29 +08:00
tlsOptions - > register_network ( ) ;
2017-05-26 04:48:44 +08:00
}
ACTOR Future < Void > runBackup ( Reference < ClusterConnectionFile > connFile ) {
state std : : vector < Future < Void > > agentFutures ;
while ( g_simulator . backupAgents = = ISimulator : : WaitForType ) {
2018-08-11 04:57:10 +08:00
wait ( delay ( 1.0 ) ) ;
2017-05-26 04:48:44 +08:00
}
if ( g_simulator . backupAgents = = ISimulator : : BackupToFile ) {
2018-09-22 06:58:14 +08:00
Database cx = Database : : createDatabase ( connFile , - 1 ) ;
2017-05-26 04:48:44 +08:00
state FileBackupAgent fileAgent ;
state double backupPollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
agentFutures . push_back ( fileAgent . run ( cx , & backupPollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
while ( g_simulator . backupAgents = = ISimulator : : BackupToFile ) {
2018-08-11 04:57:10 +08:00
wait ( delay ( 1.0 ) ) ;
2017-05-26 04:48:44 +08:00
}
for ( auto it : agentFutures ) {
it . cancel ( ) ;
}
}
2018-02-21 05:22:31 +08:00
2018-08-11 08:25:43 +08:00
wait ( Future < Void > ( Never ( ) ) ) ;
2018-02-21 05:22:31 +08:00
throw internal_error ( ) ;
}
ACTOR Future < Void > runDr ( Reference < ClusterConnectionFile > connFile ) {
state std : : vector < Future < Void > > agentFutures ;
while ( g_simulator . drAgents = = ISimulator : : WaitForType ) {
2018-08-11 04:57:10 +08:00
wait ( delay ( 1.0 ) ) ;
2018-02-21 05:22:31 +08:00
}
if ( g_simulator . drAgents = = ISimulator : : BackupToDB ) {
2018-09-22 06:58:14 +08:00
Database cx = Database : : createDatabase ( connFile , - 1 ) ;
2017-05-26 04:48:44 +08:00
Reference < ClusterConnectionFile > extraFile ( new ClusterConnectionFile ( * g_simulator . extraDB ) ) ;
2018-09-22 06:58:14 +08:00
state Database extraDB = Database : : createDatabase ( extraFile , - 1 ) ;
2017-05-26 04:48:44 +08:00
2018-06-09 02:11:08 +08:00
TraceEvent ( " StartingDrAgents " ) . detail ( " ConnFile " , connFile - > getConnectionString ( ) . toString ( ) ) . detail ( " ExtraString " , extraFile - > getConnectionString ( ) . toString ( ) ) ;
2017-05-26 04:48:44 +08:00
state DatabaseBackupAgent dbAgent = DatabaseBackupAgent ( cx ) ;
state DatabaseBackupAgent extraAgent = DatabaseBackupAgent ( extraDB ) ;
state double dr1PollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
state double dr2PollDelay = 1.0 / CLIENT_KNOBS - > BACKUP_AGGREGATE_POLL_RATE ;
agentFutures . push_back ( extraAgent . run ( cx , & dr1PollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
agentFutures . push_back ( dbAgent . run ( extraDB , & dr2PollDelay , CLIENT_KNOBS - > SIM_BACKUP_TASKS_PER_AGENT ) ) ;
2018-02-21 05:22:31 +08:00
while ( g_simulator . drAgents = = ISimulator : : BackupToDB ) {
2018-08-11 04:57:10 +08:00
wait ( delay ( 1.0 ) ) ;
2017-05-26 04:48:44 +08:00
}
2018-02-21 05:22:31 +08:00
TraceEvent ( " StoppingDrAgents " ) ;
2017-05-26 04:48:44 +08:00
for ( auto it : agentFutures ) {
it . cancel ( ) ;
}
}
2018-08-11 08:25:43 +08:00
wait ( Future < Void > ( Never ( ) ) ) ;
2017-05-26 04:48:44 +08:00
throw internal_error ( ) ;
}
// SOMEDAY: when a process can be rebooted in isolation from the other on that machine,
// a loop{} will be needed around the waiting on simulatedFDBD(). For now this simply
// takes care of house-keeping such as context switching and file closing.
ACTOR Future < ISimulator : : KillType > simulatedFDBDRebooter (
Reference < ClusterConnectionFile > connFile ,
uint32_t ip ,
2018-04-26 09:29:29 +08:00
bool sslEnabled ,
Reference < TLSOptions > tlsOptions ,
2017-05-26 04:48:44 +08:00
uint16_t port ,
2018-12-07 03:48:50 +08:00
uint16_t listenPerProcess ,
2017-05-26 04:48:44 +08:00
LocalityData localities ,
ProcessClass processClass ,
std : : string * dataFolder ,
std : : string * coordFolder ,
std : : string baseFolder ,
ClusterConnectionString connStr ,
bool useSeedFile ,
bool runBackupAgents )
{
state ISimulator : : ProcessInfo * simProcess = g_simulator . getCurrentProcess ( ) ;
2017-06-20 07:48:15 +08:00
state UID randomId = g_nondeterministic_random - > randomUniqueID ( ) ;
state int cycles = 0 ;
2017-05-26 04:48:44 +08:00
loop {
auto waitTime = SERVER_KNOBS - > MIN_REBOOT_TIME + ( SERVER_KNOBS - > MAX_REBOOT_TIME - SERVER_KNOBS - > MIN_REBOOT_TIME ) * g_random - > random01 ( ) ;
cycles + + ;
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedFDBDPreWait " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
. detail ( " Address " , NetworkAddress ( ip , port , true , false ) )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
2018-06-09 02:11:08 +08:00
. detail ( " WaitTime " , waitTime ) . detail ( " Port " , port ) ;
2017-05-26 04:48:44 +08:00
2018-08-11 04:57:10 +08:00
wait ( delay ( waitTime ) ) ;
2017-05-26 04:48:44 +08:00
2018-12-07 03:48:50 +08:00
state ISimulator : : ProcessInfo * process = g_simulator . newProcess ( " Server " , ip , port , listenPerProcess , localities , processClass , dataFolder - > c_str ( ) , coordFolder - > c_str ( ) ) ;
2018-08-11 04:57:10 +08:00
wait ( g_simulator . onProcess ( process , TaskDefaultYield ) ) ; // Now switch execution to the process on which we will run
2017-05-26 04:48:44 +08:00
state Future < ISimulator : : KillType > onShutdown = process - > onShutdown ( ) ;
try {
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedRebooterStarting " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address . toString ( ) )
. detail ( " Excluded " , process - > excluded )
2018-04-26 09:29:29 +08:00
. detail ( " UsingSSL " , sslEnabled ) ;
2017-08-29 02:25:37 +08:00
TraceEvent ( " ProgramStart " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " SourceVersion " , getHGVersion ( ) )
. detail ( " Version " , FDB_VT_VERSION )
. detail ( " PackageName " , FDB_VT_PACKAGE_NAME )
. detail ( " DataFolder " , * dataFolder )
. detail ( " ConnectionString " , connFile ? connFile - > getConnectionString ( ) . toString ( ) : " " )
. detailf ( " ActualTime " , " %lld " , DEBUG_DETERMINISM ? 0 : time ( NULL ) )
. detail ( " CommandLine " , " fdbserver -r simulation " )
. detail ( " BuggifyEnabled " , buggifyActivated )
. detail ( " Simulated " , true )
. trackLatest ( " ProgramStart " ) ;
try {
//SOMEDAY: test lower memory limits, without making them too small and causing the database to stop making progress
FlowTransport : : createInstance ( 1 ) ;
Sim2FileSystem : : newFileSystem ( ) ;
2018-04-26 09:29:29 +08:00
if ( sslEnabled ) {
tlsOptions - > register_network ( ) ;
2018-04-25 07:46:01 +08:00
}
2018-12-07 03:48:50 +08:00
vector < Future < Void > > futures ;
for ( int listenPort = port ; listenPort < port + listenPerProcess ; + + listenPort ) {
2019-02-01 10:20:14 +08:00
NetworkAddress n ( ip , listenPort , true , sslEnabled & & listenPort = = port ) ;
2018-12-07 03:48:50 +08:00
futures . push_back ( FlowTransport : : transport ( ) . bind ( n , n ) ) ;
}
2017-05-26 04:48:44 +08:00
Future < Void > fd = fdbd ( connFile , localities , processClass , * dataFolder , * coordFolder , 500e6 , " " , " " ) ;
Future < Void > backup = runBackupAgents ? runBackup ( connFile ) : Future < Void > ( Never ( ) ) ;
2018-02-21 05:22:31 +08:00
Future < Void > dr = runBackupAgents ? runDr ( connFile ) : Future < Void > ( Never ( ) ) ;
2017-05-26 04:48:44 +08:00
2018-12-07 03:48:50 +08:00
futures . push_back ( fd ) ;
futures . push_back ( backup ) ;
futures . push_back ( dr ) ;
futures . push_back ( success ( onShutdown ) ) ;
wait ( waitForAny ( futures ) ) ;
2017-05-26 04:48:44 +08:00
} catch ( Error & e ) {
2017-05-27 08:43:28 +08:00
// If in simulation, if we make it here with an error other than io_timeout but enASIOTimedOut is set then somewhere an io_timeout was converted to a different error.
if ( g_network - > isSimulated ( ) & & e . code ( ) ! = error_code_io_timeout & & ( bool ) g_network - > global ( INetwork : : enASIOTimedOut ) )
2017-08-29 02:25:37 +08:00
TraceEvent ( SevError , " IOTimeoutErrorSuppressed " ) . detail ( " ErrorCode " , e . code ( ) ) . detail ( " RandomId " , randomId ) . backtrace ( ) ;
2017-05-27 08:43:28 +08:00
2017-05-26 04:48:44 +08:00
if ( onShutdown . isReady ( ) & & onShutdown . isError ( ) ) throw onShutdown . getError ( ) ;
if ( e . code ( ) ! = error_code_actor_cancelled )
printf ( " SimulatedFDBDTerminated: %s \n " , e . what ( ) ) ;
ASSERT ( destructed | | g_simulator . getCurrentProcess ( ) = = process ) ; // simulatedFDBD catch called on different process
2018-08-02 05:30:57 +08:00
TraceEvent ( e . code ( ) = = error_code_actor_cancelled | | e . code ( ) = = error_code_file_not_found | | destructed ? SevInfo : SevError , " SimulatedFDBDTerminated " ) . error ( e , true ) . detailext ( " ZoneId " , localities . zoneId ( ) ) ;
2017-05-26 04:48:44 +08:00
}
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDDone " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
. detail ( " Excluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , onShutdown . isReady ( ) ? onShutdown . get ( ) : ISimulator : : None ) ;
if ( ! onShutdown . isReady ( ) )
onShutdown = ISimulator : : InjectFaults ;
} catch ( Error & e ) {
2018-08-02 05:30:57 +08:00
TraceEvent ( destructed ? SevInfo : SevError , " SimulatedFDBDRebooterError " ) . error ( e , true ) . detailext ( " ZoneId " , localities . zoneId ( ) ) . detail ( " RandomId " , randomId ) ;
2017-05-26 04:48:44 +08:00
onShutdown = e ;
}
ASSERT ( destructed | | g_simulator . getCurrentProcess ( ) = = process ) ;
if ( ! process - > shutdownSignal . isSet ( ) & & ! destructed ) {
process - > rebooting = true ;
process - > shutdownSignal . send ( ISimulator : : None ) ;
}
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDWait " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
. detail ( " Excluded " , process - > excluded )
. detail ( " Rebooting " , process - > rebooting )
. detailext ( " ZoneId " , localities . zoneId ( ) ) ;
2018-08-11 04:57:10 +08:00
wait ( g_simulator . onProcess ( simProcess ) ) ;
2017-05-26 04:48:44 +08:00
2018-08-11 04:57:10 +08:00
wait ( delay ( 0.00001 + FLOW_KNOBS - > MAX_BUGGIFIED_DELAY ) ) ; // One last chance for the process to clean up?
2017-05-26 04:48:44 +08:00
g_simulator . destroyProcess ( process ) ; // Leak memory here; the process may be used in other parts of the simulation
auto shutdownResult = onShutdown . get ( ) ;
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDShutdown " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
. detail ( " Excluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
if ( shutdownResult < ISimulator : : RebootProcessAndDelete ) {
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDLowerReboot " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
. detail ( " Excluded " , process - > excluded )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
return onShutdown . get ( ) ;
}
if ( onShutdown . get ( ) = = ISimulator : : RebootProcessAndDelete ) {
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDRebootAndDelete " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
* coordFolder = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
* dataFolder = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
platform : : createDirectory ( * dataFolder ) ;
if ( ! useSeedFile ) {
writeFile ( joinPath ( * dataFolder , " fdb.cluster " ) , connStr . toString ( ) ) ;
connFile = Reference < ClusterConnectionFile > ( new ClusterConnectionFile ( joinPath ( * dataFolder , " fdb.cluster " ) ) ) ;
}
else {
connFile = Reference < ClusterConnectionFile > ( new ClusterConnectionFile ( joinPath ( * dataFolder , " fdb.cluster " ) , connStr . toString ( ) ) ) ;
}
}
else {
2018-08-02 05:30:57 +08:00
TraceEvent ( " SimulatedFDBDJustRepeat " ) . detail ( " Cycles " , cycles ) . detail ( " RandomId " , randomId )
2017-08-29 02:25:37 +08:00
. detail ( " Address " , process - > address )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detail ( " KillType " , shutdownResult ) ;
}
}
}
template < >
std : : string describe ( bool const & val ) {
return val ? " true " : " false " ;
}
template < >
std : : string describe ( int const & val ) {
return format ( " %d " , val ) ;
}
// Since a datacenter kill is considered to be the same as killing a machine, files cannot be swapped across datacenters
std : : map < Optional < Standalone < StringRef > > , std : : vector < std : : vector < std : : string > > > availableFolders ;
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per process
ACTOR Future < Void > simulatedMachine (
ClusterConnectionString connStr ,
std : : vector < uint32_t > ips ,
bool sslEnabled ,
2018-04-26 09:29:29 +08:00
Reference < TLSOptions > tlsOptions ,
2017-05-26 04:48:44 +08:00
LocalityData localities ,
ProcessClass processClass ,
std : : string baseFolder ,
bool restarting ,
bool useSeedFile ,
2019-02-01 10:20:14 +08:00
bool runBackupAgents ,
bool sslOnly )
2017-05-26 04:48:44 +08:00
{
state int bootCount = 0 ;
state std : : vector < std : : string > myFolders ;
state std : : vector < std : : string > coordFolders ;
2017-08-29 02:25:37 +08:00
state UID randomId = g_nondeterministic_random - > randomUniqueID ( ) ;
2019-02-01 10:20:14 +08:00
state int listenPerProcess = ( sslEnabled & & ! sslOnly ) ? 2 : 1 ;
2017-05-26 04:48:44 +08:00
try {
CSimpleIni ini ;
ini . SetUnicode ( ) ;
ini . LoadFile ( joinPath ( baseFolder , " restartInfo.ini " ) . c_str ( ) ) ;
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
if ( restarting ) {
2019-02-13 17:52:59 +08:00
myFolders . push_back ( ini . GetValue ( printable ( localities . machineId ( ) ) . c_str ( ) , format ( " %d " , i * listenPerProcess ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ) ;
2017-05-26 04:48:44 +08:00
if ( i = = 0 ) {
2019-01-19 07:42:48 +08:00
std : : string coordinationFolder = ini . GetValue ( printable ( localities . machineId ( ) ) . c_str ( ) , " coordinationFolder " , " " ) ;
2017-05-26 04:48:44 +08:00
if ( ! coordinationFolder . size ( ) )
2019-02-13 17:52:59 +08:00
coordinationFolder = ini . GetValue ( printable ( localities . machineId ( ) ) . c_str ( ) , format ( " c%d " , i * listenPerProcess ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ;
2017-05-26 04:48:44 +08:00
coordFolders . push_back ( coordinationFolder ) ;
} else {
2019-02-13 17:52:59 +08:00
coordFolders . push_back ( ini . GetValue ( printable ( localities . machineId ( ) ) . c_str ( ) , format ( " c%d " , i * listenPerProcess ) . c_str ( ) , joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) . c_str ( ) ) ) ;
2017-05-26 04:48:44 +08:00
}
}
else {
coordFolders . push_back ( joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ) ;
std : : string thisFolder = g_random - > randomUniqueID ( ) . toString ( ) ;
myFolders . push_back ( joinPath ( baseFolder , thisFolder ) ) ;
platform : : createDirectory ( myFolders [ i ] ) ;
if ( ! useSeedFile )
writeFile ( joinPath ( myFolders [ i ] , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
loop {
state std : : vector < Future < ISimulator : : KillType > > processes ;
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
std : : string path = joinPath ( myFolders [ i ] , " fdb.cluster " ) ;
Reference < ClusterConnectionFile > clusterFile ( useSeedFile ? new ClusterConnectionFile ( path , connStr . toString ( ) ) : new ClusterConnectionFile ( path ) ) ;
2019-01-09 23:41:02 +08:00
const int listenPort = i * listenPerProcess + 1 ;
processes . push_back ( simulatedFDBDRebooter ( clusterFile , ips [ i ] , sslEnabled , tlsOptions , listenPort , listenPerProcess , localities , processClass , & myFolders [ i ] , & coordFolders [ i ] , baseFolder , connStr , useSeedFile , runBackupAgents ) ) ;
TraceEvent ( " SimulatedMachineProcess " , randomId ) . detail ( " Address " , NetworkAddress ( ips [ i ] , listenPort , true , false ) ) . detailext ( " ZoneId " , localities . zoneId ( ) ) . detailext ( " DataHall " , localities . dataHallId ( ) ) . detail ( " Folder " , myFolders [ i ] ) ;
2017-05-26 04:48:44 +08:00
}
TEST ( bootCount > = 1 ) ; // Simulated machine rebooted
TEST ( bootCount > = 2 ) ; // Simulated machine rebooted twice
TEST ( bootCount > = 3 ) ; // Simulated machine rebooted three times
+ + bootCount ;
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedMachineStart " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detail ( " SSL " , sslEnabled )
2018-06-09 02:11:08 +08:00
. detail ( " Processes " , processes . size ( ) )
. detail ( " BootCount " , bootCount )
2017-05-26 04:48:44 +08:00
. detail ( " ProcessClass " , processClass . toString ( ) )
. detail ( " Restarting " , restarting )
. detail ( " UseSeedFile " , useSeedFile )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
. detail ( " Locality " , localities . toString ( ) ) ;
2018-08-11 04:57:10 +08:00
wait ( waitForAll ( processes ) ) ;
2017-05-26 04:48:44 +08:00
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedMachineRebootStart " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
2019-02-18 10:46:59 +08:00
{
//Kill all open files, which may cause them to write invalid data.
auto & machineCache = g_simulator . getMachineById ( localities . machineId ( ) ) - > openFiles ;
//Copy the file pointers to a vector because the map may be modified while we are killing files
std : : vector < AsyncFileNonDurable * > files ;
for ( auto fileItr = machineCache . begin ( ) ; fileItr ! = machineCache . end ( ) ; + + fileItr ) {
ASSERT ( fileItr - > second . isReady ( ) ) ;
files . push_back ( ( AsyncFileNonDurable * ) fileItr - > second . get ( ) . getPtr ( ) ) ;
}
2017-05-26 04:48:44 +08:00
2019-02-18 10:46:59 +08:00
std : : vector < Future < Void > > killFutures ;
for ( auto fileItr = files . begin ( ) ; fileItr ! = files . end ( ) ; + + fileItr )
killFutures . push_back ( ( * fileItr ) - > kill ( ) ) ;
2017-05-26 04:48:44 +08:00
2019-02-18 10:46:59 +08:00
wait ( waitForAll ( killFutures ) ) ;
}
2017-05-26 04:48:44 +08:00
state std : : set < std : : string > filenames ;
state std : : string closingStr ;
2019-01-19 07:42:48 +08:00
auto & machineCache = g_simulator . getMachineById ( localities . machineId ( ) ) - > openFiles ;
2017-05-26 04:48:44 +08:00
for ( auto it : machineCache ) {
filenames . insert ( it . first ) ;
closingStr + = it . first + " , " ;
ASSERT ( it . second . isReady ( ) & & ! it . second . isError ( ) ) ;
}
2019-01-19 07:42:48 +08:00
for ( auto it : g_simulator . getMachineById ( localities . machineId ( ) ) - > deletingFiles ) {
2017-08-26 01:12:58 +08:00
filenames . insert ( it ) ;
closingStr + = it + " , " ;
}
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedMachineRebootAfterKills " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " Folder0 " , myFolders [ 0 ] )
. detail ( " CFolder0 " , coordFolders [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) )
. detail ( " Closing " , closingStr )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
2019-01-19 07:42:48 +08:00
ISimulator : : MachineInfo * machine = g_simulator . getMachineById ( localities . machineId ( ) ) ;
2017-05-26 04:48:44 +08:00
machine - > closingFiles = filenames ;
2019-01-19 07:42:48 +08:00
g_simulator . getMachineById ( localities . machineId ( ) ) - > openFiles . clear ( ) ;
2017-05-26 04:48:44 +08:00
// During a reboot:
// The process is expected to close all files and be inactive in zero time, but not necessarily
// without delay(0)-equivalents, so delay(0) a few times waiting for it to achieve that goal.
// After an injected fault:
// The process is expected to shut down eventually, but not necessarily instantly. Wait up to 60 seconds.
state int shutdownDelayCount = 0 ;
state double backoff = 0 ;
loop {
2019-01-19 07:42:48 +08:00
auto & machineCache = g_simulator . getMachineById ( localities . machineId ( ) ) - > closingFiles ;
2017-05-26 04:48:44 +08:00
if ( ! machineCache . empty ( ) ) {
std : : string openFiles ;
int i = 0 ;
for ( auto it = machineCache . begin ( ) ; it ! = machineCache . end ( ) & & i < 5 ; + + it ) {
openFiles + = * it + " , " ;
i + + ;
}
2017-08-29 02:25:37 +08:00
TraceEvent ( " MachineFilesOpen " , randomId ) . detail ( " PAddr " , toIPVectorString ( ips ) ) . detail ( " OpenFiles " , openFiles ) ;
2017-05-26 04:48:44 +08:00
} else
break ;
if ( shutdownDelayCount + + > = 50 ) { // Worker doesn't shut down instantly on reboot
2017-08-29 02:25:37 +08:00
TraceEvent ( SevError , " SimulatedFDBDFilesCheck " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " PAddrs " , toIPVectorString ( ips ) )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
ASSERT ( false ) ;
}
2018-08-11 04:57:10 +08:00
wait ( delay ( backoff ) ) ;
2017-05-26 04:48:44 +08:00
backoff = std : : min ( backoff + 1.0 , 6.0 ) ;
}
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedFDBDFilesClosed " , randomId )
. detail ( " Address " , toIPVectorString ( ips ) )
2017-05-26 04:48:44 +08:00
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) ) ;
2019-01-19 07:42:48 +08:00
g_simulator . destroyMachine ( localities . machineId ( ) ) ;
2017-05-26 04:48:44 +08:00
// SOMEDAY: when processes can be rebooted, this check will be needed
//ASSERT( this machine is rebooting );
// Since processes can end with different codes, take the highest (least severe) to detmine what to do
state ISimulator : : KillType killType = processes [ 0 ] . get ( ) ;
for ( int i = 1 ; i < ips . size ( ) ; i + + )
killType = std : : max ( processes [ i ] . get ( ) , killType ) ;
TEST ( true ) ; // Simulated machine has been rebooted
state bool swap = killType = = ISimulator : : Reboot & & BUGGIFY_WITH_PROB ( 0.75 ) & & g_simulator . canSwapToMachine ( localities . zoneId ( ) ) ;
if ( swap )
availableFolders [ localities . dcId ( ) ] . push_back ( myFolders ) ;
auto rebootTime = g_random - > random01 ( ) * MACHINE_REBOOT_TIME ;
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedMachineShutdown " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " Swap " , swap )
. detail ( " KillType " , killType )
. detail ( " RebootTime " , rebootTime )
. detailext ( " ZoneId " , localities . zoneId ( ) )
. detailext ( " DataHall " , localities . dataHallId ( ) )
. detail ( " MachineIPs " , toIPVectorString ( ips ) ) ;
2018-08-11 04:57:10 +08:00
wait ( delay ( rebootTime ) ) ;
2017-05-26 04:48:44 +08:00
if ( swap ) {
auto & avail = availableFolders [ localities . dcId ( ) ] ;
int i = g_random - > randomInt ( 0 , avail . size ( ) ) ;
if ( i ! = avail . size ( ) - 1 )
std : : swap ( avail [ i ] , avail . back ( ) ) ;
auto toRebootFrom = avail . back ( ) ;
avail . pop_back ( ) ;
if ( myFolders ! = toRebootFrom ) {
TEST ( true ) ; // Simulated machine swapped data folders
2017-08-29 02:25:37 +08:00
TraceEvent ( " SimulatedMachineFolderSwap " , randomId )
2017-05-26 04:48:44 +08:00
. detail ( " OldFolder0 " , myFolders [ 0 ] ) . detail ( " NewFolder0 " , toRebootFrom [ 0 ] )
. detail ( " MachineIPs " , toIPVectorString ( ips ) ) ;
}
myFolders = toRebootFrom ;
if ( ! useSeedFile ) {
for ( auto f : toRebootFrom ) {
if ( ! fileExists ( joinPath ( f , " fdb.cluster " ) ) ) {
writeFile ( joinPath ( f , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
}
} else if ( killType = = ISimulator : : RebootAndDelete ) {
for ( int i = 0 ; i < ips . size ( ) ; i + + ) {
coordFolders [ i ] = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
myFolders [ i ] = joinPath ( baseFolder , g_random - > randomUniqueID ( ) . toString ( ) ) ;
platform : : createDirectory ( myFolders [ i ] ) ;
if ( ! useSeedFile ) {
writeFile ( joinPath ( myFolders [ i ] , " fdb.cluster " ) , connStr . toString ( ) ) ;
}
}
TEST ( true ) ; // Simulated machine rebooted with data loss
}
//this machine is rebooting = false;
}
} catch ( Error & e ) {
2019-01-19 07:42:48 +08:00
g_simulator . getMachineById ( localities . machineId ( ) ) - > openFiles . clear ( ) ;
2017-05-26 04:48:44 +08:00
throw ;
}
}
# include "fdbclient/MonitorLeader.h"
2018-04-26 09:29:29 +08:00
// Rebuilds a previously saved simulated cluster from restartInfo.ini found in
// baseFolder. The [META] section supplies cluster-wide settings (machine and
// process counts, coordinator count, connection string); one section per
// machine supplies that machine's locality, process class, process count and
// IP addresses. For each machine a simulatedMachine actor is re-spawned.
// Outputs: *pTesterCount and *pConnString are always set; *pStartingConfiguration
// is overwritten with a two-region configuration when exactly two distinct,
// non-empty dcIds were in use (and BUGGIFY does not fire). Also sets
// g_simulator.desiredCoordinators/processesPerMachine, and g_simulator.extraDB
// when extraDB==3 (the DR database is this same cluster).
ACTOR Future < Void > restartSimulatedSystem (
vector < Future < Void > > * systemActors , std : : string baseFolder , int * pTesterCount ,
2018-11-03 05:16:47 +08:00
Optional < ClusterConnectionString > * pConnString , Standalone < StringRef > * pStartingConfiguration , Reference < TLSOptions > tlsOptions , int extraDB ) {
2017-05-26 04:48:44 +08:00
CSimpleIni ini ;
ini . SetUnicode ( ) ;
ini . LoadFile ( joinPath ( baseFolder , " restartInfo.ini " ) . c_str ( ) ) ;
// allows multiple ipAddr entries
ini . SetMultiKey ( ) ;
try {
// Cluster-wide settings written at save time.
int machineCount = atoi ( ini . GetValue ( " META " , " machineCount " ) ) ;
int processesPerMachine = atoi ( ini . GetValue ( " META " , " processesPerMachine " ) ) ;
2019-02-01 10:20:14 +08:00
// Older restart files omit listenersPerProcess; default to one listener.
int listenersPerProcess = 1 ;
auto listenersPerProcessStr = ini . GetValue ( " META " , " listenersPerProcess " ) ;
if ( listenersPerProcessStr ! = NULL ) {
listenersPerProcess = atoi ( listenersPerProcessStr ) ;
}
2017-05-26 04:48:44 +08:00
int desiredCoordinators = atoi ( ini . GetValue ( " META " , " desiredCoordinators " ) ) ;
int testerCount = atoi ( ini . GetValue ( " META " , " testerCount " ) ) ;
2018-04-28 04:54:34 +08:00
// extraDB==3 means the DR ("extra") database is this very cluster, so it
// reuses the saved connection string.
bool enableExtraDB = ( extraDB = = 3 ) ;
2017-05-26 04:48:44 +08:00
ClusterConnectionString conn ( ini . GetValue ( " META " , " connectionString " ) ) ;
2018-04-28 04:54:34 +08:00
if ( enableExtraDB ) {
g_simulator . extraDB = new ClusterConnectionString ( ini . GetValue ( " META " , " connectionString " ) ) ;
}
2017-05-26 04:48:44 +08:00
* pConnString = conn ;
* pTesterCount = testerCount ;
2019-02-01 10:20:14 +08:00
bool usingSSL = conn . toString ( ) . find ( " :tls " ) ! = std : : string : : npos | | listenersPerProcess > 1 ;
2017-05-26 04:48:44 +08:00
int useSeedForMachine = g_random - > randomInt ( 0 , machineCount ) ;
2018-11-03 05:16:47 +08:00
// dcIds of non-tester machines; used below to detect a two-DC cluster.
std : : vector < std : : string > dcIds ;
2017-05-26 04:48:44 +08:00
for ( int i = 0 ; i < machineCount ; i + + ) {
Optional < Standalone < StringRef > > dcUID ;
2019-01-19 07:42:48 +08:00
Optional < Standalone < StringRef > > zoneId ;
std : : string machineIdString = ini . GetValue ( " META " , format ( " %d " , i ) . c_str ( ) ) ;
Standalone < StringRef > machineId = StringRef ( machineIdString ) ;
2019-02-01 10:20:14 +08:00
2019-01-19 07:42:48 +08:00
std : : string dcUIDini = ini . GetValue ( machineIdString . c_str ( ) , " dcUID " ) ;
2018-11-03 05:16:47 +08:00
if ( ! dcUIDini . empty ( ) ) {
dcUID = StringRef ( dcUIDini ) ;
}
2019-01-19 07:42:48 +08:00
// Restart files from before zoneId existed fall back to the machine id.
auto zoneIDini = ini . GetValue ( machineIdString . c_str ( ) , " zoneId " ) ;
if ( zoneIDini = = NULL ) {
zoneId = machineId ;
} else {
zoneId = StringRef ( zoneIDini ) ;
}
ProcessClass processClass = ProcessClass ( ( ProcessClass : : ClassType ) atoi ( ini . GetValue ( machineIdString . c_str ( ) , " mClass " ) ) , ProcessClass : : CommandLineSource ) ;
2018-11-03 05:16:47 +08:00
// Tester machines do not count toward the datacenter census.
if ( processClass ! = ProcessClass : : TesterClass ) {
dcIds . push_back ( dcUIDini ) ;
}
2017-05-26 04:48:44 +08:00
std : : vector < uint32_t > ipAddrs ;
2019-01-19 07:42:48 +08:00
int processes = atoi ( ini . GetValue ( machineIdString . c_str ( ) , " processes " ) ) ;
2017-05-26 04:48:44 +08:00
2019-01-19 07:42:48 +08:00
auto ip = ini . GetValue ( machineIdString . c_str ( ) , " ipAddr " ) ;
2017-05-26 04:48:44 +08:00
// New-style files carry one ipAddr%d key per process (spaced by
// listenersPerProcess); old-style files give a single base address and
// the remaining processes use consecutive addresses.
if ( ip = = NULL ) {
for ( int i = 0 ; i < processes ; i + + ) {
2019-02-13 17:52:59 +08:00
ipAddrs . push_back ( strtoul ( ini . GetValue ( machineIdString . c_str ( ) , format ( " ipAddr%d " , i * listenersPerProcess ) . c_str ( ) ) , NULL , 10 ) ) ;
2017-05-26 04:48:44 +08:00
}
}
else {
// old way
ipAddrs . push_back ( strtoul ( ip , NULL , 10 ) ) ;
for ( int i = 1 ; i < processes ; i + + ) {
ipAddrs . push_back ( ipAddrs . back ( ) + 1 ) ;
}
}
2019-01-19 07:42:48 +08:00
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , zoneId , machineId , dcUID ) ;
2017-05-26 04:48:44 +08:00
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
2018-04-28 04:54:34 +08:00
// SOMEDAY: parse backup agent from test file
2017-05-26 04:48:44 +08:00
systemActors - > push_back ( reportErrors ( simulatedMachine (
2019-02-05 03:39:06 +08:00
conn , ipAddrs , usingSSL , tlsOptions , localities , processClass , baseFolder , true , i = = useSeedForMachine , enableExtraDB , usingSSL & & ( listenersPerProcess = = 1 | | processClass = = ProcessClass : : TesterClass ) ) ,
2017-05-26 04:48:44 +08:00
processClass = = ProcessClass : : TesterClass ? " SimulatedTesterMachine " : " SimulatedMachine " ) ) ;
}
g_simulator . desiredCoordinators = desiredCoordinators ;
g_simulator . processesPerMachine = processesPerMachine ;
2018-11-03 05:16:47 +08:00
// If exactly two distinct, non-empty datacenters were in use, restart into
// a two-region configuration with the first DC as primary (priority 2).
uniquify ( dcIds ) ;
if ( ! BUGGIFY & & dcIds . size ( ) = = 2 & & dcIds [ 0 ] ! = " " & & dcIds [ 1 ] ! = " " ) {
StatusObject primaryObj ;
StatusObject primaryDcObj ;
primaryDcObj [ " id " ] = dcIds [ 0 ] ;
primaryDcObj [ " priority " ] = 2 ;
StatusArray primaryDcArr ;
primaryDcArr . push_back ( primaryDcObj ) ;
StatusObject remoteObj ;
StatusObject remoteDcObj ;
remoteDcObj [ " id " ] = dcIds [ 1 ] ;
remoteDcObj [ " priority " ] = 1 ;
StatusArray remoteDcArr ;
remoteDcArr . push_back ( remoteDcObj ) ;
primaryObj [ " datacenters " ] = primaryDcArr ;
remoteObj [ " datacenters " ] = remoteDcArr ;
StatusArray regionArr ;
regionArr . push_back ( primaryObj ) ;
regionArr . push_back ( remoteObj ) ;
* pStartingConfiguration = " single usable_regions=2 regions= " + json_spirit : : write_string ( json_spirit : : mValue ( regionArr ) , json_spirit : : Output_options : : none ) ;
}
2019-02-05 03:39:06 +08:00
TraceEvent ( " RestartSimulatorSettings " )
. detail ( " DesiredCoordinators " , g_simulator . desiredCoordinators )
. detail ( " ProcessesPerMachine " , g_simulator . processesPerMachine )
. detail ( " ListenersPerProcess " , listenersPerProcess ) ;
2017-05-26 04:48:44 +08:00
}
catch ( Error & e ) {
2018-06-09 02:11:08 +08:00
// NOTE(review): a parse failure is traced at SevError but NOT rethrown --
// the actor still waits and returns Void. Confirm that swallowing here is
// intentional (the simulation would proceed with whatever was set so far).
TraceEvent ( SevError , " RestartSimulationError " ) . error ( e ) ;
2017-05-26 04:48:44 +08:00
}
2018-08-11 04:57:10 +08:00
// Short delay before reporting the system restarted (presumably to let the
// spawned machine actors begin running -- confirm).
wait ( delay ( 1.0 ) ) ;
2017-05-26 04:48:44 +08:00
return Void ( ) ;
}
2017-08-12 06:06:36 +08:00
// Holder for one randomly generated simulation setup: the starting database
// configuration plus the physical layout of the simulated cluster. All fields
// are filled in by the constructor via generateNormalConfig().
struct SimulationConfig {
2018-09-18 09:32:39 +08:00
explicit SimulationConfig ( int extraDB , int minimumReplication , int minimumRegions ) ;
2017-08-22 13:29:56 +08:00
// DR/extra-database mode requested by the test (0 = no DR database; see the
// extraDB handling in setupSimulatedSystem for the meaning of 1/2/3).
int extraDB ;
2017-08-12 06:06:36 +08:00
2017-08-22 13:29:56 +08:00
// The database configuration the simulated cluster starts with
// (replication mode, storage engine, regions, process counts...).
DatabaseConfiguration db ;
2017-08-12 06:06:36 +08:00
2017-08-22 13:29:56 +08:00
// Applies a configuration string (e.g. "single", "triple") to db by
// expanding it through buildConfiguration().
void set_config ( std : : string config ) ;
2017-08-12 06:06:36 +08:00
2017-08-22 13:29:56 +08:00
// Simulation layout
int datacenters ;
int machine_count ; // Total, not per DC.
int processes_per_machine ;
int coordinators ;
2017-08-12 06:06:36 +08:00
private :
2018-09-18 09:32:39 +08:00
// Randomly chooses db and the layout fields, respecting the given minimums.
void generateNormalConfig ( int minimumReplication , int minimumRegions ) ;
2017-08-12 06:06:36 +08:00
} ;
2018-09-18 09:32:39 +08:00
// Records the requested extra-DB mode and randomly generates the remaining
// database configuration and cluster layout.
SimulationConfig::SimulationConfig(int extraDB, int minimumReplication, int minimumRegions) : extraDB(extraDB) {
	generateNormalConfig(minimumReplication, minimumRegions);
}
2017-08-12 06:06:36 +08:00
2017-08-22 13:29:56 +08:00
void SimulationConfig : : set_config ( std : : string config ) {
// The only mechanism we have for turning "single" into what single means
// is buildConfiguration()... :/
std : : map < std : : string , std : : string > hack_map ;
ASSERT ( buildConfiguration ( config , hack_map ) ) ;
for ( auto kv : hack_map ) db . set ( kv . first , kv . second ) ;
}
2017-08-12 06:06:36 +08:00
2017-08-22 13:29:56 +08:00
// Wraps a NUL-terminated C string in a (non-owning) StringRef.
StringRef StringRefOf(const char* s) {
	return StringRef(reinterpret_cast<const uint8_t*>(s), strlen(s));
}
2017-05-26 04:48:44 +08:00
2018-09-18 09:32:39 +08:00
// Randomly generates the starting database configuration (replication mode,
// storage engine, log spill/version, region topology) and the simulated
// cluster layout (datacenters, machine_count, processes_per_machine,
// coordinators). minimumReplication and minimumRegions put a floor under the
// random choices so that tests requiring a given redundancy or region count
// remain satisfiable. The sequence of g_random calls here determines the
// deterministic simulation's random stream, so statement order matters.
void SimulationConfig : : generateNormalConfig ( int minimumReplication , int minimumRegions ) {
2017-08-22 13:29:56 +08:00
set_config ( " new " ) ;
2018-10-25 16:25:41 +08:00
const bool simple = false ; // Set true to simplify simulation configs for easier debugging
// "Fearless" = multi-region (primary + satellites + optional remote) setups.
bool generateFearless = simple ? false : ( minimumRegions > 1 | | g_random - > random01 ( ) < 0.5 ) ;
datacenters = simple ? 1 : ( generateFearless ? ( minimumReplication > 0 | | g_random - > random01 ( ) < 0.5 ? 4 : 6 ) : g_random - > randomInt ( 1 , 4 ) ) ;
2017-08-22 13:29:56 +08:00
if ( g_random - > random01 ( ) < 0.25 ) db . desiredTLogCount = g_random - > randomInt ( 1 , 7 ) ;
if ( g_random - > random01 ( ) < 0.25 ) db . masterProxyCount = g_random - > randomInt ( 1 , 7 ) ;
if ( g_random - > random01 ( ) < 0.25 ) db . resolverCount = g_random - > randomInt ( 1 , 7 ) ;
// Storage engine: ssd or memory, 50/50.
if ( g_random - > random01 ( ) < 0.5 ) {
set_config ( " ssd " ) ;
} else {
set_config ( " memory " ) ;
}
2018-10-25 16:25:41 +08:00
if ( simple ) {
db . desiredTLogCount = 1 ;
db . masterProxyCount = 1 ;
db . resolverCount = 1 ;
}
2017-08-22 13:29:56 +08:00
2018-10-25 16:25:41 +08:00
// Replication mode: 0=custom, 1=single, 2=double, 3=triple/three_data_hall.
int replication_type = simple ? 1 : ( std : : max ( minimumReplication , datacenters > 4 ? g_random - > randomInt ( 1 , 3 ) : std : : min ( g_random - > randomInt ( 0 , 6 ) , 3 ) ) ) ;
2017-08-22 13:29:56 +08:00
switch ( replication_type ) {
case 0 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster using custom redundancy mode
2018-02-27 05:15:44 +08:00
int storage_servers = g_random - > randomInt ( 1 , generateFearless ? 4 : 5 ) ;
2018-06-11 11:22:58 +08:00
//FIXME: log replicas must be more than storage replicas because otherwise better master exists will not recognize it needs to change dcs
int replication_factor = g_random - > randomInt ( storage_servers , generateFearless ? 4 : 5 ) ;
2018-02-11 05:27:51 +08:00
int anti_quorum = g_random - > randomInt ( 0 , replication_factor ) ;
2017-08-22 13:29:56 +08:00
// Go through buildConfiguration, as it sets tLogPolicy/storagePolicy.
2018-06-19 01:24:57 +08:00
set_config ( format ( " storage_replicas:=%d log_replicas:=%d log_anti_quorum:=%d "
2017-08-22 13:29:56 +08:00
" replica_datacenters:=1 min_replica_datacenters:=1 " ,
2018-06-19 01:24:57 +08:00
storage_servers , replication_factor , anti_quorum ) ) ;
2017-08-22 13:29:56 +08:00
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 1 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in single redundancy mode
2017-08-22 13:29:56 +08:00
set_config ( " single " ) ;
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 2 : {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in double redundancy mode
2017-08-22 13:29:56 +08:00
set_config ( " double " ) ;
break ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
case 3 : {
2018-04-11 08:02:43 +08:00
if ( datacenters < = 2 | | generateFearless ) {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in triple redundancy mode
2017-08-22 13:29:56 +08:00
set_config ( " triple " ) ;
2017-05-26 04:48:44 +08:00
}
2017-08-12 06:06:36 +08:00
else if ( datacenters = = 3 ) {
2017-05-26 04:48:44 +08:00
TEST ( true ) ; // Simulated cluster running in 3 data-hall mode
2017-08-22 13:29:56 +08:00
set_config ( " three_data_hall " ) ;
2017-05-26 04:48:44 +08:00
}
else {
ASSERT ( false ) ;
}
2017-08-22 13:29:56 +08:00
break ;
}
default :
ASSERT ( false ) ; // Programmer forgot to adjust cases.
2017-05-26 04:48:44 +08:00
}
2017-08-22 13:29:56 +08:00
2019-02-27 09:14:41 +08:00
// TLog spill strategy / log version: 0 = defaults; 1 = VALUE spill with a
// random log version; 2 = REFERENCE spill (requires log_version 3).
int logSpillType = g_random - > randomInt ( 0 , 3 ) ;
switch ( logSpillType ) {
case 0 :
// Let both be the default.
break ;
case 1 : {
set_config ( " log_spill:=1 " ) ; // VALUE
int logVersion = g_random - > randomInt ( 0 , 3 ) ;
switch ( logVersion ) {
case 0 :
break ;
case 1 :
2019-02-23 04:15:23 +08:00
set_config ( " log_version:=2 " ) ; // 6.0
2019-02-27 09:14:41 +08:00
break ;
case 2 :
2019-02-23 04:15:23 +08:00
set_config ( " log_version:=3 " ) ; // 6.1
2019-02-27 09:14:41 +08:00
break ;
2019-02-23 04:15:23 +08:00
}
2019-02-27 09:14:41 +08:00
break ;
}
case 2 :
2019-02-23 04:15:23 +08:00
set_config ( " log_version:=3 " ) ; // 6.1
set_config ( " log_spill:=2 " ) ; // REFERENCE
2019-02-27 09:14:41 +08:00
break ;
2019-02-20 14:02:44 +08:00
}
2018-02-19 04:59:43 +08:00
// Region topology: build the "regions=" JSON for fearless setups (or half
// of plain two-DC setups). dcIds: "0"/"1" primary/remote, "2"-"5" satellites.
if ( generateFearless | | ( datacenters = = 2 & & g_random - > random01 ( ) < 0.5 ) ) {
2018-09-18 09:32:39 +08:00
//The kill region workload relies on the fact that all "0", "2", and "4" are all of the possible primary dcids.
2018-03-06 11:27:46 +08:00
StatusObject primaryObj ;
2018-06-13 07:18:54 +08:00
StatusObject primaryDcObj ;
primaryDcObj [ " id " ] = " 0 " ;
2018-06-14 09:35:28 +08:00
primaryDcObj [ " priority " ] = 2 ;
2018-06-13 07:18:54 +08:00
StatusArray primaryDcArr ;
primaryDcArr . push_back ( primaryDcObj ) ;
2018-03-06 11:27:46 +08:00
StatusObject remoteObj ;
2018-06-13 07:18:54 +08:00
StatusObject remoteDcObj ;
remoteDcObj [ " id " ] = " 1 " ;
remoteDcObj [ " priority " ] = 1 ;
StatusArray remoteDcArr ;
remoteDcArr . push_back ( remoteDcObj ) ;
2018-03-16 06:40:58 +08:00
bool needsRemote = generateFearless ;
2018-03-06 11:27:46 +08:00
if ( generateFearless ) {
StatusObject primarySatelliteObj ;
primarySatelliteObj [ " id " ] = " 2 " ;
primarySatelliteObj [ " priority " ] = 1 ;
2018-06-14 08:55:55 +08:00
primarySatelliteObj [ " satellite " ] = 1 ;
primaryDcArr . push_back ( primarySatelliteObj ) ;
2018-03-06 11:27:46 +08:00
StatusObject remoteSatelliteObj ;
remoteSatelliteObj [ " id " ] = " 3 " ;
remoteSatelliteObj [ " priority " ] = 1 ;
2018-06-14 08:55:55 +08:00
remoteSatelliteObj [ " satellite " ] = 1 ;
remoteDcArr . push_back ( remoteSatelliteObj ) ;
2018-03-06 11:27:46 +08:00
2018-07-10 13:01:46 +08:00
// With six datacenters each region gets a second satellite ("4"/"5").
if ( datacenters > 4 ) {
StatusObject primarySatelliteObjB ;
primarySatelliteObjB [ " id " ] = " 4 " ;
primarySatelliteObjB [ " priority " ] = 1 ;
primarySatelliteObjB [ " satellite " ] = 1 ;
primaryDcArr . push_back ( primarySatelliteObjB ) ;
StatusObject remoteSatelliteObjB ;
remoteSatelliteObjB [ " id " ] = " 5 " ;
remoteSatelliteObjB [ " priority " ] = 1 ;
remoteSatelliteObjB [ " satellite " ] = 1 ;
remoteDcArr . push_back ( remoteSatelliteObjB ) ;
2018-03-06 11:27:46 +08:00
}
2018-07-10 13:01:46 +08:00
if ( datacenters > 4 ) {
//FIXME: we cannot use one satellite replication with more than one satellite per region because canKillProcesses does not respect usable_dcs
int satellite_replication_type = g_random - > randomInt ( 0 , 3 ) ;
switch ( satellite_replication_type ) {
case 0 : {
TEST ( true ) ; // Simulated cluster using no satellite redundancy mode
break ;
}
case 1 : {
TEST ( true ) ; // Simulated cluster using two satellite fast redundancy mode
primaryObj [ " satellite_redundancy_mode " ] = " two_satellite_fast " ;
remoteObj [ " satellite_redundancy_mode " ] = " two_satellite_fast " ;
break ;
}
case 2 : {
TEST ( true ) ; // Simulated cluster using two satellite safe redundancy mode
primaryObj [ " satellite_redundancy_mode " ] = " two_satellite_safe " ;
remoteObj [ " satellite_redundancy_mode " ] = " two_satellite_safe " ;
break ;
}
default :
ASSERT ( false ) ; // Programmer forgot to adjust cases.
}
} else {
int satellite_replication_type = g_random - > randomInt ( 0 , 5 ) ;
switch ( satellite_replication_type ) {
case 0 : {
//FIXME: implement
TEST ( true ) ; // Simulated cluster using custom satellite redundancy mode
break ;
}
case 1 : {
TEST ( true ) ; // Simulated cluster using no satellite redundancy mode
break ;
}
case 2 : {
TEST ( true ) ; // Simulated cluster using single satellite redundancy mode
primaryObj [ " satellite_redundancy_mode " ] = " one_satellite_single " ;
remoteObj [ " satellite_redundancy_mode " ] = " one_satellite_single " ;
break ;
}
case 3 : {
TEST ( true ) ; // Simulated cluster using double satellite redundancy mode
primaryObj [ " satellite_redundancy_mode " ] = " one_satellite_double " ;
remoteObj [ " satellite_redundancy_mode " ] = " one_satellite_double " ;
break ;
}
case 4 : {
TEST ( true ) ; // Simulated cluster using triple satellite redundancy mode
primaryObj [ " satellite_redundancy_mode " ] = " one_satellite_triple " ;
remoteObj [ " satellite_redundancy_mode " ] = " one_satellite_triple " ;
break ;
}
default :
ASSERT ( false ) ; // Programmer forgot to adjust cases.
}
2018-03-06 11:27:46 +08:00
}
2018-02-19 04:59:43 +08:00
2018-03-06 11:27:46 +08:00
if ( g_random - > random01 ( ) < 0.25 ) {
int logs = g_random - > randomInt ( 1 , 7 ) ;
primaryObj [ " satellite_logs " ] = logs ;
remoteObj [ " satellite_logs " ] = logs ;
}
2018-06-14 08:55:55 +08:00
2018-07-10 07:55:33 +08:00
//We cannot run with a remote DC when MAX_READ_TRANSACTION_LIFE_VERSIONS is too small, because the log routers will not be able to keep up.
2018-09-18 09:32:39 +08:00
if ( minimumRegions < = 1 & & ( g_random - > random01 ( ) < 0.25 | | SERVER_KNOBS - > MAX_READ_TRANSACTION_LIFE_VERSIONS < SERVER_KNOBS - > VERSIONS_PER_SECOND ) ) {
2018-06-18 10:31:15 +08:00
TEST ( true ) ; // Simulated cluster using one region
needsRemote = false ;
} else {
TEST ( true ) ; // Simulated cluster using two regions
db . usableRegions = 2 ;
}
2018-07-10 13:01:46 +08:00
int remote_replication_type = g_random - > randomInt ( 0 , datacenters > 4 ? 4 : 5 ) ;
2018-03-06 11:27:46 +08:00
switch ( remote_replication_type ) {
case 0 : {
2018-03-16 01:59:30 +08:00
//FIXME: implement
2018-03-06 11:27:46 +08:00
TEST ( true ) ; // Simulated cluster using custom remote redundancy mode
break ;
}
case 1 : {
2018-06-18 10:31:15 +08:00
TEST ( true ) ; // Simulated cluster using default remote redundancy mode
2018-03-06 11:27:46 +08:00
break ;
}
case 2 : {
TEST ( true ) ; // Simulated cluster using single remote redundancy mode
set_config ( " remote_single " ) ;
break ;
}
case 3 : {
TEST ( true ) ; // Simulated cluster using double remote redundancy mode
set_config ( " remote_double " ) ;
break ;
}
case 4 : {
TEST ( true ) ; // Simulated cluster using triple remote redundancy mode
set_config ( " remote_triple " ) ;
break ;
}
default :
ASSERT ( false ) ; // Programmer forgot to adjust cases.
}
2018-06-19 01:22:34 +08:00
2018-06-22 15:04:00 +08:00
if ( g_random - > random01 ( ) < 0.25 ) db . desiredLogRouterCount = g_random - > randomInt ( 1 , 7 ) ;
2018-06-19 01:22:34 +08:00
if ( g_random - > random01 ( ) < 0.25 ) db . remoteDesiredTLogCount = g_random - > randomInt ( 1 , 7 ) ;
2018-03-06 11:27:46 +08:00
}
2018-06-14 08:55:55 +08:00
primaryObj [ " datacenters " ] = primaryDcArr ;
remoteObj [ " datacenters " ] = remoteDcArr ;
2018-03-06 11:27:46 +08:00
StatusArray regionArr ;
regionArr . push_back ( primaryObj ) ;
2018-03-16 06:40:58 +08:00
if ( needsRemote | | g_random - > random01 ( ) < 0.5 ) {
regionArr . push_back ( remoteObj ) ;
}
2018-03-06 11:27:46 +08:00
set_config ( " regions= " + json_spirit : : write_string ( json_spirit : : mValue ( regionArr ) , json_spirit : : Output_options : : none ) ) ;
2018-07-17 01:06:57 +08:00
// Save region strings that workloads use to toggle a region off by setting
// its first datacenter's priority to -1.
if ( needsRemote ) {
2018-09-18 09:32:39 +08:00
g_simulator . originalRegions = " regions= " + json_spirit : : write_string ( json_spirit : : mValue ( regionArr ) , json_spirit : : Output_options : : none ) ;
2018-07-17 01:06:57 +08:00
StatusArray disablePrimary = regionArr ;
disablePrimary [ 0 ] . get_obj ( ) [ " datacenters " ] . get_array ( ) [ 0 ] . get_obj ( ) [ " priority " ] = - 1 ;
g_simulator . disablePrimary = " regions= " + json_spirit : : write_string ( json_spirit : : mValue ( disablePrimary ) , json_spirit : : Output_options : : none ) ;
StatusArray disableRemote = regionArr ;
disableRemote [ 1 ] . get_obj ( ) [ " datacenters " ] . get_array ( ) [ 0 ] . get_obj ( ) [ " priority " ] = - 1 ;
g_simulator . disableRemote = " regions= " + json_spirit : : write_string ( json_spirit : : mValue ( disableRemote ) , json_spirit : : Output_options : : none ) ;
}
2018-02-19 04:59:43 +08:00
}
2018-06-14 08:55:55 +08:00
// Cluster sizing: machine_count, then coordinators, then processes/machine.
if ( generateFearless & & minimumReplication > 1 ) {
2018-05-08 02:28:25 +08:00
//low latency tests in fearless configurations need 4 machines per datacenter (3 for triple replication, 1 that is down during failures).
machine_count = 16 ;
} else if ( generateFearless ) {
2018-02-19 04:59:43 +08:00
machine_count = 12 ;
} else if ( db . tLogPolicy & & db . tLogPolicy - > info ( ) = = " data_hall^2 x zoneid^2 x 1 " ) {
2018-02-16 10:32:39 +08:00
machine_count = 9 ;
2018-02-11 05:27:51 +08:00
} else {
2018-02-16 10:32:39 +08:00
//datacenters+2 so that the configure database workload can configure into three_data_hall
2018-03-31 10:26:22 +08:00
machine_count = std : : max ( datacenters + 2 , ( ( db . minDatacentersRequired ( ) > 0 ) ? datacenters : 1 ) * std : : max ( 3 , db . minMachinesRequiredPerDatacenter ( ) ) ) ;
2018-02-16 10:32:39 +08:00
machine_count = g_random - > randomInt ( machine_count , std : : max ( machine_count + 1 , extraDB ? 6 : 10 ) ) ;
2018-02-11 05:27:51 +08:00
}
2018-02-16 10:32:39 +08:00
//because we protect a majority of coordinators from being killed, it is better to run with low numbers of coordinators to prevent too many processes from being protected
2019-01-19 07:42:48 +08:00
coordinators = ( minimumRegions < = 1 & & BUGGIFY ) ? g_random - > randomInt ( 1 , std : : max ( machine_count , 2 ) ) : 1 ;
2018-02-16 10:32:39 +08:00
2017-11-29 10:20:29 +08:00
if ( minimumReplication > 1 & & datacenters = = 3 ) {
//low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for logs to recover.
machine_count = std : : max ( machine_count , 6 ) ;
2018-02-16 10:32:39 +08:00
coordinators = 3 ;
2017-11-29 10:20:29 +08:00
}
2018-02-19 04:59:43 +08:00
if ( generateFearless ) {
processes_per_machine = 1 ;
} else {
processes_per_machine = g_random - > randomInt ( 1 , ( extraDB ? 14 : 28 ) / machine_count + 2 ) ;
}
2017-08-12 06:06:36 +08:00
}
2017-05-26 04:48:44 +08:00
void setupSimulatedSystem ( vector < Future < Void > > * systemActors , std : : string baseFolder ,
2018-04-26 09:29:29 +08:00
int * pTesterCount , Optional < ClusterConnectionString > * pConnString ,
2018-09-18 09:32:39 +08:00
Standalone < StringRef > * pStartingConfiguration , int extraDB , int minimumReplication , int minimumRegions , Reference < TLSOptions > tlsOptions )
2017-05-26 04:48:44 +08:00
{
// SOMEDAY: this does not test multi-interface configurations
2018-09-18 09:32:39 +08:00
SimulationConfig simconfig ( extraDB , minimumReplication , minimumRegions ) ;
2018-03-06 11:27:46 +08:00
StatusObject startingConfigJSON = simconfig . db . toJSON ( true ) ;
std : : string startingConfigString = " new " ;
for ( auto kv : startingConfigJSON ) {
startingConfigString + = " " ;
if ( kv . second . type ( ) = = json_spirit : : int_type ) {
2019-02-01 10:20:14 +08:00
startingConfigString + = kv . first + " := " + format ( " %d " , kv . second . get_int ( ) ) ;
2018-03-06 11:27:46 +08:00
} else if ( kv . second . type ( ) = = json_spirit : : str_type ) {
2019-02-01 10:20:14 +08:00
startingConfigString + = kv . second . get_str ( ) ;
2018-03-06 11:27:46 +08:00
} else if ( kv . second . type ( ) = = json_spirit : : array_type ) {
2019-02-01 10:20:14 +08:00
startingConfigString + = kv . first + " = " + json_spirit : : write_string ( json_spirit : : mValue ( kv . second . get_array ( ) ) , json_spirit : : Output_options : : none ) ;
2018-03-06 11:27:46 +08:00
} else {
ASSERT ( false ) ;
}
}
2017-08-22 13:29:56 +08:00
g_simulator . storagePolicy = simconfig . db . storagePolicy ;
g_simulator . tLogPolicy = simconfig . db . tLogPolicy ;
g_simulator . tLogWriteAntiQuorum = simconfig . db . tLogWriteAntiQuorum ;
2018-06-18 10:31:15 +08:00
g_simulator . remoteTLogPolicy = simconfig . db . getRemoteTLogPolicy ( ) ;
g_simulator . usableRegions = simconfig . db . usableRegions ;
2017-10-20 06:49:54 +08:00
2018-06-29 14:15:32 +08:00
if ( simconfig . db . regions . size ( ) > 0 ) {
2018-03-06 11:27:46 +08:00
g_simulator . primaryDcId = simconfig . db . regions [ 0 ] . dcId ;
2018-06-11 11:20:41 +08:00
g_simulator . hasSatelliteReplication = simconfig . db . regions [ 0 ] . satelliteTLogReplicationFactor > 0 ;
2018-06-29 14:15:32 +08:00
if ( simconfig . db . regions [ 0 ] . satelliteTLogUsableDcsFallback > 0 ) {
2018-07-10 13:01:46 +08:00
g_simulator . satelliteTLogPolicyFallback = simconfig . db . regions [ 0 ] . satelliteTLogPolicyFallback ;
g_simulator . satelliteTLogWriteAntiQuorumFallback = simconfig . db . regions [ 0 ] . satelliteTLogWriteAntiQuorumFallback ;
2018-06-29 14:15:32 +08:00
} else {
2018-07-10 13:01:46 +08:00
g_simulator . satelliteTLogPolicyFallback = simconfig . db . regions [ 0 ] . satelliteTLogPolicy ;
g_simulator . satelliteTLogWriteAntiQuorumFallback = simconfig . db . regions [ 0 ] . satelliteTLogWriteAntiQuorum ;
2018-03-06 11:27:46 +08:00
}
2018-07-10 13:01:46 +08:00
g_simulator . satelliteTLogPolicy = simconfig . db . regions [ 0 ] . satelliteTLogPolicy ;
g_simulator . satelliteTLogWriteAntiQuorum = simconfig . db . regions [ 0 ] . satelliteTLogWriteAntiQuorum ;
2018-03-16 01:59:30 +08:00
for ( auto s : simconfig . db . regions [ 0 ] . satellites ) {
g_simulator . primarySatelliteDcIds . push_back ( s . dcId ) ;
}
2018-03-06 11:27:46 +08:00
} else {
g_simulator . hasSatelliteReplication = false ;
g_simulator . satelliteTLogWriteAntiQuorum = 0 ;
}
2018-06-29 14:15:32 +08:00
if ( simconfig . db . regions . size ( ) = = 2 ) {
g_simulator . remoteDcId = simconfig . db . regions [ 1 ] . dcId ;
ASSERT ( ( ! simconfig . db . regions [ 0 ] . satelliteTLogPolicy & & ! simconfig . db . regions [ 1 ] . satelliteTLogPolicy ) | | simconfig . db . regions [ 0 ] . satelliteTLogPolicy - > info ( ) = = simconfig . db . regions [ 1 ] . satelliteTLogPolicy - > info ( ) ) ;
for ( auto s : simconfig . db . regions [ 1 ] . satellites ) {
g_simulator . remoteSatelliteDcIds . push_back ( s . dcId ) ;
}
}
2018-07-10 13:24:43 +08:00
if ( g_simulator . usableRegions < 2 | | ! g_simulator . hasSatelliteReplication ) {
g_simulator . allowLogSetKills = false ;
}
2017-10-20 06:49:54 +08:00
ASSERT ( g_simulator . storagePolicy & & g_simulator . tLogPolicy ) ;
ASSERT ( ! g_simulator . hasSatelliteReplication | | g_simulator . satelliteTLogPolicy ) ;
2018-06-09 02:11:08 +08:00
TraceEvent ( " SimulatorConfig " ) . detail ( " ConfigString " , printable ( StringRef ( startingConfigString ) ) ) ;
2017-05-26 04:48:44 +08:00
2017-08-22 13:29:56 +08:00
const int dataCenters = simconfig . datacenters ;
const int machineCount = simconfig . machine_count ;
const int coordinatorCount = simconfig . coordinators ;
const int processesPerMachine = simconfig . processes_per_machine ;
2017-05-26 04:48:44 +08:00
// half the time, when we have more than 4 machines that are not the first in their dataCenter, assign classes
2018-02-11 05:27:51 +08:00
bool assignClasses = machineCount - dataCenters > 4 & & g_random - > random01 ( ) < 0.5 ;
2017-05-26 04:48:44 +08:00
2017-10-30 23:31:01 +08:00
// Use SSL 5% of the time
2019-02-05 03:39:49 +08:00
bool sslEnabled = g_random - > random01 ( ) < 0.10 & & tlsOptions - > enabled ( ) ;
2019-02-01 10:20:14 +08:00
bool sslOnly = sslEnabled & & g_random - > coinflip ( ) ;
g_simulator . listenersPerProcess = sslEnabled & & ! sslOnly ? 2 : 1 ;
2017-05-26 04:48:44 +08:00
TEST ( sslEnabled ) ; // SSL enabled
TEST ( ! sslEnabled ) ; // SSL disabled
vector < NetworkAddress > coordinatorAddresses ;
2018-09-18 09:32:39 +08:00
if ( minimumRegions > 1 ) {
//do not put coordinators in the primary region so that we can kill that region safely
int nonPrimaryDcs = dataCenters / 2 ;
for ( int dc = 1 ; dc < dataCenters ; dc + = 2 ) {
int dcCoordinators = coordinatorCount / nonPrimaryDcs + ( ( dc - 1 ) / 2 < coordinatorCount % nonPrimaryDcs ) ;
for ( int m = 0 ; m < dcCoordinators ; m + + ) {
uint32_t ip = 2 < < 24 | dc < < 16 | 1 < < 8 | m ;
2019-02-01 10:20:14 +08:00
coordinatorAddresses . push_back ( NetworkAddress ( ip , sslEnabled & & ! sslOnly ? 2 : 1 , true , sslEnabled & & sslOnly ) ) ;
2018-09-18 09:32:39 +08:00
TraceEvent ( " SelectedCoordinator " ) . detail ( " Address " , coordinatorAddresses . back ( ) ) ;
}
}
} else {
2019-01-19 07:42:48 +08:00
int assignedMachines = 0 ;
int coordCount = coordinatorCount ;
if ( coordinatorCount > 4 ) {
+ + coordCount ;
}
2018-09-18 09:32:39 +08:00
for ( int dc = 0 ; dc < dataCenters ; dc + + ) {
2019-01-19 07:42:48 +08:00
int dcCoordinators = coordCount / dataCenters + ( dc < coordCount % dataCenters ) ;
int machines = machineCount / dataCenters + ( dc < machineCount % dataCenters ) ;
2018-09-18 09:32:39 +08:00
for ( int m = 0 ; m < dcCoordinators ; m + + ) {
2019-01-19 07:42:48 +08:00
if ( coordinatorCount > 4 & & ( assignedMachines = = 4 | | ( m + 1 = = dcCoordinators & & assignedMachines < 4 & & assignedMachines + machines - dcCoordinators > = 4 ) ) ) {
uint32_t ip = 2 < < 24 | dc < < 16 | 1 < < 8 | m ;
TraceEvent ( " SkippedCoordinator " ) . detail ( " Address " , ip ) . detail ( " M " , m ) . detail ( " Machines " , machines ) . detail ( " Assigned " , assignedMachines ) . detail ( " DcCoord " , dcCoordinators ) . detail ( " CoordinatorCount " , coordinatorCount ) ;
} else {
uint32_t ip = 2 < < 24 | dc < < 16 | 1 < < 8 | m ;
2019-02-13 17:52:59 +08:00
coordinatorAddresses . push_back ( NetworkAddress ( ip , sslEnabled & & ! sslOnly ? 2 : 1 , true , sslEnabled & & sslOnly ) ) ;
2019-01-19 07:42:48 +08:00
TraceEvent ( " SelectedCoordinator " ) . detail ( " Address " , coordinatorAddresses . back ( ) ) . detail ( " M " , m ) . detail ( " Machines " , machines ) . detail ( " Assigned " , assignedMachines ) . detail ( " DcCoord " , dcCoordinators ) . detail ( " P1 " , ( m + 1 = = dcCoordinators ) ) . detail ( " P2 " , ( assignedMachines < 4 ) ) . detail ( " P3 " , ( assignedMachines + machines - dcCoordinators > = 4 ) ) . detail ( " CoordinatorCount " , coordinatorCount ) ;
}
assignedMachines + + ;
2018-09-18 09:32:39 +08:00
}
2019-01-19 07:42:48 +08:00
assignedMachines + = machines - dcCoordinators ;
2017-05-26 04:48:44 +08:00
}
}
2017-05-27 05:20:11 +08:00
2017-05-26 04:48:44 +08:00
g_random - > randomShuffle ( coordinatorAddresses ) ;
for ( int i = 0 ; i < ( coordinatorAddresses . size ( ) / 2 ) + 1 ; i + + ) {
2017-06-20 07:48:15 +08:00
TraceEvent ( " ProtectCoordinator " ) . detail ( " Address " , coordinatorAddresses [ i ] ) . detail ( " Coordinators " , describe ( coordinatorAddresses ) ) . backtrace ( ) ;
2017-06-03 04:52:21 +08:00
g_simulator . protectedAddresses . insert ( NetworkAddress ( coordinatorAddresses [ i ] . ip , coordinatorAddresses [ i ] . port , true , false ) ) ;
2019-02-01 10:20:14 +08:00
if ( coordinatorAddresses [ i ] . port = = 2 ) {
g_simulator . protectedAddresses . insert ( NetworkAddress ( coordinatorAddresses [ i ] . ip , 1 , true , false ) ) ;
}
2017-05-26 04:48:44 +08:00
}
g_random - > randomShuffle ( coordinatorAddresses ) ;
2017-05-27 05:20:11 +08:00
2017-05-26 04:48:44 +08:00
ASSERT ( coordinatorAddresses . size ( ) = = coordinatorCount ) ;
ClusterConnectionString conn ( coordinatorAddresses , LiteralStringRef ( " TestCluster:0 " ) ) ;
2018-04-27 08:24:40 +08:00
2018-04-19 10:34:35 +08:00
// If extraDB==0, leave g_simulator.extraDB as null because the test does not use DR.
if ( extraDB = = 1 ) {
// The DR database can be either a new database or itself
g_simulator . extraDB = new ClusterConnectionString ( coordinatorAddresses , BUGGIFY ? LiteralStringRef ( " TestCluster:0 " ) : LiteralStringRef ( " ExtraCluster:0 " ) ) ;
} else if ( extraDB = = 2 ) {
// The DR database is a new database
g_simulator . extraDB = new ClusterConnectionString ( coordinatorAddresses , LiteralStringRef ( " ExtraCluster:0 " ) ) ;
} else if ( extraDB = = 3 ) {
// The DR database is the same database
g_simulator . extraDB = new ClusterConnectionString ( coordinatorAddresses , LiteralStringRef ( " TestCluster:0 " ) ) ;
}
2017-05-26 04:48:44 +08:00
* pConnString = conn ;
2018-03-06 11:27:46 +08:00
TraceEvent ( " SimulatedConnectionString " ) . detail ( " String " , conn . toString ( ) ) . detail ( " ConfigString " , printable ( StringRef ( startingConfigString ) ) ) ;
2017-05-26 04:48:44 +08:00
int assignedMachines = 0 , nonVersatileMachines = 0 ;
for ( int dc = 0 ; dc < dataCenters ; dc + + ) {
2018-02-11 05:27:51 +08:00
//FIXME: test unset dcID
2017-10-20 06:49:54 +08:00
Optional < Standalone < StringRef > > dcUID = StringRef ( format ( " %d " , dc ) ) ;
2017-05-26 04:48:44 +08:00
std : : vector < UID > machineIdentities ;
int machines = machineCount / dataCenters + ( dc < machineCount % dataCenters ) ; // add remainder of machines to first datacenter
int dcCoordinators = coordinatorCount / dataCenters + ( dc < coordinatorCount % dataCenters ) ;
printf ( " Datacenter %d: %d/%d machines, %d/%d coordinators \n " , dc , machines , machineCount , dcCoordinators , coordinatorCount ) ;
ASSERT ( dcCoordinators < = machines ) ;
int useSeedForMachine = g_random - > randomInt ( 0 , machines ) ;
2019-01-19 07:42:48 +08:00
Standalone < StringRef > zoneId ;
Standalone < StringRef > newZoneId ;
2017-05-26 04:48:44 +08:00
for ( int machine = 0 ; machine < machines ; machine + + ) {
2019-01-19 07:42:48 +08:00
Standalone < StringRef > machineId ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
if ( machine = = 0 | | machineCount - dataCenters < = 4 | | assignedMachines ! = 4 | | simconfig . db . regions . size ( ) | | g_random - > random01 ( ) < 0.5 ) {
zoneId = g_random - > randomUniqueID ( ) . toString ( ) ;
newZoneId = g_random - > randomUniqueID ( ) . toString ( ) ;
}
2017-05-26 04:48:44 +08:00
//Choose a machine class
ProcessClass processClass = ProcessClass ( ProcessClass : : UnsetClass , ProcessClass : : CommandLineSource ) ;
if ( assignClasses ) {
if ( assignedMachines < 4 )
processClass = ProcessClass ( ( ProcessClass : : ClassType ) g_random - > randomInt ( 0 , 2 ) , ProcessClass : : CommandLineSource ) ; //Unset or Storage
2018-04-12 12:22:53 +08:00
else if ( assignedMachines = = 4 & & ! simconfig . db . regions . size ( ) )
2017-05-26 04:48:44 +08:00
processClass = ProcessClass ( ( ProcessClass : : ClassType ) ( g_random - > randomInt ( 0 , 2 ) * ProcessClass : : ResolutionClass ) , ProcessClass : : CommandLineSource ) ; //Unset or Resolution
else
processClass = ProcessClass ( ( ProcessClass : : ClassType ) g_random - > randomInt ( 0 , 3 ) , ProcessClass : : CommandLineSource ) ; //Unset, Storage, or Transaction
if ( processClass = = ProcessClass : : ResolutionClass ) // *can't* be assigned to other roles, even in an emergency
nonVersatileMachines + + ;
}
std : : vector < uint32_t > ips ;
for ( int i = 0 ; i < processesPerMachine ; i + + ) {
ips . push_back ( 2 < < 24 | dc < < 16 | g_random - > randomInt ( 1 , i + 2 ) < < 8 | machine ) ;
}
// check the sslEnablementMap using only one ip(
2019-01-19 07:42:48 +08:00
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , zoneId , machineId , dcUID ) ;
2017-05-26 04:48:44 +08:00
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
2018-04-26 09:29:29 +08:00
systemActors - > push_back ( reportErrors ( simulatedMachine ( conn , ips , sslEnabled , tlsOptions ,
2019-02-01 10:20:14 +08:00
localities , processClass , baseFolder , false , machine = = useSeedForMachine , true , sslOnly ) , " SimulatedMachine " ) ) ;
2017-05-26 04:48:44 +08:00
2018-04-19 10:34:35 +08:00
if ( extraDB & & g_simulator . extraDB - > toString ( ) ! = conn . toString ( ) ) {
2017-05-26 04:48:44 +08:00
std : : vector < uint32_t > extraIps ;
for ( int i = 0 ; i < processesPerMachine ; i + + ) {
extraIps . push_back ( 4 < < 24 | dc < < 16 | g_random - > randomInt ( 1 , i + 2 ) < < 8 | machine ) ;
}
2019-01-19 07:42:48 +08:00
Standalone < StringRef > newMachineId ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , newZoneId , newMachineId , dcUID ) ;
2017-05-26 04:48:44 +08:00
localities . set ( LiteralStringRef ( " data_hall " ) , dcUID ) ;
2018-04-26 09:29:29 +08:00
systemActors - > push_back ( reportErrors ( simulatedMachine ( * g_simulator . extraDB , extraIps , sslEnabled , tlsOptions ,
2017-05-26 04:48:44 +08:00
localities ,
2019-02-01 10:20:14 +08:00
processClass , baseFolder , false , machine = = useSeedForMachine , false , sslOnly ) , " SimulatedMachine " ) ) ;
2017-05-26 04:48:44 +08:00
}
assignedMachines + + ;
}
}
g_simulator . desiredCoordinators = coordinatorCount ;
g_simulator . physicalDatacenters = dataCenters ;
g_simulator . processesPerMachine = processesPerMachine ;
TraceEvent ( " SetupSimulatorSettings " )
2018-06-09 02:11:08 +08:00
. detail ( " DesiredCoordinators " , g_simulator . desiredCoordinators )
. detail ( " PhysicalDatacenters " , g_simulator . physicalDatacenters )
. detail ( " ProcessesPerMachine " , g_simulator . processesPerMachine ) ;
2017-05-26 04:48:44 +08:00
// SOMEDAY: add locality for testers to simulate network topology
// FIXME: Start workers with tester class instead, at least sometimes run tests with the testers-only flag
int testerCount = * pTesterCount = g_random - > randomInt ( 4 , 9 ) ;
int useSeedForMachine = g_random - > randomInt ( 0 , testerCount ) ;
for ( int i = 0 ; i < testerCount ; i + + ) {
std : : vector < uint32_t > ips ;
ips . push_back ( 0x03040301 + i ) ;
Standalone < StringRef > newZoneId = Standalone < StringRef > ( g_random - > randomUniqueID ( ) . toString ( ) ) ;
LocalityData localities ( Optional < Standalone < StringRef > > ( ) , newZoneId , newZoneId , Optional < Standalone < StringRef > > ( ) ) ;
systemActors - > push_back ( reportErrors ( simulatedMachine (
2018-04-26 09:29:29 +08:00
conn , ips , sslEnabled , tlsOptions ,
2017-05-26 04:48:44 +08:00
localities , ProcessClass ( ProcessClass : : TesterClass , ProcessClass : : CommandLineSource ) ,
2019-02-01 10:20:14 +08:00
baseFolder , false , i = = useSeedForMachine , false , sslEnabled ) ,
2017-05-26 04:48:44 +08:00
" SimulatedTesterMachine " ) ) ;
}
* pStartingConfiguration = startingConfigString ;
// save some state that we only need when restarting the simulator.
g_simulator . connectionString = conn . toString ( ) ;
g_simulator . testerCount = testerCount ;
TraceEvent ( " SimulatedClusterStarted " )
. detail ( " DataCenters " , dataCenters )
. detail ( " ServerMachineCount " , machineCount )
. detail ( " ProcessesPerServer " , processesPerMachine )
. detail ( " SSLEnabled " , sslEnabled )
2019-02-01 10:20:14 +08:00
. detail ( " SSLOnly " , sslOnly )
2017-05-26 04:48:44 +08:00
. detail ( " ClassesAssigned " , assignClasses )
2018-02-27 05:15:44 +08:00
. detail ( " StartingConfiguration " , pStartingConfiguration - > toString ( ) ) ;
2017-05-26 04:48:44 +08:00
}
2018-09-18 09:32:39 +08:00
void checkExtraDB ( const char * testFile , int & extraDB , int & minimumReplication , int & minimumRegions ) {
2017-05-26 04:48:44 +08:00
std : : ifstream ifs ;
ifs . open ( testFile , std : : ifstream : : in ) ;
if ( ! ifs . good ( ) )
2017-09-16 08:55:01 +08:00
return ;
2017-05-26 04:48:44 +08:00
std : : string cline ;
while ( ifs . good ( ) ) {
getline ( ifs , cline ) ;
std : : string line = removeWhitespace ( std : : string ( cline ) ) ;
if ( ! line . size ( ) | | line . find ( ' ; ' ) = = 0 )
continue ;
size_t found = line . find ( ' = ' ) ;
if ( found = = std : : string : : npos )
// hmmm, not good
continue ;
std : : string attrib = removeWhitespace ( line . substr ( 0 , found ) ) ;
std : : string value = removeWhitespace ( line . substr ( found + 1 ) ) ;
if ( attrib = = " extraDB " ) {
2017-09-16 08:55:01 +08:00
sscanf ( value . c_str ( ) , " %d " , & extraDB ) ;
}
if ( attrib = = " minimumReplication " ) {
sscanf ( value . c_str ( ) , " %d " , & minimumReplication ) ;
2017-05-26 04:48:44 +08:00
}
2018-09-18 09:32:39 +08:00
if ( attrib = = " minimumRegions " ) {
sscanf ( value . c_str ( ) , " %d " , & minimumRegions ) ;
}
2017-05-26 04:48:44 +08:00
}
ifs . close ( ) ;
}
2018-04-26 09:29:29 +08:00
// Top-level driver for one simulated test run: boots (or, when rebooting,
// restores) a simulated FoundationDB cluster inside g_simulator, runs the
// workloads named in testFile against it, then tears the simulation down.
// ACTOR void: errors are trapped locally and traced; nothing propagates out.
ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool rebooting, Reference<TLSOptions> tlsOptions) {
	state vector<Future<Void>> systemActors;
	state Optional<ClusterConnectionString> connFile;
	state Standalone<StringRef> startingConfiguration;
	state int testerCount = 1;
	state int extraDB = 0;
	state int minimumReplication = 0;
	state int minimumRegions = 0;
	// Pre-scan the test file: these three settings shape how the cluster is built below.
	checkExtraDB(testFile, extraDB, minimumReplication, minimumRegions);

	// Create a dedicated simulated "TestSystem" process and switch this actor onto it
	// before doing anything else, so all subsequent work runs in simulated context.
	wait(g_simulator.onProcess(g_simulator.newProcess(
			"TestSystem", 0x01010101, 1, 1, LocalityData(Optional<Standalone<StringRef>>(), Standalone<StringRef>(g_random->randomUniqueID().toString()), Standalone<StringRef>(g_random->randomUniqueID().toString()), Optional<Standalone<StringRef>>()), ProcessClass(ProcessClass::TesterClass, ProcessClass::CommandLineSource), "", ""), TaskDefaultYield));
	Sim2FileSystem::newFileSystem();
	FlowTransport::createInstance(1);
	if (tlsOptions->enabled()) {
		simInitTLS(tlsOptions);
	}

	TEST(true);  // Simulation start

	try {
		//systemActors.push_back( startSystemMonitor(dataFolder) );
		if (rebooting) {
			// Second phase of a restart test: rebuild the cluster from the on-disk
			// state left by a previous run; 100s cap so a wedged restore fails fast.
			wait( timeoutError( restartSimulatedSystem( &systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, tlsOptions, extraDB), 100.0 ) );
		}
		else {
			g_expect_full_pointermap = 1;
			// Fresh cluster: spawns the simulated machines as actors in systemActors
			// and fills in connFile/startingConfiguration for the test harness.
			setupSimulatedSystem( &systemActors, dataFolder, &testerCount, &connFile, &startingConfiguration, extraDB, minimumReplication, minimumRegions, tlsOptions );
			wait( delay(1.0) ); // FIXME: WHY!!!  //wait for machines to boot
		}
		// Materialize the connection string as a cluster file in a unique directory
		// so the testers can be pointed at it like a real deployment.
		std::string clusterFileDir = joinPath( dataFolder, g_random->randomUniqueID().toString() );
		platform::createDirectory( clusterFileDir );
		writeFile(joinPath(clusterFileDir, "fdb.cluster"), connFile.get().toString());
		// Run the workloads; buggify slows everything down, hence the much larger timeout.
		wait(timeoutError(runTests(Reference<ClusterConnectionFile>(new ClusterConnectionFile(joinPath(clusterFileDir, "fdb.cluster"))), TEST_TYPE_FROM_FILE, TEST_ON_TESTERS, testerCount, testFile, startingConfiguration), buggifyActivated ? 36000.0 : 5400.0));
	} catch (Error& e) {
		// Trap every failure here: this actor is void, so an escaping error
		// would otherwise be lost. The trace is the test's failure record.
		TraceEvent(SevError, "SetupAndRunError").error(e);
	}

	TraceEvent("SimulatedSystemDestruct");
	destructed = true;   // global flag — presumably consumed by shutdown checks elsewhere; TODO confirm
	systemActors.clear();
	g_simulator.stop();
}