simulated cluster supports fearless configurations
Removed unused simulation variables.
Run the simulation with only 1 coordinator most of the time: we protect the coordinator from being killed, and protecting too many things is bad for simulation.
This commit is contained in:
parent
ad19d3926b
commit
cb25564d38
|
@ -737,10 +737,6 @@ ACTOR Future<CoordinatorsResult::Type> changeQuorum( Database cx, Reference<IQuo
|
|||
state Transaction tr(cx);
|
||||
state int retries = 0;
|
||||
|
||||
//quorum changes do not balance coordinators evenly across datacenters
|
||||
if(g_network->isSimulated())
|
||||
g_simulator.maxCoordinatorsInDatacenter = g_simulator.killableMachines + 1;
|
||||
|
||||
loop {
|
||||
try {
|
||||
tr.setOption( FDBTransactionOptions::LOCK_AWARE );
|
||||
|
|
|
@ -1194,10 +1194,9 @@ public:
|
|||
}
|
||||
}
|
||||
virtual void killProcess( ProcessInfo* machine, KillType kt ) {
|
||||
TraceEvent("attemptingKillProcess").detail("killedMachines", killedMachines).detail("killableMachines", killableMachines);
|
||||
TraceEvent("attemptingKillProcess");
|
||||
if (kt < RebootAndDelete ) {
|
||||
killProcess_internal( machine, kt );
|
||||
killedMachines++;
|
||||
}
|
||||
}
|
||||
virtual void killInterface( NetworkAddress address, KillType kt ) {
|
||||
|
@ -1205,7 +1204,6 @@ public:
|
|||
std::vector<ProcessInfo*>& processes = machines[ addressMap[address]->locality.zoneId() ].processes;
|
||||
for( int i = 0; i < processes.size(); i++ )
|
||||
killProcess_internal( processes[i], kt );
|
||||
killedMachines++;
|
||||
}
|
||||
}
|
||||
virtual bool killMachine(Optional<Standalone<StringRef>> zoneId, KillType kt, bool killIsSafe, bool forceKill, KillType* ktFinal) {
|
||||
|
@ -1319,7 +1317,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KilledMachines", killedMachines).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig).detail("killIsSafe", killIsSafe);
|
||||
TraceEvent("KillMachine", zoneId).detailext("ZoneId", zoneId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig).detail("killIsSafe", killIsSafe);
|
||||
if (kt < RebootAndDelete ) {
|
||||
if(kt == InjectFaults && machines[zoneId].machineProcess != nullptr)
|
||||
killProcess_internal( machines[zoneId].machineProcess, kt );
|
||||
|
@ -1425,10 +1423,6 @@ public:
|
|||
}
|
||||
|
||||
TraceEvent("killDataCenter")
|
||||
.detail("killedMachines", killedMachines)
|
||||
.detail("killableMachines", killableMachines)
|
||||
.detail("killableDatacenters", killableDatacenters)
|
||||
.detail("maxCoordinatorsInDatacenter", maxCoordinatorsInDatacenter)
|
||||
.detail("DcZones", datacenterZones.size())
|
||||
.detail("DcProcesses", dcProcesses)
|
||||
.detailext("DCID", dcId)
|
||||
|
|
|
@ -34,7 +34,7 @@ enum ClogMode { ClogDefault, ClogAll, ClogSend, ClogReceive };
|
|||
|
||||
class ISimulator : public INetwork {
|
||||
public:
|
||||
ISimulator() : killedMachines(0), killableMachines(0), machinesNeededForProgress(3), neededDatacenters(1), killableDatacenters(0), killedDatacenters(0), maxCoordinatorsInDatacenter(0), desiredCoordinators(1), processesPerMachine(0), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), extraDB(NULL) {}
|
||||
ISimulator() : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), extraDB(NULL) {}
|
||||
|
||||
// Order matters!
|
||||
enum KillType { None, KillInstantly, InjectFaults, RebootAndDelete, Reboot, RebootProcessAndDelete, RebootProcess };
|
||||
|
@ -270,17 +270,8 @@ public:
|
|||
virtual void destroyProcess( ProcessInfo *p ) = 0;
|
||||
virtual void destroyMachine(Optional<Standalone<StringRef>> const& zoneId ) = 0;
|
||||
|
||||
// These are here for reasoning about whether it is possible to kill machines (or delete their data)
|
||||
// and maintain the durability of the database.
|
||||
int killedMachines;
|
||||
int killableMachines;
|
||||
int machinesNeededForProgress;
|
||||
int desiredCoordinators;
|
||||
int neededDatacenters;
|
||||
int killedDatacenters;
|
||||
int killableDatacenters;
|
||||
int physicalDatacenters;
|
||||
int maxCoordinatorsInDatacenter;
|
||||
int processesPerMachine;
|
||||
std::set<NetworkAddress> protectedAddresses;
|
||||
std::map<NetworkAddress, ProcessInfo*> currentlyRebootingProcesses;
|
||||
|
|
|
@ -65,10 +65,18 @@ void parseReplicationPolicy(IRepPolicyRef* policy, ValueRef const& v) {
|
|||
}
|
||||
|
||||
void DatabaseConfiguration::setDefaultReplicationPolicy() {
|
||||
storagePolicy = IRepPolicyRef(new PolicyAcross(storageTeamSize, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
tLogPolicy = IRepPolicyRef(new PolicyAcross(tLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(remoteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(satelliteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
if(!storagePolicy) {
|
||||
storagePolicy = IRepPolicyRef(new PolicyAcross(storageTeamSize, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
}
|
||||
if(!tLogPolicy) {
|
||||
tLogPolicy = IRepPolicyRef(new PolicyAcross(tLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
}
|
||||
if(remoteTLogReplicationFactor > 0 && !remoteTLogPolicy) {
|
||||
remoteTLogPolicy = IRepPolicyRef(new PolicyAcross(remoteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
}
|
||||
if(satelliteTLogReplicationFactor > 0 && !satelliteTLogPolicy) {
|
||||
satelliteTLogPolicy = IRepPolicyRef(new PolicyAcross(satelliteTLogReplicationFactor, "zoneid", IRepPolicyRef(new PolicyOne())));
|
||||
}
|
||||
}
|
||||
|
||||
bool DatabaseConfiguration::isValid() const {
|
||||
|
@ -170,18 +178,33 @@ std::map<std::string, std::string> DatabaseConfiguration::toMap() const {
|
|||
result["remote_satellite_dcs"] = remoteDcStr;
|
||||
}
|
||||
|
||||
if(satelliteTLogReplicationFactor > 0) {
|
||||
result["satellite_replication"] = format("%d", satelliteTLogReplicationFactor);
|
||||
if(satelliteTLogReplicationFactor == 1 && satelliteTLogUsableDcs == 1 && satelliteTLogWriteAntiQuorum == 0) {
|
||||
result["satellite_redundancy_mode"] = "one_satellite_single";
|
||||
} else if(satelliteTLogReplicationFactor == 2 && satelliteTLogUsableDcs == 1 && satelliteTLogWriteAntiQuorum == 0) {
|
||||
result["satellite_redundancy_mode"] = "one_satellite_double";
|
||||
} else if(satelliteTLogReplicationFactor == 3 && satelliteTLogUsableDcs == 1 && satelliteTLogWriteAntiQuorum == 0) {
|
||||
result["satellite_redundancy_mode"] = "one_satellite_triple";
|
||||
} else if(satelliteTLogReplicationFactor == 4 && satelliteTLogUsableDcs == 2 && satelliteTLogWriteAntiQuorum == 0) {
|
||||
result["satellite_redundancy_mode"] = "two_satellite_safe";
|
||||
} else if(satelliteTLogReplicationFactor == 4 && satelliteTLogUsableDcs == 2 && satelliteTLogWriteAntiQuorum == 2) {
|
||||
result["satellite_redundancy_mode"] = "two_satellite_fast";
|
||||
} else if(satelliteTLogReplicationFactor == 0) {
|
||||
result["satellite_redundancy_mode"] = "none";
|
||||
} else {
|
||||
result["satellite_redundancy_mode"] = "custom";
|
||||
}
|
||||
|
||||
if( remoteTLogReplicationFactor == 1 )
|
||||
if( remoteTLogReplicationFactor == 1 ) {
|
||||
result["remote_redundancy_mode"] = "remote_single";
|
||||
else if( remoteTLogReplicationFactor == 2 )
|
||||
} else if( remoteTLogReplicationFactor == 2 ) {
|
||||
result["remote_redundancy_mode"] = "remote_double";
|
||||
else if( remoteTLogReplicationFactor == 3 )
|
||||
} else if( remoteTLogReplicationFactor == 3 ) {
|
||||
result["remote_redundancy_mode"] = "remote_triple";
|
||||
else if(remoteTLogReplicationFactor > 0)
|
||||
} else if(remoteTLogReplicationFactor == 0) {
|
||||
result["remote_redundancy_mode"] = "none";
|
||||
} else {
|
||||
result["remote_redundancy_mode"] = "custom";
|
||||
}
|
||||
|
||||
if( desiredTLogCount != -1 )
|
||||
result["logs"] = format("%d", desiredTLogCount);
|
||||
|
|
|
@ -54,8 +54,21 @@ struct DatabaseConfiguration {
|
|||
}
|
||||
|
||||
// SOMEDAY: think about changing storageTeamSize to durableStorageQuorum
|
||||
int32_t minMachinesRequired() const { return std::max(tLogReplicationFactor, storageTeamSize); }
|
||||
int32_t maxMachineFailuresTolerated() const { return std::min(tLogReplicationFactor - 1 - tLogWriteAntiQuorum, durableStorageQuorum - 1); }
|
||||
// Returns the minimum number of datacenters this configuration needs.
// With no primary datacenter id configured the answer is 1; otherwise it is
// 2 plus the number of primary and remote satellite datacenter ids.
int32_t minDatacentersRequired() const {
|
||||
// No primaryDcId set: a single datacenter is enough.
if(!primaryDcId.present()) return 1;
|
||||
// The constant 2 presumably accounts for the primary and remote datacenters - confirm against callers.
return 2 + primarySatelliteDcIds.size() + remoteSatelliteDcIds.size();
|
||||
}
|
||||
// Returns the largest of: the satellite TLog replication factor divided (integer division) by the
// number of usable satellite DCs, the remote TLog replication factor, the TLog replication factor,
// and the storage team size. The divisor is clamped to at least 1 so that
// satelliteTLogUsableDcs == 0 cannot cause a division by zero.
int32_t minMachinesRequiredPerDatacenter() const { return std::max( satelliteTLogReplicationFactor/std::max(1,satelliteTLogUsableDcs), std::max( remoteTLogReplicationFactor, std::max(tLogReplicationFactor, storageTeamSize) ) ); }
|
||||
|
||||
//Killing an entire datacenter counts as killing one machine in modes that support it
|
||||
// Returns the number of machine failures this configuration tolerates.
// In every branch the result is capped by durableStorageQuorum - 1 (the 1 + in the first branch
// is added on top of that cap); the log-side term depends on which of remote and satellite
// TLog replication are enabled.
int32_t maxMachineFailuresTolerated() const {
|
||||
// Remote and satellite logs both configured: take the better of the primary-TLog and
// satellite-TLog slack (replication factor - 1 - anti-quorum), cap by storage, then add 1.
if(remoteTLogReplicationFactor > 0 && satelliteTLogReplicationFactor > 0) {
|
||||
return 1 + std::min(std::max(tLogReplicationFactor - 1 - tLogWriteAntiQuorum, satelliteTLogReplicationFactor - 1 - satelliteTLogWriteAntiQuorum), durableStorageQuorum - 1);
|
||||
// Satellite logs only: the primary and satellite slack are summed before capping by storage.
} else if(satelliteTLogReplicationFactor > 0) {
|
||||
return std::min(tLogReplicationFactor + satelliteTLogReplicationFactor - 2 - tLogWriteAntiQuorum - satelliteTLogWriteAntiQuorum, durableStorageQuorum - 1);
|
||||
}
|
||||
// Neither of the above: limited by primary TLog slack and the storage quorum.
return std::min(tLogReplicationFactor - 1 - tLogWriteAntiQuorum, durableStorageQuorum - 1);
|
||||
}
|
||||
|
||||
// MasterProxy Servers
|
||||
int32_t masterProxyCount;
|
||||
|
@ -120,9 +133,7 @@ struct DatabaseConfiguration {
|
|||
if (ar.isDeserializing) {
|
||||
for(auto c=rawConfiguration.begin(); c!=rawConfiguration.end(); ++c)
|
||||
setInternal(c->key, c->value);
|
||||
if(!storagePolicy || !tLogPolicy) {
|
||||
setDefaultReplicationPolicy();
|
||||
}
|
||||
setDefaultReplicationPolicy();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -131,9 +142,7 @@ struct DatabaseConfiguration {
|
|||
this->rawConfiguration = rawConfig;
|
||||
for(auto c=rawConfiguration.begin(); c!=rawConfiguration.end(); ++c)
|
||||
setInternal(c->key, c->value);
|
||||
if(!storagePolicy || !tLogPolicy) {
|
||||
setDefaultReplicationPolicy();
|
||||
}
|
||||
setDefaultReplicationPolicy();
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
|
@ -588,10 +588,7 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>> *systemActors, st
|
|||
ini.SetMultiKey();
|
||||
|
||||
try {
|
||||
int dataCenters = atoi(ini.GetValue("META", "dataCenters"));
|
||||
int killableMachines = atoi(ini.GetValue("META", "killableMachines"));
|
||||
int machineCount = atoi(ini.GetValue("META", "machineCount"));
|
||||
int machinesNeededForProgress = atoi(ini.GetValue("META", "machinesNeededForProgress"));
|
||||
int processesPerMachine = atoi(ini.GetValue("META", "processesPerMachine"));
|
||||
int desiredCoordinators = atoi(ini.GetValue("META", "desiredCoordinators"));
|
||||
int testerCount = atoi(ini.GetValue("META", "testerCount"));
|
||||
|
@ -634,11 +631,6 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>> *systemActors, st
|
|||
processClass == ProcessClass::TesterClass ? "SimulatedTesterMachine" : "SimulatedMachine") );
|
||||
}
|
||||
|
||||
g_simulator.killableMachines = killableMachines;
|
||||
g_simulator.neededDatacenters = dataCenters;
|
||||
g_simulator.maxCoordinatorsInDatacenter = ((desiredCoordinators-1)/dataCenters) + 1;
|
||||
g_simulator.killableDatacenters = 0;
|
||||
g_simulator.machinesNeededForProgress = machinesNeededForProgress;
|
||||
g_simulator.desiredCoordinators = desiredCoordinators;
|
||||
g_simulator.processesPerMachine = processesPerMachine;
|
||||
}
|
||||
|
@ -647,11 +639,6 @@ ACTOR Future<Void> restartSimulatedSystem(vector<Future<Void>> *systemActors, st
|
|||
}
|
||||
|
||||
TraceEvent("RestartSimulatorSettings")
|
||||
.detail("killableMachines", g_simulator.killableMachines)
|
||||
.detail("neededDatacenters", g_simulator.neededDatacenters)
|
||||
.detail("killableDatacenters", g_simulator.killableDatacenters)
|
||||
.detail("machinesNeededForProgress", g_simulator.machinesNeededForProgress)
|
||||
.detail("maxCoordinatorsInDatacenter", g_simulator.maxCoordinatorsInDatacenter)
|
||||
.detail("desiredCoordinators", g_simulator.desiredCoordinators)
|
||||
.detail("processesPerMachine", g_simulator.processesPerMachine);
|
||||
|
||||
|
@ -757,18 +744,26 @@ void SimulationConfig::generateNormalConfig(int minimumReplication) {
|
|||
|
||||
if(datacenters == 2 && g_random->random01() < 0.5) {
|
||||
db.primaryDcId = LiteralStringRef("0");
|
||||
db.remoteDcId = LiteralStringRef("1");
|
||||
machine_count = g_random->randomInt( std::max( 6, datacenters*db.minMachinesRequired() ), std::max(extraDB ? 7 : 10, datacenters*db.minMachinesRequired() + 1) );
|
||||
} else {
|
||||
machine_count = g_random->randomInt( std::max( 2+datacenters, db.minMachinesRequired() ), extraDB ? 6 : 10 );
|
||||
db.remoteDcId = LiteralStringRef("1");
|
||||
}
|
||||
|
||||
if(db.tLogPolicy && db.tLogPolicy->info() == "data_hall^2 x zoneid^2 x 1") {
|
||||
machine_count = 9;
|
||||
} else {
|
||||
//datacenters+2 so that the configure database workload can configure into three_data_hall
|
||||
machine_count = std::max(datacenters+2, ((db.minDatacentersRequired() > 1) ? datacenters : 1) * std::max(3, db.minMachinesRequiredPerDatacenter()));
|
||||
machine_count = g_random->randomInt( machine_count, std::max(machine_count+1, extraDB ? 6 : 10) );
|
||||
}
|
||||
|
||||
//because we protect a majority of coordinators from being killed, it is better to run with low numbers of coordinators to prevent too many processes from being protected
|
||||
coordinators = BUGGIFY ? g_random->randomInt(1, machine_count+1) : 1;
|
||||
|
||||
if(minimumReplication > 1 && datacenters == 3) {
|
||||
//low latency tests in 3 data hall mode need 2 other data centers with 2 machines each to avoid waiting for logs to recover.
|
||||
machine_count = std::max( machine_count, 6);
|
||||
coordinators = 3;
|
||||
}
|
||||
processes_per_machine = g_random->randomInt(1, (extraDB ? 14 : 28)/machine_count + 2 );
|
||||
coordinators = BUGGIFY ? g_random->randomInt(1, machine_count+1) : std::min( machine_count, db.maxMachineFailuresTolerated()*2 + 1 );
|
||||
}
|
||||
|
||||
std::string SimulationConfig::toString() {
|
||||
|
@ -785,20 +780,53 @@ std::string SimulationConfig::toString() {
|
|||
config << " " << "storage_quorum:=" << db.durableStorageQuorum;
|
||||
}
|
||||
|
||||
if(dbconfig["remote_redundancy_mode"] != "none") {
|
||||
if (dbconfig["remote_redundancy_mode"] != "custom") {
|
||||
config << " " << dbconfig["remote_redundancy_mode"];
|
||||
} else {
|
||||
config << " " << "remote_log_replicas:=" << db.remoteTLogReplicationFactor;
|
||||
}
|
||||
}
|
||||
|
||||
if(dbconfig["satellite_redundancy_mode"] != "none") {
|
||||
if (dbconfig["satellite_redundancy_mode"] != "custom") {
|
||||
config << " " << dbconfig["satellite_redundancy_mode"];
|
||||
} else {
|
||||
config << " " << "satellite_log_replicas:=" << db.satelliteTLogReplicationFactor;
|
||||
config << " " << "satellite_anti_quorum:=" << db.satelliteTLogWriteAntiQuorum;
|
||||
config << " " << "satellite_usable_dcs:=" << db.satelliteTLogUsableDcs;
|
||||
}
|
||||
}
|
||||
|
||||
config << " logs=" << db.getDesiredLogs();
|
||||
config << " proxies=" << db.getDesiredProxies();
|
||||
config << " resolvers=" << db.getDesiredResolvers();
|
||||
if(db.remoteDesiredTLogCount > 0) {
|
||||
config << " remote_logs=" << db.remoteDesiredTLogCount;
|
||||
|
||||
if(db.remoteTLogReplicationFactor > 0) {
|
||||
config << " remote_logs=" << db.getDesiredRemoteLogs();
|
||||
config << " log_routers=" << db.getDesiredLogRouters();
|
||||
}
|
||||
if(db.satelliteDesiredTLogCount > 0) {
|
||||
config << " satellite_logs=" << db.satelliteDesiredTLogCount;
|
||||
|
||||
if(db.satelliteTLogReplicationFactor > 0) {
|
||||
config << " satellite_logs=" << db.getDesiredSatelliteLogs();
|
||||
}
|
||||
|
||||
if(db.primaryDcId.present()) {
|
||||
config << " primary_dc=" << db.primaryDcId.get().printable();
|
||||
config << " remote_dc=" << db.remoteDcId.get().printable();
|
||||
}
|
||||
|
||||
if(db.primarySatelliteDcIds.size()) {
|
||||
config << " primary_satellite_dcs=" << db.primarySatelliteDcIds[0].get().printable();
|
||||
for(int i = 1; i < db.primarySatelliteDcIds.size(); i++) {
|
||||
config << "," << db.primarySatelliteDcIds[i].get().printable();
|
||||
}
|
||||
config << " remote_satellite_dcs=" << db.remoteSatelliteDcIds[0].get().printable();
|
||||
for(int i = 1; i < db.remoteSatelliteDcIds.size(); i++) {
|
||||
config << "," << db.remoteSatelliteDcIds[i].get().printable();
|
||||
}
|
||||
}
|
||||
|
||||
config << " " << dbconfig["storage_engine"];
|
||||
return config.str();
|
||||
}
|
||||
|
@ -925,21 +953,12 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
|
|||
}
|
||||
|
||||
g_simulator.desiredCoordinators = coordinatorCount;
|
||||
g_simulator.killableMachines = simconfig.db.maxMachineFailuresTolerated();
|
||||
g_simulator.neededDatacenters = 1;
|
||||
g_simulator.killableDatacenters = 0;
|
||||
g_simulator.physicalDatacenters = dataCenters;
|
||||
g_simulator.maxCoordinatorsInDatacenter = ((coordinatorCount-1)/dataCenters) + 1;
|
||||
g_simulator.machinesNeededForProgress = simconfig.db.minMachinesRequired() + nonVersatileMachines;
|
||||
g_simulator.processesPerMachine = processesPerMachine;
|
||||
|
||||
TraceEvent("SetupSimulatorSettings")
|
||||
.detail("killableMachines", g_simulator.killableMachines)
|
||||
.detail("neededDatacenters", g_simulator.neededDatacenters)
|
||||
.detail("killableDatacenters", g_simulator.killableDatacenters)
|
||||
.detail("machinesNeededForProgress", g_simulator.machinesNeededForProgress)
|
||||
.detail("maxCoordinatorsInDatacenter", g_simulator.maxCoordinatorsInDatacenter)
|
||||
.detail("desiredCoordinators", g_simulator.desiredCoordinators)
|
||||
.detail("physicalDatacenters", g_simulator.physicalDatacenters)
|
||||
.detail("processesPerMachine", g_simulator.processesPerMachine);
|
||||
|
||||
// SOMEDAY: add locality for testers to simulate network topology
|
||||
|
@ -980,11 +999,8 @@ void setupSimulatedSystem( vector<Future<Void>> *systemActors, std::string baseF
|
|||
g_simulator.testerCount = testerCount;
|
||||
|
||||
TraceEvent("SimulatedClusterStarted")
|
||||
.detail("KillableMachines", g_simulator.killableMachines)
|
||||
.detail("DataCenters", dataCenters)
|
||||
.detail("NeededDataCenters", g_simulator.neededDatacenters)
|
||||
.detail("ServerMachineCount", machineCount)
|
||||
.detail("ServersNeededForProgress", g_simulator.machinesNeededForProgress)
|
||||
.detail("ProcessesPerServer", processesPerMachine)
|
||||
.detail("SSLEnabled", sslEnabled)
|
||||
.detail("ClassesAssigned", assignClasses)
|
||||
|
|
|
@ -1535,7 +1535,7 @@ static StatusObject faultToleranceStatusFetcher(DatabaseConfiguration configurat
|
|||
statusObj["max_machine_failures_without_losing_data"] = std::max(machineFailuresWithoutLosingData, 0);
|
||||
|
||||
// without losing availability
|
||||
statusObj["max_machine_failures_without_losing_availability"] = std::max(std::min(numTLogEligibleMachines - configuration.minMachinesRequired(), machineFailuresWithoutLosingData), 0);
|
||||
statusObj["max_machine_failures_without_losing_availability"] = std::max(std::min(numTLogEligibleMachines - configuration.minMachinesRequiredPerDatacenter(), machineFailuresWithoutLosingData), 0);
|
||||
return statusObj;
|
||||
}
|
||||
|
||||
|
|
|
@ -131,7 +131,6 @@
|
|||
<ActorCompiler Include="workloads\MetricLogging.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\RYWPerformance.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\RYWDisable.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\RestartRecovery.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\UnitTests.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\WorkerErrors.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\MemoryLifetime.actor.cpp" />
|
||||
|
|
|
@ -195,9 +195,6 @@
|
|||
</ActorCompiler>
|
||||
<ActorCompiler Include="TagPartitionedLogSystem.actor.cpp" />
|
||||
<ActorCompiler Include="LogSystemPeekCursor.actor.cpp" />
|
||||
<ActorCompiler Include="workloads\RestartRecovery.actor.cpp">
|
||||
<Filter>workloads</Filter>
|
||||
</ActorCompiler>
|
||||
<ActorCompiler Include="workloads\UnitTests.actor.cpp">
|
||||
<Filter>workloads</Filter>
|
||||
</ActorCompiler>
|
||||
|
|
|
@ -122,9 +122,6 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
|
||||
ASSERT( g_network->isSimulated() );
|
||||
|
||||
TEST(g_simulator.killableMachines > 0); // Some machines can be killed
|
||||
TEST(g_simulator.killableDatacenters > 0); // Some processes can be killed
|
||||
|
||||
if( self->killDc ) {
|
||||
Void _ = wait( delay( delayBeforeKill ) );
|
||||
|
||||
|
|
|
@ -1,69 +0,0 @@
|
|||
/*
|
||||
* RestartRecovery.actor.cpp
|
||||
*
|
||||
* This source file is part of the FoundationDB open source project
|
||||
*
|
||||
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "flow/actorcompiler.h"
|
||||
#include "fdbclient/NativeAPI.h"
|
||||
#include "fdbserver/TesterInterface.h"
|
||||
#include "workloads.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbserver/MasterInterface.h"
|
||||
#include "fdbclient/SystemData.h"
|
||||
#include "fdbserver/WorkerInterface.h"
|
||||
#include "fdbserver/ServerDBInfo.h"
|
||||
#include "fdbserver/QuietDatabase.h"
|
||||
|
||||
// Simulation workload that, after a configurable delay, reboots one TLog process and then,
// 8 seconds later, reboots a different TLog process with the RebootProcessAndDelete kill type.
// Only the client whose clientId is zero performs the action.
struct RestartRecoveryWorkload : TestWorkload {
|
||||
// Not read or written anywhere in the visible code of this struct.
std::string machineToKill;
|
||||
// True only when clientId is zero; gates start().
bool enabled;
|
||||
// Delay, in the units taken by delay(), before the first reboot; read from the "killAt" option.
double killAt;
|
||||
|
||||
// Reads the workload options: "killAt" defaults to 10.0.
RestartRecoveryWorkload(WorkloadContext const& wcx)
|
||||
: TestWorkload(wcx)
|
||||
{
|
||||
enabled = !clientId; // only do this on the "first" client
|
||||
killAt = getOption( options, LiteralStringRef("killAt"), 10.0 );
|
||||
}
|
||||
|
||||
virtual std::string description() { return "RestartRecoveryWorkload"; }
|
||||
// No setup work is needed.
virtual Future<Void> setup( Database const& cx ) { return Void(); }
|
||||
// Runs the assassin actor on the enabled client; every other client finishes immediately.
virtual Future<Void> start( Database const& cx ) {
|
||||
if (enabled)
|
||||
return assassin( cx, this );
|
||||
return Void();
|
||||
}
|
||||
// Always reports success.
virtual Future<bool> check( Database const& cx ) { return true; }
|
||||
// Reports no metrics.
virtual void getMetrics( vector<PerfMetric>& m ) {
|
||||
}
|
||||
|
||||
// Waits killAt, snapshots the currently present TLogs, and, if there are more than two of them
// and the simulator reports killable machines, reboots logs[2] and later logs[0].
ACTOR Future<Void> assassin( Database cx, RestartRecoveryWorkload* self ) {
|
||||
Void _ = wait( delay( self->killAt ) );
|
||||
state std::vector<TLogInterface> logs = self->dbInfo->get().logSystemConfig.allPresentLogs();
|
||||
// The size check keeps the logs[2] and logs[0] accesses below in bounds.
if(logs.size() > 2 && g_simulator.killableMachines > 0) {
|
||||
TraceEvent("RestartRecoveryReboot").detail("addr", logs[2].address());
|
||||
// First a plain process reboot of the third log.
g_simulator.rebootProcess( g_simulator.getProcessByAddress(NetworkAddress(logs[2].address().ip, logs[2].address().port, true, false)), ISimulator::RebootProcess );
|
||||
Void _ = wait( delay(8.0) );
|
||||
TraceEvent("RestartRecoveryKill");
|
||||
// Then reboot the first log with RebootProcessAndDelete.
g_simulator.rebootProcess( g_simulator.getProcessByAddress(NetworkAddress(logs[0].address().ip, logs[0].address().port, true, false)), ISimulator::RebootProcessAndDelete );
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
};
|
||||
|
||||
WorkloadFactory<RestartRecoveryWorkload> RestartRecoveryWorkloadFactory("RestartRecovery");
|
|
@ -59,9 +59,6 @@ struct SaveAndKillWorkload : TestWorkload {
|
|||
ini.SetUnicode();
|
||||
ini.LoadFile(self->restartInfo.c_str());
|
||||
|
||||
ini.SetValue("META", "killableMachines", format("%d", g_simulator.killableMachines).c_str());
|
||||
ini.SetValue("META", "dataCenters", format("%d", g_simulator.neededDatacenters).c_str());
|
||||
ini.SetValue("META", "machinesNeededForProgress", format("%d", g_simulator.machinesNeededForProgress).c_str());
|
||||
ini.SetValue("META", "processesPerMachine", format("%d", g_simulator.processesPerMachine).c_str());
|
||||
ini.SetValue("META", "desiredCoordinators", format("%d", g_simulator.desiredCoordinators).c_str());
|
||||
ini.SetValue("META", "connectionString", g_simulator.connectionString.c_str());
|
||||
|
|
|
@ -1,14 +0,0 @@
|
|||
testTitle=DDBalance_test
|
||||
testName=DDBalance
|
||||
testDuration=60.0
|
||||
transactionsPerSecond=250.0
|
||||
binCount=1000
|
||||
writesPerTransaction=5
|
||||
keySpaceDriftFactor=10
|
||||
moversPerClient=10
|
||||
actorsPerClient=100
|
||||
nodes=100000
|
||||
connectionFailuresDisableDuration=100000
|
||||
|
||||
testName=RestartRecovery
|
||||
killAt=30.0
|
Loading…
Reference in New Issue