Add simulation support for changing the cluster file
This commit is contained in:
parent
6e0835f8a8
commit
a72066be33
|
@ -54,6 +54,7 @@ public:
|
|||
FailDisk,
|
||||
RebootAndDelete,
|
||||
RebootProcessAndDelete,
|
||||
RebootProcessAndSwitch,
|
||||
Reboot,
|
||||
RebootProcess,
|
||||
None
|
||||
|
@ -304,6 +305,7 @@ public:
|
|||
KillType kt,
|
||||
bool forceKill = false,
|
||||
KillType* ktFinal = nullptr) = 0;
|
||||
// Applies kill type `kt` to every machine known to the simulator. The implementation in this change forwards
// `kt`, `forceKill` and `ktFinal` unchanged to killMachine() once per machine and returns true if at least one of
// those calls returned true. `forceKill` defaults to false and `ktFinal` to nullptr.
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
|
||||
// virtual KillType getMachineKillState( UID zoneID ) = 0;
|
||||
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
|
||||
std::vector<ProcessInfo*> const& deadProcesses,
|
||||
|
@ -390,6 +392,9 @@ public:
|
|||
return clearedAddresses.find(address) != clearedAddresses.end();
|
||||
}
|
||||
|
||||
// Flips the "switched cluster" flag recorded for `address`.
// std::map::operator[] value-initializes a missing entry to false, so the first call for an address leaves it true.
void switchCluster(NetworkAddress const& address) {
	bool& switched = switchedCluster[address];
	switched = !switched;
}
|
||||
// Returns the "switched cluster" flag recorded for `address`.
// An address that has never been passed to switchCluster() has no entry in the map; report it as not switched
// (false) instead of calling std::map::at(), which would throw std::out_of_range for a missing key.
bool hasSwitchedCluster(NetworkAddress const& address) const {
	auto it = switchedCluster.find(address);
	return it != switchedCluster.end() && it->second;
}
|
||||
|
||||
void excludeAddress(NetworkAddress const& address) {
|
||||
excludedAddresses[address]++;
|
||||
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
|
||||
|
@ -540,6 +545,7 @@ private:
|
|||
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
|
||||
std::map<NetworkAddress, int> excludedAddresses;
|
||||
std::map<NetworkAddress, int> clearedAddresses;
|
||||
// Per-address flag toggled by switchCluster() and queried by hasSwitchedCluster(). As far as this change shows, an
// address gets an entry only on its first switchCluster() call.
std::map<NetworkAddress, bool> switchedCluster;
|
||||
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
|
||||
std::map<std::string, double> disabledMap;
|
||||
bool allSwapsDisabled;
|
||||
|
|
|
@ -1794,6 +1794,15 @@ public:
|
|||
}
|
||||
return result;
|
||||
}
|
||||
// Applies `kt` to every entry in `machines` by delegating to killMachine(), forwarding `forceKill` and `ktFinal`
// unchanged to each call. Returns true if at least one killMachine() call returned true.
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
	bool anyKilled = false;
	for (auto& entry : machines) {
		// killMachine() is the left operand so it runs for every machine, even after an earlier success.
		anyKilled = killMachine(entry.second.machineId, kt, forceKill, ktFinal) || anyKilled;
	}
	return anyKilled;
}
|
||||
bool killMachine(Optional<Standalone<StringRef>> machineId,
|
||||
KillType kt,
|
||||
bool forceKill,
|
||||
|
@ -2008,7 +2017,7 @@ public:
|
|||
if (process->startingClass != ProcessClass::TesterClass)
|
||||
killProcess_internal(process, kt);
|
||||
}
|
||||
} else if (kt == Reboot || kt == RebootAndDelete) {
|
||||
} else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
|
||||
for (auto& process : machines[machineId].processes) {
|
||||
TraceEvent("KillMachineProcess")
|
||||
.detail("KillType", kt)
|
||||
|
@ -2564,7 +2573,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
|
|||
|
||||
try {
|
||||
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
|
||||
kt == ISimulator::RebootProcessAndDelete);
|
||||
kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
|
||||
|
||||
CODE_PROBE(kt == ISimulator::RebootProcess,
|
||||
"Simulated process rebooted",
|
||||
|
@ -2580,6 +2589,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
|
|||
"Simulated process rebooted with data and coordination state deletion",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
|
||||
"Simulated process rebooted with different cluster file",
|
||||
probe::assert::simOnly,
|
||||
probe::context::sim2);
|
||||
|
||||
if (p->rebooting || !p->isReliable()) {
|
||||
TraceEvent(SevDebug, "DoRebootFailed")
|
||||
|
@ -2608,6 +2621,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
|
|||
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
|
||||
p->cleared = true;
|
||||
g_simulator->clearAddress(p->address);
|
||||
} else if (kt == ISimulator::RebootProcessAndSwitch) {
|
||||
g_simulator->switchCluster(p->address);
|
||||
}
|
||||
p->shutdownSignal.send(kt);
|
||||
} catch (Error& e) {
|
||||
|
|
|
@ -620,6 +620,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
|
|||
std::string* coordFolder,
|
||||
std::string baseFolder,
|
||||
ClusterConnectionString connStr,
|
||||
ClusterConnectionString otherConnStr,
|
||||
bool useSeedFile,
|
||||
AgentMode runBackupAgents,
|
||||
std::string whitelistBinPaths,
|
||||
|
@ -830,6 +831,23 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
|
|||
connRecord =
|
||||
makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString());
|
||||
}
|
||||
} else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) {
|
||||
TraceEvent("SimulatedFDBDRebootAndSwitch")
|
||||
.detail("Cycles", cycles)
|
||||
.detail("RandomId", randomId)
|
||||
.detail("Address", process->address)
|
||||
.detail("ZoneId", localities.zoneId())
|
||||
.detail("KillType", shutdownResult)
|
||||
.detail("ConnectionString", connStr.toString())
|
||||
.detail("OtherConnectionString", otherConnStr.toString())
|
||||
.detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address));
|
||||
|
||||
// Handle the case where otherConnStr is '@'.
|
||||
if (otherConnStr.toString().size() > 1) {
|
||||
std::string newConnStr =
|
||||
g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString();
|
||||
connRecord = makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), newConnStr);
|
||||
}
|
||||
} else {
|
||||
TraceEvent("SimulatedFDBDJustRepeat")
|
||||
.detail("Cycles", cycles)
|
||||
|
@ -846,6 +864,7 @@ std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>>
|
|||
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per
|
||||
// process
|
||||
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
||||
ClusterConnectionString otherConnStr,
|
||||
std::vector<IPAddress> ips,
|
||||
bool sslEnabled,
|
||||
LocalityData localities,
|
||||
|
@ -924,6 +943,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
&coordFolders[i],
|
||||
baseFolder,
|
||||
connStr,
|
||||
otherConnStr,
|
||||
useSeedFile,
|
||||
agentMode,
|
||||
whitelistBinPaths,
|
||||
|
@ -942,6 +962,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
|
|||
&coordFolders[i],
|
||||
baseFolder,
|
||||
connStr,
|
||||
otherConnStr,
|
||||
useSeedFile,
|
||||
agentMode,
|
||||
whitelistBinPaths,
|
||||
|
@ -1311,6 +1332,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
|
|||
// SOMEDAY: parse backup agent from test file
|
||||
systemActors->push_back(reportErrors(
|
||||
simulatedMachine(conn,
|
||||
ClusterConnectionString(),
|
||||
ipAddrs,
|
||||
usingSSL,
|
||||
localities,
|
||||
|
@ -2346,7 +2368,10 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
|
|||
// check the sslEnablementMap using only one ip
|
||||
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
|
||||
localities.set("data_hall"_sr, dcUID);
|
||||
systemActors->push_back(reportErrors(simulatedMachine(conn,
|
||||
systemActors->push_back(reportErrors(
|
||||
simulatedMachine(conn,
|
||||
requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0))
|
||||
: ClusterConnectionString(),
|
||||
ips,
|
||||
sslEnabled,
|
||||
localities,
|
||||
|
@ -2376,6 +2401,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
|
|||
LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID);
|
||||
localities.set("data_hall"_sr, dcUID);
|
||||
systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase),
|
||||
conn,
|
||||
extraIps,
|
||||
sslEnabled,
|
||||
localities,
|
||||
|
@ -2422,6 +2448,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
|
|||
Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>());
|
||||
systemActors->push_back(
|
||||
reportErrors(simulatedMachine(conn,
|
||||
ClusterConnectionString(),
|
||||
ips,
|
||||
sslEnabled,
|
||||
localities,
|
||||
|
|
|
@ -357,6 +357,11 @@ struct MachineAttritionWorkload : FailureInjectionWorkload {
|
|||
TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt);
|
||||
|
||||
g_simulator->killDataHall(target, kt);
|
||||
} else if (!g_simulator->extraDatabases.empty() && deterministicRandom()->random01() < 0.1) {
|
||||
state ISimulator::KillType kt = ISimulator::RebootProcessAndSwitch;
|
||||
g_simulator->killAll(kt, true);
|
||||
wait(delay(self->testDuration / 2));
|
||||
g_simulator->killAll(kt, true);
|
||||
} else {
|
||||
state int killedMachines = 0;
|
||||
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {
|
||||
|
|
Loading…
Reference in New Issue