Add simulation support for changing the cluster file

This commit is contained in:
Lukas Joswiak 2022-10-13 14:58:53 -07:00
parent 6e0835f8a8
commit a72066be33
4 changed files with 69 additions and 16 deletions

View File

@ -54,6 +54,7 @@ public:
FailDisk, FailDisk,
RebootAndDelete, RebootAndDelete,
RebootProcessAndDelete, RebootProcessAndDelete,
RebootProcessAndSwitch,
Reboot, Reboot,
RebootProcess, RebootProcess,
None None
@ -304,6 +305,7 @@ public:
KillType kt, KillType kt,
bool forceKill = false, bool forceKill = false,
KillType* ktFinal = nullptr) = 0; KillType* ktFinal = nullptr) = 0;
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0; // virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses, virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses, std::vector<ProcessInfo*> const& deadProcesses,
@ -390,6 +392,9 @@ public:
return clearedAddresses.find(address) != clearedAddresses.end(); return clearedAddresses.find(address) != clearedAddresses.end();
} }
// Flips the per-address "switched cluster" flag. An address with no entry yet is
// default-inserted as false by operator[] and therefore becomes true on its first toggle.
void switchCluster(NetworkAddress const& address) {
	bool& switched = switchedCluster[address];
	switched = !switched;
}
// Reports whether the flag toggled by switchCluster() is currently set for `address`.
// An address that was never passed to switchCluster() has no entry in switchedCluster and is
// reported as not switched; std::map::at() would instead throw std::out_of_range for it, which
// disagrees with switchCluster() treating a missing entry as false.
bool hasSwitchedCluster(NetworkAddress const& address) const {
	auto it = switchedCluster.find(address);
	return it != switchedCluster.end() && it->second;
}
void excludeAddress(NetworkAddress const& address) { void excludeAddress(NetworkAddress const& address) {
excludedAddresses[address]++; excludedAddresses[address]++;
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]); TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
@ -540,6 +545,7 @@ private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled; std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses; std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses; std::map<NetworkAddress, int> clearedAddresses;
std::map<NetworkAddress, bool> switchedCluster;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses; std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap; std::map<std::string, double> disabledMap;
bool allSwapsDisabled; bool allSwapsDisabled;

View File

@ -1794,6 +1794,15 @@ public:
} }
return result; return result;
} }
// Calls killMachine() on every entry in `machines`, forwarding kt, forceKill and ktFinal unchanged.
// Returns true if at least one killMachine() call returned true. Every machine is attempted,
// regardless of earlier results.
// NOTE(review): the same ktFinal pointer is handed to each call, so it presumably ends up holding
// whatever the last writing call stored - confirm against killMachine().
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
	bool anyKilled = false;
	for (auto& entry : machines) {
		// Evaluate the call first and unconditionally so no machine is skipped once one kill succeeds.
		const bool killed = killMachine(entry.second.machineId, kt, forceKill, ktFinal);
		anyKilled = anyKilled || killed;
	}
	return anyKilled;
}
bool killMachine(Optional<Standalone<StringRef>> machineId, bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt, KillType kt,
bool forceKill, bool forceKill,
@ -2008,7 +2017,7 @@ public:
if (process->startingClass != ProcessClass::TesterClass) if (process->startingClass != ProcessClass::TesterClass)
killProcess_internal(process, kt); killProcess_internal(process, kt);
} }
} else if (kt == Reboot || kt == RebootAndDelete) { } else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
for (auto& process : machines[machineId].processes) { for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess") TraceEvent("KillMachineProcess")
.detail("KillType", kt) .detail("KillType", kt)
@ -2564,7 +2573,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
try { try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete); kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
CODE_PROBE(kt == ISimulator::RebootProcess, CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted", "Simulated process rebooted",
@ -2580,6 +2589,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
"Simulated process rebooted with data and coordination state deletion", "Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly, probe::assert::simOnly,
probe::context::sim2); probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
"Simulated process rebooted with different cluster file",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) { if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed") TraceEvent(SevDebug, "DoRebootFailed")
@ -2608,6 +2621,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) { if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true; p->cleared = true;
g_simulator->clearAddress(p->address); g_simulator->clearAddress(p->address);
} else if (kt == ISimulator::RebootProcessAndSwitch) {
g_simulator->switchCluster(p->address);
} }
p->shutdownSignal.send(kt); p->shutdownSignal.send(kt);
} catch (Error& e) { } catch (Error& e) {

View File

@ -620,6 +620,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
std::string* coordFolder, std::string* coordFolder,
std::string baseFolder, std::string baseFolder,
ClusterConnectionString connStr, ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
bool useSeedFile, bool useSeedFile,
AgentMode runBackupAgents, AgentMode runBackupAgents,
std::string whitelistBinPaths, std::string whitelistBinPaths,
@ -830,6 +831,23 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
connRecord = connRecord =
makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString()); makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString());
} }
} else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) {
TraceEvent("SimulatedFDBDRebootAndSwitch")
.detail("Cycles", cycles)
.detail("RandomId", randomId)
.detail("Address", process->address)
.detail("ZoneId", localities.zoneId())
.detail("KillType", shutdownResult)
.detail("ConnectionString", connStr.toString())
.detail("OtherConnectionString", otherConnStr.toString())
.detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address));
// Handle the case where otherConnStr is '@'.
if (otherConnStr.toString().size() > 1) {
std::string newConnStr =
g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString();
connRecord = makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), newConnStr);
}
} else { } else {
TraceEvent("SimulatedFDBDJustRepeat") TraceEvent("SimulatedFDBDJustRepeat")
.detail("Cycles", cycles) .detail("Cycles", cycles)
@ -846,6 +864,7 @@ std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>>
// process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per // process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per
// process // process
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr, ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
std::vector<IPAddress> ips, std::vector<IPAddress> ips,
bool sslEnabled, bool sslEnabled,
LocalityData localities, LocalityData localities,
@ -924,6 +943,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i], &coordFolders[i],
baseFolder, baseFolder,
connStr, connStr,
otherConnStr,
useSeedFile, useSeedFile,
agentMode, agentMode,
whitelistBinPaths, whitelistBinPaths,
@ -942,6 +962,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i], &coordFolders[i],
baseFolder, baseFolder,
connStr, connStr,
otherConnStr,
useSeedFile, useSeedFile,
agentMode, agentMode,
whitelistBinPaths, whitelistBinPaths,
@ -1311,6 +1332,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
// SOMEDAY: parse backup agent from test file // SOMEDAY: parse backup agent from test file
systemActors->push_back(reportErrors( systemActors->push_back(reportErrors(
simulatedMachine(conn, simulatedMachine(conn,
ClusterConnectionString(),
ipAddrs, ipAddrs,
usingSSL, usingSSL,
localities, localities,
@ -2346,20 +2368,23 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
// check the sslEnablementMap using only one ip // check the sslEnablementMap using only one ip
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID); LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
localities.set("data_hall"_sr, dcUID); localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(conn, systemActors->push_back(reportErrors(
ips, simulatedMachine(conn,
sslEnabled, requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0))
localities, : ClusterConnectionString(),
processClass, ips,
baseFolder, sslEnabled,
false, localities,
machine == useSeedForMachine, processClass,
requiresExtraDBMachines ? AgentOnly : AgentAddition, baseFolder,
sslOnly, false,
whitelistBinPaths, machine == useSeedForMachine,
protocolVersion, requiresExtraDBMachines ? AgentOnly : AgentAddition,
configDBType), sslOnly,
"SimulatedMachine")); whitelistBinPaths,
protocolVersion,
configDBType),
"SimulatedMachine"));
if (requiresExtraDBMachines) { if (requiresExtraDBMachines) {
int cluster = 4; int cluster = 4;
@ -2376,6 +2401,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID); LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID);
localities.set("data_hall"_sr, dcUID); localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase),
conn,
extraIps, extraIps,
sslEnabled, sslEnabled,
localities, localities,
@ -2422,6 +2448,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>()); Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>());
systemActors->push_back( systemActors->push_back(
reportErrors(simulatedMachine(conn, reportErrors(simulatedMachine(conn,
ClusterConnectionString(),
ips, ips,
sslEnabled, sslEnabled,
localities, localities,

View File

@ -357,6 +357,11 @@ struct MachineAttritionWorkload : FailureInjectionWorkload {
TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt); TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt);
g_simulator->killDataHall(target, kt); g_simulator->killDataHall(target, kt);
} else if (!g_simulator->extraDatabases.empty() && deterministicRandom()->random01() < 0.1) {
state ISimulator::KillType kt = ISimulator::RebootProcessAndSwitch;
g_simulator->killAll(kt, true);
wait(delay(self->testDuration / 2));
g_simulator->killAll(kt, true);
} else { } else {
state int killedMachines = 0; state int killedMachines = 0;
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) { while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {