Add simulation support for changing the cluster file

This commit is contained in:
Lukas Joswiak 2022-10-13 14:58:53 -07:00
parent 6e0835f8a8
commit a72066be33
4 changed files with 69 additions and 16 deletions

View File

@ -54,6 +54,7 @@ public:
FailDisk,
RebootAndDelete,
RebootProcessAndDelete,
RebootProcessAndSwitch,
Reboot,
RebootProcess,
None
@ -304,6 +305,7 @@ public:
KillType kt,
bool forceKill = false,
KillType* ktFinal = nullptr) = 0;
// Applies the kill/reboot action `kt` across the simulated cluster. `forceKill` and `ktFinal` are forwarded
// to the per-machine kill logic. NOTE(review): the visible implementation calls killMachine() for every
// known machine and returns true if any of those calls returned true — confirm that other implementations
// follow the same contract.
virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0;
// virtual KillType getMachineKillState( UID zoneID ) = 0;
virtual bool canKillProcesses(std::vector<ProcessInfo*> const& availableProcesses,
std::vector<ProcessInfo*> const& deadProcesses,
@ -390,6 +392,9 @@ public:
return clearedAddresses.find(address) != clearedAddresses.end();
}
// Flips the "switched cluster" flag recorded for `address`. An address seen for the first time starts
// from a default-inserted `false`, so its first call leaves the flag set to `true`.
void switchCluster(NetworkAddress const& address) {
	bool& switched = switchedCluster[address];
	switched = !switched;
}
// Returns the current "switched cluster" flag for `address`, as toggled by switchCluster().
// An address that has never been passed to switchCluster() has no entry in the map; it is reported as
// not switched (false) rather than letting std::map::at() throw std::out_of_range.
bool hasSwitchedCluster(NetworkAddress const& address) const {
	auto it = switchedCluster.find(address);
	return it != switchedCluster.end() && it->second;
}
void excludeAddress(NetworkAddress const& address) {
excludedAddresses[address]++;
TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]);
@ -540,6 +545,7 @@ private:
std::set<Optional<Standalone<StringRef>>> swapsDisabled;
std::map<NetworkAddress, int> excludedAddresses;
std::map<NetworkAddress, int> clearedAddresses;
// Per-address flag toggled by switchCluster() and read by hasSwitchedCluster(); an address with no entry
// has never been switched.
std::map<NetworkAddress, bool> switchedCluster;
std::map<NetworkAddress, std::map<std::string, int>> roleAddresses;
std::map<std::string, double> disabledMap;
bool allSwapsDisabled;

View File

@ -1794,6 +1794,15 @@ public:
}
return result;
}
// Invokes killMachine() with the given arguments on every entry in `machines`.
// Every machine is attempted, even after an earlier call has succeeded. `ktFinal` is passed through to
// each call unchanged. Returns true if at least one killMachine() call returned true.
bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override {
	bool anyKilled = false;
	for (auto it = machines.begin(); it != machines.end(); ++it) {
		// Evaluate the kill first so it is never short-circuited away.
		const bool killed = killMachine(it->second.machineId, kt, forceKill, ktFinal);
		anyKilled = anyKilled || killed;
	}
	return anyKilled;
}
bool killMachine(Optional<Standalone<StringRef>> machineId,
KillType kt,
bool forceKill,
@ -2008,7 +2017,7 @@ public:
if (process->startingClass != ProcessClass::TesterClass)
killProcess_internal(process, kt);
}
} else if (kt == Reboot || kt == RebootAndDelete) {
} else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) {
for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess")
.detail("KillType", kt)
@ -2564,7 +2573,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
try {
ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete ||
kt == ISimulator::RebootProcessAndDelete);
kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch);
CODE_PROBE(kt == ISimulator::RebootProcess,
"Simulated process rebooted",
@ -2580,6 +2589,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
"Simulated process rebooted with data and coordination state deletion",
probe::assert::simOnly,
probe::context::sim2);
CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch,
"Simulated process rebooted with different cluster file",
probe::assert::simOnly,
probe::context::sim2);
if (p->rebooting || !p->isReliable()) {
TraceEvent(SevDebug, "DoRebootFailed")
@ -2608,6 +2621,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) {
if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) {
p->cleared = true;
g_simulator->clearAddress(p->address);
} else if (kt == ISimulator::RebootProcessAndSwitch) {
g_simulator->switchCluster(p->address);
}
p->shutdownSignal.send(kt);
} catch (Error& e) {

View File

@ -620,6 +620,7 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
std::string* coordFolder,
std::string baseFolder,
ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
bool useSeedFile,
AgentMode runBackupAgents,
std::string whitelistBinPaths,
@ -830,6 +831,23 @@ ACTOR Future<ISimulator::KillType> simulatedFDBDRebooter(Reference<IClusterConne
connRecord =
makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), connStr.toString());
}
} else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) {
TraceEvent("SimulatedFDBDRebootAndSwitch")
.detail("Cycles", cycles)
.detail("RandomId", randomId)
.detail("Address", process->address)
.detail("ZoneId", localities.zoneId())
.detail("KillType", shutdownResult)
.detail("ConnectionString", connStr.toString())
.detail("OtherConnectionString", otherConnStr.toString())
.detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address));
// Handle the case where otherConnStr is '@'.
if (otherConnStr.toString().size() > 1) {
std::string newConnStr =
g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString();
connRecord = makeReference<ClusterConnectionFile>(joinPath(*dataFolder, "fdb.cluster"), newConnStr);
}
} else {
TraceEvent("SimulatedFDBDJustRepeat")
.detail("Cycles", cycles)
@ -846,6 +864,7 @@ std::map<Optional<Standalone<StringRef>>, std::vector<std::vector<std::string>>>
// The process count is no longer needed: it is now the length of the vector of IPs, since there is one IP per
// process.
ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
ClusterConnectionString otherConnStr,
std::vector<IPAddress> ips,
bool sslEnabled,
LocalityData localities,
@ -924,6 +943,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
@ -942,6 +962,7 @@ ACTOR Future<Void> simulatedMachine(ClusterConnectionString connStr,
&coordFolders[i],
baseFolder,
connStr,
otherConnStr,
useSeedFile,
agentMode,
whitelistBinPaths,
@ -1311,6 +1332,7 @@ ACTOR Future<Void> restartSimulatedSystem(std::vector<Future<Void>>* systemActor
// SOMEDAY: parse backup agent from test file
systemActors->push_back(reportErrors(
simulatedMachine(conn,
ClusterConnectionString(),
ipAddrs,
usingSSL,
localities,
@ -2346,7 +2368,10 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
// check the sslEnablementMap using only one ip
LocalityData localities(Optional<Standalone<StringRef>>(), zoneId, machineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(conn,
systemActors->push_back(reportErrors(
simulatedMachine(conn,
requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0))
: ClusterConnectionString(),
ips,
sslEnabled,
localities,
@ -2376,6 +2401,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
LocalityData localities(Optional<Standalone<StringRef>>(), newZoneId, newMachineId, dcUID);
localities.set("data_hall"_sr, dcUID);
systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase),
conn,
extraIps,
sslEnabled,
localities,
@ -2422,6 +2448,7 @@ void setupSimulatedSystem(std::vector<Future<Void>>* systemActors,
Optional<Standalone<StringRef>>(), newZoneId, newZoneId, Optional<Standalone<StringRef>>());
systemActors->push_back(
reportErrors(simulatedMachine(conn,
ClusterConnectionString(),
ips,
sslEnabled,
localities,

View File

@ -357,6 +357,11 @@ struct MachineAttritionWorkload : FailureInjectionWorkload {
TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt);
g_simulator->killDataHall(target, kt);
} else if (!g_simulator->extraDatabases.empty() && deterministicRandom()->random01() < 0.1) {
state ISimulator::KillType kt = ISimulator::RebootProcessAndSwitch;
g_simulator->killAll(kt, true);
wait(delay(self->testDuration / 2));
g_simulator->killAll(kt, true);
} else {
state int killedMachines = 0;
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {