diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 3eeb405785..c15181c90f 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -54,6 +54,7 @@ public: FailDisk, RebootAndDelete, RebootProcessAndDelete, + RebootProcessAndSwitch, Reboot, RebootProcess, None @@ -304,6 +305,7 @@ public: KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0; + virtual bool killAll(KillType kt, bool forceKill = false, KillType* ktFinal = nullptr) = 0; // virtual KillType getMachineKillState( UID zoneID ) = 0; virtual bool canKillProcesses(std::vector const& availableProcesses, std::vector const& deadProcesses, @@ -390,6 +392,9 @@ public: return clearedAddresses.find(address) != clearedAddresses.end(); } + void switchCluster(NetworkAddress const& address) { switchedCluster[address] = !switchedCluster[address]; } + bool hasSwitchedCluster(NetworkAddress const& address) const { return switchedCluster.at(address); } + void excludeAddress(NetworkAddress const& address) { excludedAddresses[address]++; TraceEvent("ExcludeAddress").detail("Address", address).detail("Value", excludedAddresses[address]); @@ -540,6 +545,7 @@ private: std::set>> swapsDisabled; std::map excludedAddresses; std::map clearedAddresses; + std::map switchedCluster; std::map> roleAddresses; std::map disabledMap; bool allSwapsDisabled; diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index a73674b10e..59727e3161 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -1794,6 +1794,15 @@ public: } return result; } + bool killAll(KillType kt, bool forceKill, KillType* ktFinal) override { + bool result = false; + for (auto& machine : machines) { + if (killMachine(machine.second.machineId, kt, forceKill, ktFinal)) { + result = true; + } + } + return result; + } bool killMachine(Optional> machineId, KillType kt, bool forceKill, @@ -2008,7 +2017,7 @@ public: if (process->startingClass != ProcessClass::TesterClass) killProcess_internal(process, kt); } - } else if (kt == Reboot || kt == RebootAndDelete) { + } else if (kt == Reboot || kt == RebootAndDelete || kt == RebootProcessAndSwitch) { for (auto& process : machines[machineId].processes) { TraceEvent("KillMachineProcess") .detail("KillType", kt) @@ -2564,7 +2573,7 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) { try { ASSERT(kt == ISimulator::RebootProcess || kt == ISimulator::Reboot || kt == ISimulator::RebootAndDelete || - kt == ISimulator::RebootProcessAndDelete); + kt == ISimulator::RebootProcessAndDelete || kt == ISimulator::RebootProcessAndSwitch); CODE_PROBE(kt == ISimulator::RebootProcess, "Simulated process rebooted", @@ -2580,6 +2589,10 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) { "Simulated process rebooted with data and coordination state deletion", probe::assert::simOnly, probe::context::sim2); + CODE_PROBE(kt == ISimulator::RebootProcessAndSwitch, + "Simulated process rebooted with different cluster file", + probe::assert::simOnly, + probe::context::sim2); if (p->rebooting || !p->isReliable()) { TraceEvent(SevDebug, "DoRebootFailed") @@ -2608,6 +2621,8 @@ ACTOR void doReboot(ISimulator::ProcessInfo* p, ISimulator::KillType kt) { if ((kt == ISimulator::RebootAndDelete) || (kt == ISimulator::RebootProcessAndDelete)) { p->cleared = true; g_simulator->clearAddress(p->address); + } else if (kt == ISimulator::RebootProcessAndSwitch) { + g_simulator->switchCluster(p->address); } p->shutdownSignal.send(kt); } catch (Error& e) { diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 783478b7b6..c64bf610df 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -620,6 +620,7 @@ ACTOR Future simulatedFDBDRebooter(Reference simulatedFDBDRebooter(Reference(joinPath(*dataFolder, "fdb.cluster"), connStr.toString()); } + } else if (onShutdown.get() == ISimulator::RebootProcessAndSwitch) { + TraceEvent("SimulatedFDBDRebootAndSwitch") + .detail("Cycles", cycles) + .detail("RandomId", randomId) + .detail("Address", process->address) + .detail("ZoneId", localities.zoneId()) + .detail("KillType", shutdownResult) + .detail("ConnectionString", connStr.toString()) + .detail("OtherConnectionString", otherConnStr.toString()) + .detail("SwitchingTo", g_simulator->hasSwitchedCluster(process->address)); + + // Handle the case where otherConnStr is '@'. + if (otherConnStr.toString().size() > 1) { + std::string newConnStr = + g_simulator->hasSwitchedCluster(process->address) ? otherConnStr.toString() : connStr.toString(); + connRecord = makeReference(joinPath(*dataFolder, "fdb.cluster"), newConnStr); + } } else { TraceEvent("SimulatedFDBDJustRepeat") .detail("Cycles", cycles) @@ -846,6 +864,7 @@ std::map>, std::vector>> // process count is no longer needed because it is now the length of the vector of ip's, because it was one ip per // process ACTOR Future simulatedMachine(ClusterConnectionString connStr, + ClusterConnectionString otherConnStr, std::vector ips, bool sslEnabled, LocalityData localities, @@ -924,6 +943,7 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, &coordFolders[i], baseFolder, connStr, + otherConnStr, useSeedFile, agentMode, whitelistBinPaths, @@ -942,6 +962,7 @@ ACTOR Future simulatedMachine(ClusterConnectionString connStr, &coordFolders[i], baseFolder, connStr, + otherConnStr, useSeedFile, agentMode, whitelistBinPaths, @@ -1311,6 +1332,7 @@ ACTOR Future restartSimulatedSystem(std::vector>* systemActor // SOMEDAY: parse backup agent from test file systemActors->push_back(reportErrors( simulatedMachine(conn, + ClusterConnectionString(), ipAddrs, usingSSL, localities, @@ -2346,20 +2368,23 @@ void setupSimulatedSystem(std::vector>* systemActors, // check the sslEnablementMap using only one ip LocalityData localities(Optional>(), zoneId, machineId, dcUID); localities.set("data_hall"_sr, dcUID); - systemActors->push_back(reportErrors(simulatedMachine(conn, - ips, - sslEnabled, - localities, - processClass, - baseFolder, - false, - machine == useSeedForMachine, - requiresExtraDBMachines ? AgentOnly : AgentAddition, - sslOnly, - whitelistBinPaths, - protocolVersion, - configDBType), - "SimulatedMachine")); + systemActors->push_back(reportErrors( + simulatedMachine(conn, + requiresExtraDBMachines ? ClusterConnectionString(g_simulator->extraDatabases.at(0)) + : ClusterConnectionString(), + ips, + sslEnabled, + localities, + processClass, + baseFolder, + false, + machine == useSeedForMachine, + requiresExtraDBMachines ? AgentOnly : AgentAddition, + sslOnly, + whitelistBinPaths, + protocolVersion, + configDBType), + "SimulatedMachine")); if (requiresExtraDBMachines) { int cluster = 4; @@ -2376,6 +2401,7 @@ void setupSimulatedSystem(std::vector>* systemActors, LocalityData localities(Optional>(), newZoneId, newMachineId, dcUID); localities.set("data_hall"_sr, dcUID); systemActors->push_back(reportErrors(simulatedMachine(ClusterConnectionString(extraDatabase), + conn, extraIps, sslEnabled, localities, @@ -2422,6 +2448,7 @@ void setupSimulatedSystem(std::vector>* systemActors, Optional>(), newZoneId, newZoneId, Optional>()); systemActors->push_back( reportErrors(simulatedMachine(conn, + ClusterConnectionString(), ips, sslEnabled, localities, diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 9d1dff8348..e46619cd55 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -357,6 +357,11 @@ struct MachineAttritionWorkload : FailureInjectionWorkload { TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt); g_simulator->killDataHall(target, kt); + } else if (!g_simulator->extraDatabases.empty() && deterministicRandom()->random01() < 0.1) { + state ISimulator::KillType kt = ISimulator::RebootProcessAndSwitch; + g_simulator->killAll(kt, true); + wait(delay(self->testDuration / 2)); + g_simulator->killAll(kt, true); } else { state int killedMachines = 0; while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {