more complete support for simulated disk failures

This commit is contained in:
Evan Tschannen 2021-01-27 14:29:43 -08:00
parent 75b24da109
commit 36e4f82115
3 changed files with 28 additions and 21 deletions

View File

@ -1122,7 +1122,7 @@ public:
int nQuorum = ((desiredCoordinators+1)/2)*2-1;
KillType newKt = kt;
if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))
if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))
{
LocalityGroup primaryProcessesLeft, primaryProcessesDead;
LocalityGroup primarySatelliteProcessesLeft, primarySatelliteProcessesDead;
@ -1262,6 +1262,8 @@ public:
void killProcess_internal( ProcessInfo* machine, KillType kt ) {
TEST( true ); // Simulated machine was killed with any kill type
TEST( kt == KillInstantly ); // Simulated machine was killed instantly
TEST( kt == InjectFaults ); // Simulated machine was killed with faults
TEST( kt == FailDisk ); // Simulated machine was killed with a failed disk
if (kt == KillInstantly) {
TraceEvent(SevWarn, "FailMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
@ -1269,18 +1271,14 @@ public:
latestEventCache.clear();
machine->failed = true;
} else if (kt == InjectFaults) {
if(deterministicRandom()->random01() < 0.3) {
TraceEvent(SevWarn, "FailDiskMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
TEST(true); // Simulated machine was killed with a failed disk
machine->failedDisk = true;
} else {
TEST(true); // Simulated machine was killed with faults
TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
should_inject_fault = simulator_should_inject_fault;
machine->fault_injection_r = deterministicRandom()->randomUniqueID().first();
machine->fault_injection_p1 = 0.1;
machine->fault_injection_p2 = deterministicRandom()->random01();
}
TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
should_inject_fault = simulator_should_inject_fault;
machine->fault_injection_r = deterministicRandom()->randomUniqueID().first();
machine->fault_injection_p1 = 0.1;
machine->fault_injection_p2 = deterministicRandom()->random01();
} else if (kt == FailDisk) {
TraceEvent(SevWarn, "FailDiskMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
machine->failedDisk = true;
} else {
ASSERT( false );
}
@ -1371,7 +1369,7 @@ public:
}
// Check if machine can be removed, if requested
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
{
std::vector<ProcessInfo*> processesLeft, processesDead;
int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;
@ -1404,7 +1402,7 @@ public:
if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
TraceEvent("ChangedKillMachine").detail("MachineId", machineId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("Cleared", cleared).detail("ProtectedTotal", protectedAddresses.size()).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info());
}
else if ((kt == KillInstantly) || (kt == InjectFaults)) {
else if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk)) {
TraceEvent("DeadMachine").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info());
for (auto process : processesLeft) {
TraceEvent("DeadMachineSurvivors").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", process->toString());
@ -1444,7 +1442,7 @@ public:
TraceEvent("KillMachine").detail("MachineId", machineId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig);
if ( kt < RebootAndDelete ) {
if(kt == InjectFaults && machines[machineId].machineProcess != nullptr)
if((kt == InjectFaults || kt == FailDisk) && machines[machineId].machineProcess != nullptr)
killProcess_internal( machines[machineId].machineProcess, kt );
for (auto& process : machines[machineId].processes) {
TraceEvent("KillMachineProcess").detail("KillType", kt).detail("Process", process->toString()).detail("StartingClass", process->startingClass.toString()).detail("Failed", process->failed).detail("Excluded", process->excluded).detail("Cleared", process->cleared).detail("Rebooting", process->rebooting);
@ -1492,7 +1490,7 @@ public:
}
// Check if machine can be removed, if requested
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
{
std::vector<ProcessInfo*> processesLeft, processesDead;
for (auto processInfo : getAllProcesses()) {
@ -1766,7 +1764,7 @@ ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
//Simulates delays for performing operations on disk
Future<Void> waitUntilDiskReady( Reference<DiskParameters> diskParameters, int64_t size, bool sync ) {
if(g_simulator.getCurrentProcess()->machine->failedDisk) {
if(g_simulator.getCurrentProcess()->failedDisk) {
wait(Future<Void>(Never()));
}
if(g_simulator.connectionFailuresDisableDuration > 1e4)

View File

@ -38,7 +38,7 @@ public:
ISimulator() : desiredCoordinators(1), physicalDatacenters(1), processesPerMachine(0), listenersPerProcess(1), isStopped(false), lastConnectionFailure(0), connectionFailuresDisableDuration(0), speedUpSimulation(false), allSwapsDisabled(false), backupAgents(WaitForType), drAgents(WaitForType), extraDB(NULL), allowLogSetKills(true), usableRegions(1) {}
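// Order matters! KillType values are compared by their numeric order (e.g. `kt < RebootAndDelete`), so the position of each enumerator is significant.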
enum KillType { KillInstantly, InjectFaults, RebootAndDelete, RebootProcessAndDelete, Reboot, RebootProcess, None };
enum KillType { KillInstantly, InjectFaults, FailDisk, RebootAndDelete, RebootProcessAndDelete, Reboot, RebootProcess, None };
enum BackupAgentType { NoBackupAgents, WaitForType, BackupToFile, BackupToDB };

View File

@ -155,11 +155,13 @@ struct MachineAttritionWorkload : TestWorkload {
ISimulator::KillType kt = ISimulator::Reboot;
if( !self->reboot ) {
int killType = deterministicRandom()->randomInt(0,3);
int killType = deterministicRandom()->randomInt(0,4);
if( killType == 0 )
kt = ISimulator::KillInstantly;
else if( killType == 1 )
kt = ISimulator::InjectFaults;
else if( killType == 2 )
kt = ISimulator::FailDisk;
else
kt = ISimulator::RebootAndDelete;
}
@ -221,7 +223,14 @@ struct MachineAttritionWorkload : TestWorkload {
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
g_simulator.killZone( targetMachine.zoneId(), ISimulator::RebootAndDelete );
} else {
auto kt = (deterministicRandom()->random01() < 0.5 || !self->allowFaultInjection) ? ISimulator::KillInstantly : ISimulator::InjectFaults;
auto kt = ISimulator::KillInstantly;
if( self->allowFaultInjection ) {
if( randomDouble < 0.56 ) {
kt = ISimulator::InjectFaults;
} else if( randomDouble < 0.66 ) {
kt = ISimulator::FailDisk;
}
}
g_simulator.killZone( targetMachine.zoneId(), kt );
}
}