more complete support for simulated disk failures
This commit is contained in:
parent
75b24da109
commit
36e4f82115
|
@ -1122,7 +1122,7 @@ public:
|
|||
int nQuorum = ((desiredCoordinators+1)/2)*2-1;
|
||||
|
||||
KillType newKt = kt;
|
||||
if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))
|
||||
if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete))
|
||||
{
|
||||
LocalityGroup primaryProcessesLeft, primaryProcessesDead;
|
||||
LocalityGroup primarySatelliteProcessesLeft, primarySatelliteProcessesDead;
|
||||
|
@ -1262,6 +1262,8 @@ public:
|
|||
void killProcess_internal( ProcessInfo* machine, KillType kt ) {
|
||||
TEST( true ); // Simulated machine was killed with any kill type
|
||||
TEST( kt == KillInstantly ); // Simulated machine was killed instantly
|
||||
TEST( kt == InjectFaults ); // Simulated machine was killed with faults
|
||||
TEST( kt == FailDisk ); // Simulated machine was killed with a failed disk
|
||||
|
||||
if (kt == KillInstantly) {
|
||||
TraceEvent(SevWarn, "FailMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
|
||||
|
@ -1269,18 +1271,14 @@ public:
|
|||
latestEventCache.clear();
|
||||
machine->failed = true;
|
||||
} else if (kt == InjectFaults) {
|
||||
if(deterministicRandom()->random01() < 0.3) {
|
||||
TraceEvent(SevWarn, "FailDiskMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
|
||||
TEST(true); // Simulated machine was killed with a failed disk
|
||||
machine->failedDisk = true;
|
||||
} else {
|
||||
TEST(true); // Simulated machine was killed with faults
|
||||
TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
|
||||
should_inject_fault = simulator_should_inject_fault;
|
||||
machine->fault_injection_r = deterministicRandom()->randomUniqueID().first();
|
||||
machine->fault_injection_p1 = 0.1;
|
||||
machine->fault_injection_p2 = deterministicRandom()->random01();
|
||||
}
|
||||
TraceEvent(SevWarn, "FaultMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
|
||||
should_inject_fault = simulator_should_inject_fault;
|
||||
machine->fault_injection_r = deterministicRandom()->randomUniqueID().first();
|
||||
machine->fault_injection_p1 = 0.1;
|
||||
machine->fault_injection_p2 = deterministicRandom()->random01();
|
||||
} else if (kt == FailDisk) {
|
||||
TraceEvent(SevWarn, "FailDiskMachine").detail("Name", machine->name).detail("Address", machine->address).detail("ZoneId", machine->locality.zoneId()).detail("Process", machine->toString()).detail("Rebooting", machine->rebooting).detail("Protected", protectedAddresses.count(machine->address)).backtrace();
|
||||
machine->failedDisk = true;
|
||||
} else {
|
||||
ASSERT( false );
|
||||
}
|
||||
|
@ -1371,7 +1369,7 @@ public:
|
|||
}
|
||||
|
||||
// Check if machine can be removed, if requested
|
||||
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
|
||||
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
|
||||
{
|
||||
std::vector<ProcessInfo*> processesLeft, processesDead;
|
||||
int protectedWorker = 0, unavailable = 0, excluded = 0, cleared = 0;
|
||||
|
@ -1404,7 +1402,7 @@ public:
|
|||
if (!canKillProcesses(processesLeft, processesDead, kt, &kt)) {
|
||||
TraceEvent("ChangedKillMachine").detail("MachineId", machineId).detail("KillType", kt).detail("OrigKillType", ktOrig).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("Protected", protectedWorker).detail("Unavailable", unavailable).detail("Excluded", excluded).detail("Cleared", cleared).detail("ProtectedTotal", protectedAddresses.size()).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info());
|
||||
}
|
||||
else if ((kt == KillInstantly) || (kt == InjectFaults)) {
|
||||
else if ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk)) {
|
||||
TraceEvent("DeadMachine").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("TotalProcesses", machines.size()).detail("ProcessesPerMachine", processesPerMachine).detail("TLogPolicy", tLogPolicy->info()).detail("StoragePolicy", storagePolicy->info());
|
||||
for (auto process : processesLeft) {
|
||||
TraceEvent("DeadMachineSurvivors").detail("MachineId", machineId).detail("KillType", kt).detail("ProcessesLeft", processesLeft.size()).detail("ProcessesDead", processesDead.size()).detail("SurvivingProcess", process->toString());
|
||||
|
@ -1444,7 +1442,7 @@ public:
|
|||
|
||||
TraceEvent("KillMachine").detail("MachineId", machineId).detail("Kt", kt).detail("KtOrig", ktOrig).detail("KillableMachines", processesOnMachine).detail("ProcessPerMachine", processesPerMachine).detail("KillChanged", kt!=ktOrig);
|
||||
if ( kt < RebootAndDelete ) {
|
||||
if(kt == InjectFaults && machines[machineId].machineProcess != nullptr)
|
||||
if((kt == InjectFaults || kt == FailDisk) && machines[machineId].machineProcess != nullptr)
|
||||
killProcess_internal( machines[machineId].machineProcess, kt );
|
||||
for (auto& process : machines[machineId].processes) {
|
||||
TraceEvent("KillMachineProcess").detail("KillType", kt).detail("Process", process->toString()).detail("StartingClass", process->startingClass.toString()).detail("Failed", process->failed).detail("Excluded", process->excluded).detail("Cleared", process->cleared).detail("Rebooting", process->rebooting);
|
||||
|
@ -1492,7 +1490,7 @@ public:
|
|||
}
|
||||
|
||||
// Check if machine can be removed, if requested
|
||||
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
|
||||
if (!forceKill && ((kt == KillInstantly) || (kt == InjectFaults) || (kt == FailDisk) || (kt == RebootAndDelete) || (kt == RebootProcessAndDelete)))
|
||||
{
|
||||
std::vector<ProcessInfo*> processesLeft, processesDead;
|
||||
for (auto processInfo : getAllProcesses()) {
|
||||
|
@ -1766,7 +1764,7 @@ ACTOR void doReboot( ISimulator::ProcessInfo *p, ISimulator::KillType kt ) {
|
|||
|
||||
//Simulates delays for performing operations on disk
|
||||
Future<Void> waitUntilDiskReady( Reference<DiskParameters> diskParameters, int64_t size, bool sync ) {
|
||||
if(g_simulator.getCurrentProcess()->machine->failedDisk) {
|
||||
if(g_simulator.getCurrentProcess()->failedDisk) {
|
||||
wait(Future<Void>(Never()));
|
||||
}
|
||||
if(g_simulator.connectionFailuresDisableDuration > 1e4)
|
||||
|
|
|
@ -38,7 +38,7 @@ public:
|
|||
// Default constructor: every setting starts from a fixed baseline value
// (one desired coordinator, one physical datacenter, one listener per process,
// one usable region, no extra database, log-set kills allowed, and both
// agent kinds set to WaitForType).
ISimulator()
  : desiredCoordinators(1),
    physicalDatacenters(1),
    processesPerMachine(0),
    listenersPerProcess(1),
    isStopped(false),
    lastConnectionFailure(0),
    connectionFailuresDisableDuration(0),
    speedUpSimulation(false),
    allSwapsDisabled(false),
    backupAgents(WaitForType),
    drAgents(WaitForType),
    extraDB(NULL),
    allowLogSetKills(true),
    usableRegions(1) {}
|
||||
|
||||
// Order matters!
|
||||
enum KillType { KillInstantly, InjectFaults, RebootAndDelete, RebootProcessAndDelete, Reboot, RebootProcess, None };
|
||||
// Kinds of simulated failure. The enumerator order is significant: callers
// compare values by range (e.g. `kt < RebootAndDelete`), so the entries that
// kill a process outright (KillInstantly, InjectFaults, FailDisk) must stay
// ahead of the reboot-style entries.
enum KillType {
	KillInstantly,
	InjectFaults,
	FailDisk,
	RebootAndDelete,
	RebootProcessAndDelete,
	Reboot,
	RebootProcess,
	None
};
|
||||
|
||||
// Mode selector for backup-style agents; the ISimulator constructor starts
// both backupAgents and drAgents at WaitForType.
enum BackupAgentType {
	NoBackupAgents,
	WaitForType,
	BackupToFile,
	BackupToDB
};
|
||||
|
||||
|
|
|
@ -155,11 +155,13 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
|
||||
ISimulator::KillType kt = ISimulator::Reboot;
|
||||
if( !self->reboot ) {
|
||||
int killType = deterministicRandom()->randomInt(0,3);
|
||||
int killType = deterministicRandom()->randomInt(0,4);
|
||||
if( killType == 0 )
|
||||
kt = ISimulator::KillInstantly;
|
||||
else if( killType == 1 )
|
||||
kt = ISimulator::InjectFaults;
|
||||
else if( killType == 2 )
|
||||
kt = ISimulator::FailDisk;
|
||||
else
|
||||
kt = ISimulator::RebootAndDelete;
|
||||
}
|
||||
|
@ -221,7 +223,14 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
|
||||
g_simulator.killZone( targetMachine.zoneId(), ISimulator::RebootAndDelete );
|
||||
} else {
|
||||
auto kt = (deterministicRandom()->random01() < 0.5 || !self->allowFaultInjection) ? ISimulator::KillInstantly : ISimulator::InjectFaults;
|
||||
auto kt = ISimulator::KillInstantly;
|
||||
if( self->allowFaultInjection ) {
|
||||
if( randomDouble < 0.56 ) {
|
||||
kt = ISimulator::InjectFaults;
|
||||
} else if( randomDouble < 0.66 ) {
|
||||
kt = ISimulator::FailDisk;
|
||||
}
|
||||
}
|
||||
g_simulator.killZone( targetMachine.zoneId(), kt );
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue