From 103cc37a35e569b9b35591e65c052fe2ae490f05 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 23 Oct 2019 14:19:17 -0700 Subject: [PATCH] added datahall kill and option to target a specific datahall/dc/machine id --- fdbserver/worker.actor.cpp | 4 ++- .../workloads/MachineAttrition.actor.cpp | 27 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 70ca357b2c..fcc05bed66 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -964,8 +964,10 @@ ACTOR Future workerServer( when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) { state RebootRequest rebootReq = req; + // If suspendDuration is INT_MAX, the trace will not be logged if it was inside the next block + // Also a useful trace to have even if suspendDuration is 0 + TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); if(req.waitForDuration) { - TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); flushTraceFileVoid(); setProfilingEnabled(0); g_network->stop(); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index fd7c5cdcfb..fdb9ac2ab0 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -65,7 +65,9 @@ struct MachineAttritionWorkload : TestWorkload { bool reboot; bool killDc; bool killMachine; + bool killDatahall; bool killSelf; + std::string targetId; bool replacement; bool waitForVersion; bool allowFaultInjection; @@ -85,7 +87,9 @@ struct MachineAttritionWorkload : TestWorkload { reboot = getOption( options, LiteralStringRef("reboot"), false ); killDc = getOption( options, LiteralStringRef("killDc"), deterministicRandom()->random01() < 0.25 ); killMachine = getOption( options, LiteralStringRef("killMachine"), false); + killDatahall = getOption( options, LiteralStringRef("killDatahall"), false); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); + targetId = getOption( options, LiteralStringRef("targetId"), ""); replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 ); waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false ); allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true ); @@ -172,11 +176,12 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a dcId to kill deterministicRandom()->randomShuffle(workers); - Optional> killDcId = workers.back().interf.locality.dcId(); - TraceEvent("Assassination").detail("TargetDataCenter", killDcId); + Optional> killDcId = self->targetId.empty() ? workers.back().interf.locality.dcId() : self->targetId; + TraceEvent("Assassination").detail("TargetDataCenterId", killDcId); for (const auto& worker : workers) { // kill all matching dcId workers if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); worker.interf.clientInterface.reboot.send(rbReq); } } @@ -184,11 +189,25 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a machine to kill deterministicRandom()->randomShuffle(workers); - Optional> killMachineId = workers.back().interf.locality.machineId(); - TraceEvent("Assassination").detail("TargetMachine", killMachineId); + Optional> killMachineId = self->targetId.empty() ? workers.back().interf.locality.machineId() : self->targetId; + TraceEvent("Assassination").detail("TargetMachineId", killMachineId); for (const auto& worker : workers) { // kill all matching machine workers if (worker.interf.locality.machineId().present() && worker.interf.locality.machineId() == killMachineId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else if (self->killDatahall) { + wait(delay(delayBeforeKill)); + // Pick a datahall to kill + deterministicRandom()->randomShuffle(workers); + Optional> killDatahallId = self->targetId.empty() ? workers.back().interf.locality.dataHallId() : self->targetId; + TraceEvent("Assassination").detail("TargetDatahallId", killDatahallId); + for (const auto& worker : workers) { + // kill all matching datahall workers + if (worker.interf.locality.dataHallId().present() && worker.interf.locality.dataHallId() == killDatahallId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); worker.interf.clientInterface.reboot.send(rbReq); } }