2017-05-26 04:48:44 +08:00
|
|
|
/*
 * MachineAttrition.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
|
2022-03-03 02:10:37 +08:00
|
|
|
#include <algorithm>

#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "flow/FaultInjection.h"
#include "flow/actorcompiler.h" // This must be the last #include.
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
static std::set<int> const& normalAttritionErrors() {
|
|
|
|
static std::set<int> s;
|
|
|
|
if (s.empty()) {
|
2021-03-11 02:06:03 +08:00
|
|
|
s.insert(error_code_please_reboot);
|
|
|
|
s.insert(error_code_please_reboot_delete);
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-08-06 06:00:17 +08:00
|
|
|
// Sets the healthy zone to ignoreSSFailuresZoneString, waits `duration` seconds, and then clears
// healthyZoneKey again. The caller describes this as disabling DD for all storage server failures.
//
// cx:       database on which the healthy-zone key is set and later cleared.
// duration: how long (in seconds) to leave the setting in place before clearing it.
// Returns true once the clearing transaction has committed; commit errors are retried via onError().
ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
	TraceEvent("IgnoreSSFailureStart").log();
	// The duration argument passed to setHealthyZone (0) doesn't matter since this setting won't time out.
	wait(success(setHealthyZone(cx, ignoreSSFailuresZoneString, 0)));

	TraceEvent("IgnoreSSFailureWait").log();
	wait(delay(duration));

	TraceEvent("IgnoreSSFailureClear").log();
	state Transaction clearTr(cx);
	loop {
		try {
			// Options are (re)applied on every attempt, inside the retry loop.
			clearTr.setOption(FDBTransactionOptions::LOCK_AWARE);
			clearTr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			clearTr.clear(healthyZoneKey);
			wait(clearTr.commit());
			TraceEvent("IgnoreSSFailureComplete").log();
			return true;
		} catch (Error& err) {
			wait(clearTr.onError(err));
		}
	}
}
|
|
|
|
|
2017-05-26 04:48:44 +08:00
|
|
|
struct MachineAttritionWorkload : TestWorkload {
|
|
|
|
bool enabled;
|
2019-11-21 07:11:18 +08:00
|
|
|
int machinesToKill, machinesToLeave, workersToKill, workersToLeave;
|
2019-11-21 02:46:00 +08:00
|
|
|
double testDuration, suspendDuration, liveDuration;
|
2017-05-26 04:48:44 +08:00
|
|
|
bool reboot;
|
|
|
|
bool killDc;
|
2019-10-24 02:29:47 +08:00
|
|
|
bool killMachine;
|
2019-10-24 05:19:17 +08:00
|
|
|
bool killDatahall;
|
2019-11-05 07:46:45 +08:00
|
|
|
bool killProcess;
|
2019-11-09 07:05:18 +08:00
|
|
|
bool killZone;
|
2017-05-26 04:48:44 +08:00
|
|
|
bool killSelf;
|
2019-11-09 05:56:39 +08:00
|
|
|
std::vector<std::string> targetIds;
|
2017-05-26 04:48:44 +08:00
|
|
|
bool replacement;
|
2017-09-16 08:55:01 +08:00
|
|
|
bool waitForVersion;
|
|
|
|
bool allowFaultInjection;
|
2019-08-06 06:00:17 +08:00
|
|
|
Future<bool> ignoreSSFailures;
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
// This is set in setup from the list of workers when the cluster is started
|
|
|
|
std::vector<LocalityData> machines;
|
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
MachineAttritionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
2021-07-24 07:28:20 +08:00
|
|
|
// only do this on the "first" client, and only when in simulation and only when fault injection is enabled
|
|
|
|
enabled = !clientId && g_network->isSimulated() && faultInjectionActivated;
|
2021-03-11 02:06:03 +08:00
|
|
|
machinesToKill = getOption(options, LiteralStringRef("machinesToKill"), 2);
|
|
|
|
machinesToLeave = getOption(options, LiteralStringRef("machinesToLeave"), 1);
|
|
|
|
workersToKill = getOption(options, LiteralStringRef("workersToKill"), 2);
|
|
|
|
workersToLeave = getOption(options, LiteralStringRef("workersToLeave"), 1);
|
|
|
|
testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
|
|
|
|
suspendDuration = getOption(options, LiteralStringRef("suspendDuration"), 1.0);
|
|
|
|
liveDuration = getOption(options, LiteralStringRef("liveDuration"), 5.0);
|
|
|
|
reboot = getOption(options, LiteralStringRef("reboot"), false);
|
|
|
|
killDc = getOption(
|
|
|
|
options, LiteralStringRef("killDc"), g_network->isSimulated() && deterministicRandom()->random01() < 0.25);
|
2019-11-09 07:05:18 +08:00
|
|
|
killMachine = getOption(options, LiteralStringRef("killMachine"), false);
|
|
|
|
killDatahall = getOption(options, LiteralStringRef("killDatahall"), false);
|
|
|
|
killProcess = getOption(options, LiteralStringRef("killProcess"), false);
|
|
|
|
killZone = getOption(options, LiteralStringRef("killZone"), false);
|
|
|
|
killSelf = getOption(options, LiteralStringRef("killSelf"), false);
|
2019-11-09 05:56:39 +08:00
|
|
|
targetIds = getOption(options, LiteralStringRef("targetIds"), std::vector<std::string>());
|
2021-03-11 02:06:03 +08:00
|
|
|
replacement =
|
|
|
|
getOption(options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5);
|
|
|
|
waitForVersion = getOption(options, LiteralStringRef("waitForVersion"), false);
|
|
|
|
allowFaultInjection = getOption(options, LiteralStringRef("allowFaultInjection"), true);
|
2019-08-06 06:00:17 +08:00
|
|
|
ignoreSSFailures = true;
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
|
2021-09-17 08:42:34 +08:00
|
|
|
static std::vector<ISimulator::ProcessInfo*> getServers() {
|
|
|
|
std::vector<ISimulator::ProcessInfo*> machines;
|
|
|
|
std::vector<ISimulator::ProcessInfo*> all = g_simulator.getAllProcesses();
|
2021-03-11 02:06:03 +08:00
|
|
|
for (int i = 0; i < all.size(); i++)
|
|
|
|
if (!all[i]->failed && all[i]->name == std::string("Server") &&
|
|
|
|
all[i]->startingClass != ProcessClass::TesterClass)
|
|
|
|
machines.push_back(all[i]);
|
2017-05-26 04:48:44 +08:00
|
|
|
return machines;
|
|
|
|
}
|
|
|
|
|
2020-10-05 13:29:07 +08:00
|
|
|
std::string description() const override { return "MachineAttritionWorkload"; }
|
|
|
|
Future<Void> setup(Database const& cx) override { return Void(); }
|
|
|
|
Future<Void> start(Database const& cx) override {
|
2017-05-26 04:48:44 +08:00
|
|
|
if (enabled) {
|
2021-03-11 02:06:03 +08:00
|
|
|
std::map<Optional<Standalone<StringRef>>, LocalityData> machineIDMap;
|
2017-05-26 04:48:44 +08:00
|
|
|
auto processes = getServers();
|
|
|
|
for (auto it = processes.begin(); it != processes.end(); ++it) {
|
|
|
|
machineIDMap[(*it)->locality.zoneId()] = (*it)->locality;
|
|
|
|
}
|
|
|
|
machines.clear();
|
|
|
|
for (auto it = machineIDMap.begin(); it != machineIDMap.end(); ++it) {
|
|
|
|
machines.push_back(it->second);
|
|
|
|
}
|
2021-03-11 02:06:03 +08:00
|
|
|
deterministicRandom()->randomShuffle(machines);
|
2017-05-26 04:48:44 +08:00
|
|
|
double meanDelay = testDuration / machinesToKill;
|
|
|
|
TraceEvent("AttritionStarting")
|
2021-03-11 02:06:03 +08:00
|
|
|
.detail("KillDataCenters", killDc)
|
|
|
|
.detail("Reboot", reboot)
|
|
|
|
.detail("MachinesToLeave", machinesToLeave)
|
|
|
|
.detail("MachinesToKill", machinesToKill)
|
|
|
|
.detail("MeanDelay", meanDelay);
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
return timeout(
|
2021-03-11 02:06:03 +08:00
|
|
|
reportErrorsExcept(
|
|
|
|
machineKillWorker(this, meanDelay, cx), "machineKillWorkerError", UID(), &normalAttritionErrors()),
|
|
|
|
testDuration,
|
|
|
|
Void());
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
2019-10-11 02:49:07 +08:00
|
|
|
if (!clientId && !g_network->isSimulated()) {
|
2019-10-17 01:00:16 +08:00
|
|
|
return timeout(
|
2021-03-11 02:06:03 +08:00
|
|
|
reportErrorsExcept(
|
|
|
|
noSimMachineKillWorker(this, cx), "noSimMachineKillWorkerError", UID(), &normalAttritionErrors()),
|
|
|
|
testDuration,
|
|
|
|
Void());
|
2019-10-11 02:49:07 +08:00
|
|
|
}
|
2021-03-11 02:06:03 +08:00
|
|
|
if (killSelf)
|
2017-05-26 04:48:44 +08:00
|
|
|
throw please_reboot();
|
|
|
|
return Void();
|
|
|
|
}
|
2020-10-05 13:29:07 +08:00
|
|
|
Future<bool> check(Database const& cx) override { return ignoreSSFailures; }
|
2021-09-17 08:42:34 +08:00
|
|
|
void getMetrics(std::vector<PerfMetric>& m) override {}
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2019-10-24 01:37:38 +08:00
|
|
|
static bool noSimIsViableKill(WorkerDetails worker) {
|
2019-11-06 05:57:32 +08:00
|
|
|
return (worker.processClass != ProcessClass::ClassType::TesterClass);
|
2019-10-17 01:00:16 +08:00
|
|
|
}
|
|
|
|
|
2019-11-09 07:05:18 +08:00
|
|
|
template <typename Proc>
|
2021-03-11 02:06:03 +08:00
|
|
|
static void sendRebootRequests(std::vector<WorkerDetails> workers,
|
|
|
|
std::vector<std::string> targets,
|
|
|
|
RebootRequest rbReq,
|
|
|
|
Proc idAccess) {
|
2019-11-09 07:05:18 +08:00
|
|
|
for (const auto& worker : workers) {
|
|
|
|
// kill all matching workers
|
|
|
|
if (idAccess(worker).present() &&
|
|
|
|
std::count(targets.begin(), targets.end(), idAccess(worker).get().toString())) {
|
2019-11-21 07:11:18 +08:00
|
|
|
TraceEvent("SendingRebootRequest").detail("TargetWorker", worker.interf.locality.toString());
|
2019-11-09 07:05:18 +08:00
|
|
|
worker.interf.clientInterface.reboot.send(rbReq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
ACTOR static Future<Void> noSimMachineKillWorker(MachineAttritionWorkload* self, Database cx) {
|
2019-10-11 02:49:07 +08:00
|
|
|
ASSERT(!g_network->isSimulated());
|
2019-11-21 02:46:00 +08:00
|
|
|
state int killedWorkers = 0;
|
2019-10-24 01:37:38 +08:00
|
|
|
state std::vector<WorkerDetails> allWorkers =
|
2019-10-11 02:49:07 +08:00
|
|
|
wait(self->dbInfo->get().clusterInterface.getWorkers.getReply(GetWorkersRequest()));
|
|
|
|
// Can reuse reboot request to send to each interface since no reply promise needed
|
|
|
|
state RebootRequest rbReq;
|
|
|
|
if (self->reboot) {
|
|
|
|
rbReq.waitForDuration = self->suspendDuration;
|
|
|
|
} else {
|
|
|
|
rbReq.waitForDuration = std::numeric_limits<uint32_t>::max();
|
|
|
|
}
|
2019-10-24 01:37:38 +08:00
|
|
|
state std::vector<WorkerDetails> workers;
|
|
|
|
// Pre-processing step: remove all testers from list of workers
|
|
|
|
for (const auto& worker : allWorkers) {
|
|
|
|
if (noSimIsViableKill(worker)) {
|
|
|
|
workers.push_back(worker);
|
2019-10-17 01:00:16 +08:00
|
|
|
}
|
|
|
|
}
|
2019-10-25 00:45:04 +08:00
|
|
|
deterministicRandom()->randomShuffle(workers);
|
2019-11-21 02:46:00 +08:00
|
|
|
wait(delay(self->liveDuration));
|
2019-11-09 05:56:39 +08:00
|
|
|
// if a specific kill is requested, it must be accompanied by a set of target IDs otherwise no kills will occur
|
2019-10-11 02:49:07 +08:00
|
|
|
if (self->killDc) {
|
2019-11-09 05:56:39 +08:00
|
|
|
TraceEvent("Assassination").detail("TargetDataCenterIds", describe(self->targetIds));
|
2021-03-11 02:06:03 +08:00
|
|
|
sendRebootRequests(workers,
|
|
|
|
self->targetIds,
|
|
|
|
rbReq,
|
2019-11-09 07:05:18 +08:00
|
|
|
// idAccess lambda
|
|
|
|
[](WorkerDetails worker) { return worker.interf.locality.dcId(); });
|
2019-10-24 02:29:47 +08:00
|
|
|
} else if (self->killMachine) {
|
2019-11-09 07:09:09 +08:00
|
|
|
TraceEvent("Assassination").detail("TargetMachineIds", describe(self->targetIds));
|
2021-03-11 02:06:03 +08:00
|
|
|
sendRebootRequests(workers,
|
|
|
|
self->targetIds,
|
|
|
|
rbReq,
|
2019-11-09 07:05:18 +08:00
|
|
|
// idAccess lambda
|
|
|
|
[](WorkerDetails worker) { return worker.interf.locality.machineId(); });
|
2019-10-24 05:19:17 +08:00
|
|
|
} else if (self->killDatahall) {
|
2019-11-09 07:09:09 +08:00
|
|
|
TraceEvent("Assassination").detail("TargetDatahallIds", describe(self->targetIds));
|
2021-03-11 02:06:03 +08:00
|
|
|
sendRebootRequests(workers,
|
|
|
|
self->targetIds,
|
|
|
|
rbReq,
|
2019-11-09 07:05:18 +08:00
|
|
|
// idAccess lambda
|
|
|
|
[](WorkerDetails worker) { return worker.interf.locality.dataHallId(); });
|
2019-11-05 07:46:45 +08:00
|
|
|
} else if (self->killProcess) {
|
2019-11-09 07:09:09 +08:00
|
|
|
TraceEvent("Assassination").detail("TargetProcessIds", describe(self->targetIds));
|
2021-03-11 02:06:03 +08:00
|
|
|
sendRebootRequests(workers,
|
|
|
|
self->targetIds,
|
|
|
|
rbReq,
|
2019-11-09 07:05:18 +08:00
|
|
|
// idAccess lambda
|
|
|
|
[](WorkerDetails worker) { return worker.interf.locality.processId(); });
|
|
|
|
} else if (self->killZone) {
|
2019-11-09 07:09:09 +08:00
|
|
|
TraceEvent("Assassination").detail("TargetZoneIds", describe(self->targetIds));
|
2021-03-11 02:06:03 +08:00
|
|
|
sendRebootRequests(workers,
|
|
|
|
self->targetIds,
|
|
|
|
rbReq,
|
2019-11-09 07:05:18 +08:00
|
|
|
// idAccess lambda
|
|
|
|
[](WorkerDetails worker) { return worker.interf.locality.zoneId(); });
|
2019-10-11 02:49:07 +08:00
|
|
|
} else {
|
2019-11-21 07:11:18 +08:00
|
|
|
while (killedWorkers < self->workersToKill && workers.size() > self->workersToLeave) {
|
2019-10-11 02:49:07 +08:00
|
|
|
TraceEvent("WorkerKillBegin")
|
2019-11-21 02:46:00 +08:00
|
|
|
.detail("KilledWorkers", killedWorkers)
|
2019-11-21 07:11:18 +08:00
|
|
|
.detail("WorkersToKill", self->workersToKill)
|
|
|
|
.detail("WorkersToLeave", self->workersToLeave)
|
2019-11-21 02:46:00 +08:00
|
|
|
.detail("Workers", workers.size());
|
2019-10-11 02:49:07 +08:00
|
|
|
if (self->waitForVersion) {
|
|
|
|
state Transaction tr(cx);
|
|
|
|
loop {
|
|
|
|
try {
|
|
|
|
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
|
|
|
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
|
|
|
wait(success(tr.getReadVersion()));
|
|
|
|
break;
|
|
|
|
} catch (Error& e) {
|
|
|
|
wait(tr.onError(e));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-11-21 02:46:00 +08:00
|
|
|
// Pick a worker to kill
|
|
|
|
state WorkerDetails targetWorker;
|
|
|
|
targetWorker = workers.back();
|
2019-10-11 02:49:07 +08:00
|
|
|
TraceEvent("Assassination")
|
2019-11-21 02:46:00 +08:00
|
|
|
.detail("TargetWorker", targetWorker.interf.locality.toString())
|
|
|
|
.detail("ZoneId", targetWorker.interf.locality.zoneId())
|
|
|
|
.detail("KilledWorkers", killedWorkers)
|
2019-11-21 07:11:18 +08:00
|
|
|
.detail("WorkersToKill", self->workersToKill)
|
|
|
|
.detail("WorkersToLeave", self->workersToLeave)
|
2019-11-21 02:46:00 +08:00
|
|
|
.detail("Workers", workers.size());
|
|
|
|
targetWorker.interf.clientInterface.reboot.send(rbReq);
|
|
|
|
killedWorkers++;
|
2019-10-11 02:49:07 +08:00
|
|
|
workers.pop_back();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Void();
|
|
|
|
}
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
ACTOR static Future<Void> machineKillWorker(MachineAttritionWorkload* self, double meanDelay, Database cx) {
|
2017-05-26 04:48:44 +08:00
|
|
|
state int killedMachines = 0;
|
2019-05-11 05:01:52 +08:00
|
|
|
state double delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
ASSERT(g_network->isSimulated());
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
if (self->killDc) {
|
|
|
|
wait(delay(delayBeforeKill));
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
// decide on a machine to kill
|
2021-03-11 02:06:03 +08:00
|
|
|
ASSERT(self->machines.size());
|
2017-05-26 04:48:44 +08:00
|
|
|
Optional<Standalone<StringRef>> target = self->machines.back().dcId();
|
|
|
|
|
|
|
|
ISimulator::KillType kt = ISimulator::Reboot;
|
2021-03-11 02:06:03 +08:00
|
|
|
if (!self->reboot) {
|
|
|
|
int killType = deterministicRandom()->randomInt(0, 3); // FIXME: enable disk stalls
|
|
|
|
if (killType == 0)
|
2017-05-26 04:48:44 +08:00
|
|
|
kt = ISimulator::KillInstantly;
|
2021-03-11 02:06:03 +08:00
|
|
|
else if (killType == 1)
|
2017-05-26 04:48:44 +08:00
|
|
|
kt = ISimulator::InjectFaults;
|
2021-03-11 02:06:03 +08:00
|
|
|
else if (killType == 2)
|
2017-05-26 04:48:44 +08:00
|
|
|
kt = ISimulator::RebootAndDelete;
|
2021-02-03 09:33:47 +08:00
|
|
|
else
|
|
|
|
kt = ISimulator::FailDisk;
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
2021-03-11 02:06:03 +08:00
|
|
|
TraceEvent("Assassination")
|
|
|
|
.detail("TargetDatacenter", target)
|
|
|
|
.detail("Reboot", self->reboot)
|
|
|
|
.detail("KillType", kt);
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
g_simulator.killDataCenter(target, kt);
|
2017-05-26 04:48:44 +08:00
|
|
|
} else {
|
2021-03-11 02:06:03 +08:00
|
|
|
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {
|
|
|
|
TraceEvent("WorkerKillBegin")
|
|
|
|
.detail("KilledMachines", killedMachines)
|
|
|
|
.detail("MachinesToKill", self->machinesToKill)
|
|
|
|
.detail("MachinesToLeave", self->machinesToLeave)
|
|
|
|
.detail("Machines", self->machines.size());
|
|
|
|
TEST(true); // Killing a machine
|
2017-05-27 08:43:28 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
wait(delay(delayBeforeKill));
|
2021-07-27 10:55:10 +08:00
|
|
|
TraceEvent("WorkerKillAfterDelay").log();
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
if (self->waitForVersion) {
|
|
|
|
state Transaction tr(cx);
|
2017-09-16 08:55:01 +08:00
|
|
|
loop {
|
|
|
|
try {
|
|
|
|
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
|
|
|
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
2019-02-13 08:07:17 +08:00
|
|
|
wait(success(tr.getReadVersion()));
|
2017-09-16 08:55:01 +08:00
|
|
|
break;
|
2021-03-11 02:06:03 +08:00
|
|
|
} catch (Error& e) {
|
|
|
|
wait(tr.onError(e));
|
2017-09-16 08:55:01 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-26 04:48:44 +08:00
|
|
|
// decide on a machine to kill
|
2019-04-03 05:15:51 +08:00
|
|
|
state LocalityData targetMachine = self->machines.back();
|
2021-03-11 02:06:03 +08:00
|
|
|
if (BUGGIFY_WITH_PROB(0.01)) {
|
|
|
|
TEST(true); // Marked a zone for maintenance before killing it
|
2019-09-26 14:19:42 +08:00
|
|
|
wait(success(
|
|
|
|
setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)));
|
2019-07-17 06:12:18 +08:00
|
|
|
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
|
|
|
TEST(true); // Disable DD for all storage server failures
|
2019-08-27 03:49:02 +08:00
|
|
|
self->ignoreSSFailures =
|
|
|
|
uncancellable(ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5));
|
2019-04-03 05:15:51 +08:00
|
|
|
}
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
TraceEvent("Assassination")
|
|
|
|
.detail("TargetMachine", targetMachine.toString())
|
|
|
|
.detail("ZoneId", targetMachine.zoneId())
|
|
|
|
.detail("Reboot", self->reboot)
|
|
|
|
.detail("KilledMachines", killedMachines)
|
|
|
|
.detail("MachinesToKill", self->machinesToKill)
|
|
|
|
.detail("MachinesToLeave", self->machinesToLeave)
|
|
|
|
.detail("Machines", self->machines.size())
|
|
|
|
.detail("Replace", self->replacement);
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
if (self->reboot) {
|
2021-03-11 02:06:03 +08:00
|
|
|
if (deterministicRandom()->random01() > 0.5) {
|
|
|
|
g_simulator.rebootProcess(targetMachine.zoneId(), deterministicRandom()->random01() > 0.5);
|
2017-05-26 04:48:44 +08:00
|
|
|
} else {
|
2021-03-11 02:06:03 +08:00
|
|
|
g_simulator.killZone(targetMachine.zoneId(), ISimulator::Reboot);
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
} else {
|
2019-05-11 05:01:52 +08:00
|
|
|
auto randomDouble = deterministicRandom()->random01();
|
2021-03-11 02:06:03 +08:00
|
|
|
TraceEvent("WorkerKill")
|
|
|
|
.detail("MachineCount", self->machines.size())
|
|
|
|
.detail("RandomValue", randomDouble);
|
|
|
|
if (randomDouble < 0.33) {
|
2017-05-26 04:48:44 +08:00
|
|
|
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
|
2021-03-11 02:06:03 +08:00
|
|
|
g_simulator.killZone(targetMachine.zoneId(), ISimulator::RebootAndDelete);
|
2017-05-26 04:48:44 +08:00
|
|
|
} else {
|
2021-01-28 06:29:43 +08:00
|
|
|
auto kt = ISimulator::KillInstantly;
|
2021-03-11 02:06:03 +08:00
|
|
|
if (self->allowFaultInjection) {
|
|
|
|
if (randomDouble < 0.50) {
|
2021-02-03 09:33:47 +08:00
|
|
|
kt = ISimulator::InjectFaults;
|
|
|
|
}
|
2021-03-11 02:06:03 +08:00
|
|
|
// FIXME: enable disk stalls
|
2021-02-03 09:33:47 +08:00
|
|
|
/*
|
2021-01-28 06:29:43 +08:00
|
|
|
if( randomDouble < 0.56 ) {
|
2021-03-11 02:06:03 +08:00
|
|
|
kt = ISimulator::InjectFaults;
|
2021-01-28 06:29:43 +08:00
|
|
|
} else if( randomDouble < 0.66 ) {
|
2021-03-11 02:06:03 +08:00
|
|
|
kt = ISimulator::FailDisk;
|
2021-01-28 06:29:43 +08:00
|
|
|
}
|
2021-02-03 09:33:47 +08:00
|
|
|
*/
|
2021-01-28 06:29:43 +08:00
|
|
|
}
|
2021-03-11 02:06:03 +08:00
|
|
|
g_simulator.killZone(targetMachine.zoneId(), kt);
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
killedMachines++;
|
2021-03-11 02:06:03 +08:00
|
|
|
if (!self->replacement)
|
2017-05-26 04:48:44 +08:00
|
|
|
self->machines.pop_back();
|
|
|
|
|
2019-08-06 06:00:17 +08:00
|
|
|
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
|
2019-07-25 06:32:52 +08:00
|
|
|
|
2019-05-11 05:01:52 +08:00
|
|
|
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
2017-05-26 04:48:44 +08:00
|
|
|
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-11 02:06:03 +08:00
|
|
|
if (self->killSelf)
|
2017-05-26 04:48:44 +08:00
|
|
|
throw please_reboot();
|
|
|
|
return Void();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Global factory object for MachineAttritionWorkload, constructed with the name "Attrition"
// (presumably the name test specifications use to select this workload — confirm against WorkloadFactory).
WorkloadFactory<MachineAttritionWorkload> MachineAttritionWorkloadFactory("Attrition");
|