/*
 * TargetedKill.actor.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
|
2019-02-18 07:41:16 +08:00
|
|
|
#include "fdbclient/NativeAPI.actor.h"
|
2019-02-18 11:25:16 +08:00
|
|
|
#include "fdbserver/TesterInterface.actor.h"
|
2019-02-18 11:18:30 +08:00
|
|
|
#include "fdbserver/workloads/workloads.actor.h"
|
2017-05-26 04:48:44 +08:00
|
|
|
#include "fdbrpc/simulator.h"
|
|
|
|
#include "fdbserver/MasterInterface.h"
|
|
|
|
#include "fdbclient/SystemData.h"
|
2019-02-18 11:13:26 +08:00
|
|
|
#include "fdbserver/WorkerInterface.actor.h"
|
2017-05-26 04:48:44 +08:00
|
|
|
#include "fdbserver/ServerDBInfo.h"
|
|
|
|
#include "fdbserver/QuietDatabase.h"
|
2018-08-11 06:18:24 +08:00
|
|
|
#include "flow/actorcompiler.h" // This must be the last #include.
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
struct TargetedKillWorkload : TestWorkload {
|
2022-10-14 10:53:48 +08:00
|
|
|
static constexpr auto NAME = "TargetedKill";
|
|
|
|
|
2017-05-26 04:48:44 +08:00
|
|
|
std::string machineToKill;
|
|
|
|
bool enabled, killAllMachineProcesses;
|
2021-10-20 08:22:27 +08:00
|
|
|
int numKillStorages;
|
2017-05-26 04:48:44 +08:00
|
|
|
double killAt;
|
2021-09-04 06:03:12 +08:00
|
|
|
bool reboot;
|
|
|
|
double suspendDuration;
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
TargetedKillWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
|
|
|
enabled = !clientId; // only do this on the "first" client
|
2022-09-20 02:35:58 +08:00
|
|
|
killAt = getOption(options, "killAt"_sr, 5.0);
|
|
|
|
reboot = getOption(options, "reboot"_sr, false);
|
|
|
|
suspendDuration = getOption(options, "suspendDuration"_sr, 1.0);
|
|
|
|
machineToKill = getOption(options, "machineToKill"_sr, "master"_sr).toString();
|
|
|
|
killAllMachineProcesses = getOption(options, "killWholeMachine"_sr, false);
|
|
|
|
numKillStorages = getOption(options, "numKillStorages"_sr, 1);
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
|
2020-10-05 13:29:07 +08:00
|
|
|
Future<Void> setup(Database const& cx) override { return Void(); }
|
|
|
|
Future<Void> start(Database const& cx) override {
|
2017-05-26 04:48:44 +08:00
|
|
|
if (enabled)
|
|
|
|
return assassin(cx, this);
|
|
|
|
return Void();
|
|
|
|
}
|
2020-10-05 13:29:07 +08:00
|
|
|
Future<bool> check(Database const& cx) override { return true; }
|
2021-09-17 08:42:34 +08:00
|
|
|
void getMetrics(std::vector<PerfMetric>& m) override {}
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-10-20 08:22:27 +08:00
|
|
|
Future<Void> killEndpoint(std::vector<WorkerDetails> workers,
|
|
|
|
NetworkAddress address,
|
|
|
|
Database cx,
|
|
|
|
TargetedKillWorkload* self) {
|
2022-09-15 08:10:49 +08:00
|
|
|
if (g_simulator == g_network) {
|
|
|
|
g_simulator->killInterface(address, ISimulator::KillInstantly);
|
2017-05-26 04:48:44 +08:00
|
|
|
return Void();
|
|
|
|
}
|
|
|
|
|
|
|
|
int killed = 0;
|
2021-10-20 08:22:27 +08:00
|
|
|
RebootRequest rbReq;
|
2021-09-04 06:03:12 +08:00
|
|
|
if (self->reboot) {
|
|
|
|
rbReq.waitForDuration = self->suspendDuration;
|
|
|
|
} else {
|
|
|
|
rbReq.waitForDuration = std::numeric_limits<uint32_t>::max();
|
|
|
|
}
|
2017-05-26 04:48:44 +08:00
|
|
|
for (int i = 0; i < workers.size(); i++) {
|
2019-03-09 00:25:07 +08:00
|
|
|
if (workers[i].interf.master.getEndpoint().getPrimaryAddress() == address ||
|
|
|
|
(self->killAllMachineProcesses &&
|
|
|
|
workers[i].interf.master.getEndpoint().getPrimaryAddress().ip == address.ip &&
|
|
|
|
workers[i].processClass != ProcessClass::TesterClass)) {
|
|
|
|
TraceEvent("WorkerKill").detail("TargetedMachine", address).detail("Worker", workers[i].interf.id());
|
2021-09-04 06:03:12 +08:00
|
|
|
workers[i].interf.clientInterface.reboot.send(rbReq);
|
2021-10-05 13:43:48 +08:00
|
|
|
killed++;
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!killed)
|
|
|
|
TraceEvent(SevWarn, "WorkerNotFoundAtEndpoint").detail("Address", address);
|
|
|
|
else
|
|
|
|
TraceEvent("WorkersKilledAtEndpoint").detail("Address", address).detail("KilledProcesses", killed);
|
|
|
|
|
|
|
|
return Void();
|
|
|
|
}
|
|
|
|
|
|
|
|
ACTOR Future<Void> assassin(Database cx, TargetedKillWorkload* self) {
|
2018-08-11 04:57:10 +08:00
|
|
|
wait(delay(self->killAt));
|
2021-09-17 08:42:34 +08:00
|
|
|
state std::vector<StorageServerInterface> storageServers = wait(getStorageServers(cx));
|
2021-10-20 08:22:27 +08:00
|
|
|
state std::vector<WorkerDetails> workers = wait(getWorkers(self->dbInfo));
|
2017-05-26 04:48:44 +08:00
|
|
|
|
2021-10-20 08:22:27 +08:00
|
|
|
state NetworkAddress machine;
|
|
|
|
state NetworkAddress ccAddr;
|
|
|
|
state int killed = 0;
|
|
|
|
state int s = 0;
|
|
|
|
state int j = 0;
|
2017-05-26 04:48:44 +08:00
|
|
|
if (self->machineToKill == "master") {
|
|
|
|
machine = self->dbInfo->get().master.address();
|
2020-09-11 08:44:15 +08:00
|
|
|
} else if (self->machineToKill == "commitproxy") {
|
2022-01-14 04:41:20 +08:00
|
|
|
auto commitProxies = cx->getCommitProxies(UseProvisionalProxies::False);
|
2020-09-16 13:29:49 +08:00
|
|
|
int o = deterministicRandom()->randomInt(0, commitProxies->size());
|
|
|
|
for (int i = 0; i < commitProxies->size(); i++) {
|
|
|
|
CommitProxyInterface mpi = commitProxies->getInterface(o);
|
2017-05-26 04:48:44 +08:00
|
|
|
machine = mpi.address();
|
2018-10-31 04:44:37 +08:00
|
|
|
if (machine != self->dbInfo->get().clusterInterface.getWorkers.getEndpoint().getPrimaryAddress())
|
2017-05-26 04:48:44 +08:00
|
|
|
break;
|
2020-09-16 13:29:49 +08:00
|
|
|
o = ++o % commitProxies->size();
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
2020-09-11 08:44:15 +08:00
|
|
|
} else if (self->machineToKill == "grvproxy") {
|
2022-01-14 04:41:20 +08:00
|
|
|
auto grvProxies = cx->getGrvProxies(UseProvisionalProxies::False);
|
2020-07-15 15:37:41 +08:00
|
|
|
int o = deterministicRandom()->randomInt(0, grvProxies->size());
|
|
|
|
for (int i = 0; i < grvProxies->size(); i++) {
|
|
|
|
GrvProxyInterface gpi = grvProxies->getInterface(o);
|
|
|
|
machine = gpi.address();
|
|
|
|
if (machine != self->dbInfo->get().clusterInterface.getWorkers.getEndpoint().getPrimaryAddress())
|
|
|
|
break;
|
|
|
|
o = ++o % grvProxies->size();
|
|
|
|
}
|
2020-09-11 08:44:15 +08:00
|
|
|
} else if (self->machineToKill == "tlog") {
|
2017-05-26 04:48:44 +08:00
|
|
|
auto tlogs = self->dbInfo->get().logSystemConfig.allPresentLogs();
|
2019-05-11 05:01:52 +08:00
|
|
|
int o = deterministicRandom()->randomInt(0, tlogs.size());
|
2017-05-26 04:48:44 +08:00
|
|
|
for (int i = 0; i < tlogs.size(); i++) {
|
|
|
|
TLogInterface tli = tlogs[o];
|
|
|
|
machine = tli.address();
|
2018-10-31 04:44:37 +08:00
|
|
|
if (machine != self->dbInfo->get().clusterInterface.getWorkers.getEndpoint().getPrimaryAddress())
|
2017-05-26 04:48:44 +08:00
|
|
|
break;
|
|
|
|
o = ++o % tlogs.size();
|
|
|
|
}
|
2020-09-11 08:44:15 +08:00
|
|
|
} else if (self->machineToKill == "storage" || self->machineToKill == "ss" ||
|
|
|
|
self->machineToKill == "storageserver") {
|
2021-10-20 08:22:27 +08:00
|
|
|
s = deterministicRandom()->randomInt(0, storageServers.size());
|
|
|
|
ccAddr = self->dbInfo->get().clusterInterface.getWorkers.getEndpoint().getPrimaryAddress();
|
|
|
|
for (j = 0; j < storageServers.size(); j++) {
|
|
|
|
StorageServerInterface ssi = storageServers[s];
|
2017-05-26 04:48:44 +08:00
|
|
|
machine = ssi.address();
|
2021-11-17 09:39:59 +08:00
|
|
|
if (machine != ccAddr) {
|
2021-10-20 08:22:27 +08:00
|
|
|
TraceEvent("IsolatedMark").detail("TargetedMachine", machine).detail("Role", self->machineToKill);
|
|
|
|
wait(self->killEndpoint(workers, machine, cx, self));
|
|
|
|
killed++;
|
|
|
|
if (killed == self->numKillStorages)
|
|
|
|
return Void();
|
|
|
|
}
|
|
|
|
s = ++s % storageServers.size();
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
2020-09-11 08:44:15 +08:00
|
|
|
} else if (self->machineToKill == "clustercontroller" || self->machineToKill == "cc") {
|
2018-10-31 04:44:37 +08:00
|
|
|
machine = self->dbInfo->get().clusterInterface.getWorkers.getEndpoint().getPrimaryAddress();
|
2017-05-26 04:48:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
TraceEvent("IsolatedMark").detail("TargetedMachine", machine).detail("Role", self->machineToKill);
|
|
|
|
|
2021-10-20 08:22:27 +08:00
|
|
|
wait(self->killEndpoint(workers, machine, cx, self));
|
2017-05-26 04:48:44 +08:00
|
|
|
|
|
|
|
return Void();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2022-10-14 10:53:48 +08:00
|
|
|
// Global factory instance for TargetedKillWorkload.
// NOTE(review): presumably its construction registers the workload with the test harness under
// TargetedKillWorkload::NAME ("TargetedKill") - confirm against WorkloadFactory.
WorkloadFactory<TargetedKillWorkload> TargetedKillWorkloadFactory;
|