// foundationdb/fdbserver/workloads/Rollback.actor.cpp
//
// 137 lines
// 5.4 KiB
// C++

/*
* Rollback.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "flow/DeterministicRandom.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/MasterInterface.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/ServerDBInfo.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// Choose a random commit proxy and a random tLog; the chosen tLog is referred to as uncloggedTLog.
// The workload first clogs the network links between the chosen proxy and all tLogs except uncloggedTLog.
// While those links are still clogged, the workload either reboots the proxy (when enableFailures is set)
// or clogs the proxy's interface, and in both cases clogs uncloggedTLog's interface.
// Note: The clogged network link's latency will become "clogDuration".
// Failure-injection workload that attempts to trigger a rollback (see the "AttemptingToTriggerRollback"
// trace event): it clogs the links between one randomly chosen commit proxy and all but one tLog, then
// disturbs the proxy and the remaining tLog while those links are still clogged.
struct RollbackWorkload : FailureInjectionWorkload {
    static constexpr auto NAME = "Rollback";

    // enableFailures: reboot the chosen proxy instead of only clogging its interface.
    // multiple:       inject failures repeatedly (poisson-spaced by meanDelay) rather than once.
    // enabled:        whether this client performs the injection; defaults to false so that it is never
    //                 read uninitialized before a constructor or initFailureInjectionMode() assigns it.
    bool enableFailures = false, multiple = true, enabled = false;

    // meanDelay:    mean interval handed to poisson() between injections (only used when multiple is true).
    // clogDuration: duration passed to clogPair()/clogInterface().
    // testDuration: timeout applied to the failure worker in start().
    double meanDelay = 20.0, clogDuration = 3.0, testDuration = 10.0;

    // Used when the workload is created without test options; the fields keep their defaults until
    // initFailureInjectionMode() is called.
    RollbackWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {}

    // Used when the workload is created from a test specification; each option falls back to the
    // member's default value when it is not present in `options`.
    RollbackWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
        enabled = clientId == 0; // only do this on the "first" client
        meanDelay = getOption(options, "meanDelay"_sr, meanDelay); // Only matters if multiple==true
        clogDuration = getOption(options, "clogDuration"_sr, clogDuration);
        testDuration = getOption(options, "testDuration"_sr, testDuration);
        enableFailures = getOption(options, "enableFailures"_sr, enableFailures);
        multiple = getOption(options, "multiple"_sr, multiple);
    }

    // Picks the mode from `random`: `multiple` is a coin flip and `enableFailures` is set with
    // probability 0.2. As in the options constructor, only client 0 is enabled.
    void initFailureInjectionMode(DeterministicRandom& random) override {
        enabled = clientId == 0;
        multiple = random.coinflip();
        enableFailures = random.random01() < 0.2;
    }

    Future<Void> setup(Database const& cx) override { return Void(); }

    // Starts the failure worker, bounded by testDuration, when this client is enabled and
    // g_simulator == g_network (presumably: when running under simulation); otherwise does nothing.
    Future<Void> start(Database const& cx) override {
        if (g_simulator == g_network && enabled) {
            return timeout(reportErrors(rollbackFailureWorker(cx, this, meanDelay), "RollbackFailureWorkerError"),
                           testDuration,
                           Void());
        }
        return Void();
    }

    // This workload has no correctness check of its own and reports no metrics.
    Future<bool> check(Database const& cx) override { return true; }
    void getMetrics(std::vector<PerfMetric>& m) override {}

    // Performs one rollback attempt:
    //   1. pick a random commit proxy and a random tLog (the "unclogged" tLog);
    //   2. give up if any other tLog shares an IP with the proxy;
    //   3. clog the proxy <-> tLog link for every tLog except the unclogged one;
    //   4. after clogDuration / 3, reboot or clog the proxy and clog the unclogged tLog.
    ACTOR Future<Void> simulateFailure(Database cx, RollbackWorkload* self) {
        // Snapshot of the current ServerDBInfo; it is only read before the wait() below.
        ServerDBInfo system = self->dbInfo->get();
        auto tlogs = system.logSystemConfig.allPresentLogs();

        if (tlogs.empty() || system.client.commitProxies.empty()) {
            // NOTE(review): this reason is also reported when only the commit proxy list is empty.
            TraceEvent(SevInfo, "UnableToTriggerRollback").detail("Reason", "No tlogs in System Map");
            return Void();
        }

        state CommitProxyInterface proxy = deterministicRandom()->randomChoice(system.client.commitProxies);

        int utIndex = deterministicRandom()->randomInt(0, tlogs.size());
        state NetworkAddress uncloggedTLog = tlogs[utIndex].address();

        // clogPair() below works on IP addresses, so bail out if a tLog that would be clogged has the
        // same IP as the proxy.
        for (int t = 0; t < tlogs.size(); t++) {
            if (t != utIndex && tlogs[t].address().ip == proxy.address().ip) {
                TraceEvent(SevInfo, "UnableToTriggerRollback").detail("Reason", "proxy-clogged tLog shared IPs");
                return Void();
            }
        }

        TraceEvent("AttemptingToTriggerRollback")
            .detail("CommitProxy", proxy.address())
            .detail("UncloggedTLog", uncloggedTLog);

        for (int t = 0; t < tlogs.size(); t++) {
            if (t != utIndex) {
                g_simulator->clogPair(proxy.address().ip, tlogs[t].address().ip, self->clogDuration);
                // g_simulator->clogInterface( g_simulator->getProcess( system.tlogs[t].commit.getEndpoint() ),
                // self->clogDuration, ClogAll );
            }
        }

        // While the clogged machines are still clogged...
        wait(delay(self->clogDuration / 3));

        if (self->enableFailures) {
            // Reboot the proxy and clog the unclogged tlog.
            g_simulator->rebootProcess(g_simulator->getProcessByAddress(proxy.address()), ISimulator::Reboot);
            g_simulator->clogInterface(uncloggedTLog.ip, self->clogDuration, ClogAll);
        } else {
            // Alternatively, if we're not injecting machine failures, clog the proxy and the unclogged tlog.
            g_simulator->clogInterface(proxy.address().ip, self->clogDuration, ClogAll);
            g_simulator->clogInterface(uncloggedTLog.ip, self->clogDuration, ClogAll);
        }
        return Void();
    }

    // Drives simulateFailure(). When `multiple` is set it loops forever, waiting on
    // poisson(&lastTime, delay) before each attempt (start() bounds the whole actor with a timeout).
    // Otherwise it waits a random fraction of max(0, testDuration - clogDuration * 13.0) and makes
    // a single attempt.
    // The parameter named `delay` shadows the global delay() function, hence the `::delay` call below.
    ACTOR Future<Void> rollbackFailureWorker(Database cx, RollbackWorkload* self, double delay) {
        if (self->multiple) {
            state double lastTime = now();
            loop {
                wait(poisson(&lastTime, delay));
                wait(self->simulateFailure(cx, self));
            }
        } else {
            wait(::delay(deterministicRandom()->random01() *
                         std::max(0.0, self->testDuration - self->clogDuration * 13.0)));
            wait(self->simulateFailure(cx, self));
        }
        return Void();
    }
};
// Global factory objects for RollbackWorkload: one for use as a regular workload and one for use as a
// failure injector.
// NOTE(review): presumably constructing these registers the workload (under RollbackWorkload::NAME) with
// the test harness during static initialization — confirm against the factory templates.
WorkloadFactory<RollbackWorkload> RollbackWorkloadFactory;
FailureInjectorFactory<RollbackWorkload> RollbackFailureInjectorFactory;