Merge pull request #7769 from sfc-gh-mpilman/features/always-inject-faults
Separated normal workloads and failure injection
This commit is contained in:
commit
bd8347d92e
|
@ -66,6 +66,7 @@ struct WorkloadRequest {
|
|||
double databasePingDelay;
|
||||
int64_t sharedRandomNumber;
|
||||
bool useDatabase;
|
||||
bool runFailureWorkloads = true;
|
||||
Optional<TenantNameRef> defaultTenant;
|
||||
|
||||
// The vector of option lists are to construct compound workloads. If there
|
||||
|
@ -98,6 +99,7 @@ struct WorkloadRequest {
|
|||
clientCount,
|
||||
reply,
|
||||
defaultTenant,
|
||||
runFailureWorkloads,
|
||||
arena);
|
||||
}
|
||||
};
|
||||
|
|
|
@ -92,6 +92,63 @@ private:
|
|||
virtual void getMetrics(std::vector<PerfMetric>& m) = 0;
|
||||
};
|
||||
|
||||
struct CompoundWorkload;
|
||||
class DeterministicRandom;
|
||||
|
||||
struct NoOptions {};
|
||||
|
||||
struct FailureInjectionWorkload : TestWorkload {
|
||||
FailureInjectionWorkload(WorkloadContext const&);
|
||||
virtual ~FailureInjectionWorkload() {}
|
||||
virtual bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload);
|
||||
virtual void initFailureInjectionMode(DeterministicRandom& random, unsigned count);
|
||||
|
||||
Future<Void> setupInjectionWorkload(Database const& cx, Future<Void> done);
|
||||
Future<Void> startInjectionWorkload(Database const& cx, Future<Void> done);
|
||||
Future<bool> checkInjectionWorkload(Database const& cx, Future<bool> done);
|
||||
};
|
||||
|
||||
// Interface for objects that can construct a FailureInjectionWorkload, plus the list that
// holds every such factory.
struct IFailureInjectorFactory : ReferenceCounted<IFailureInjectorFactory> {
    virtual ~IFailureInjectorFactory() = default;

    // Returns the list of factories. The list is a function-local static, so it is
    // created the first time this function is called.
    static std::vector<Reference<IFailureInjectorFactory>>& factories() {
        static std::vector<Reference<IFailureInjectorFactory>> registry;
        return registry;
    }

    // Creates a new failure-injection workload for the given context.
    virtual Reference<FailureInjectionWorkload> create(WorkloadContext const& wcx) = 0;
};
|
||||
|
||||
template <class W>
|
||||
struct FailureInjectorFactory : IFailureInjectorFactory {
|
||||
static_assert(std::is_base_of<FailureInjectionWorkload, W>::value);
|
||||
FailureInjectorFactory() {
|
||||
IFailureInjectorFactory::factories().push_back(Reference<IFailureInjectorFactory>::addRef(this));
|
||||
}
|
||||
Reference<FailureInjectionWorkload> create(WorkloadContext const& wcx) override {
|
||||
return makeReference<W>(wcx, NoOptions());
|
||||
}
|
||||
};
|
||||
|
||||
struct CompoundWorkload : TestWorkload {
|
||||
bool runFailureWorkloads = true;
|
||||
std::vector<Reference<TestWorkload>> workloads;
|
||||
std::vector<Reference<FailureInjectionWorkload>> failureInjection;
|
||||
|
||||
CompoundWorkload(WorkloadContext& wcx);
|
||||
CompoundWorkload* add(Reference<TestWorkload>&& w);
|
||||
void addFailureInjection(WorkloadRequest& work);
|
||||
|
||||
std::string description() const override;
|
||||
|
||||
Future<Void> setup(Database const& cx) override;
|
||||
Future<Void> start(Database const& cx) override;
|
||||
Future<bool> check(Database const& cx) override;
|
||||
|
||||
Future<std::vector<PerfMetric>> getMetrics() override;
|
||||
double getCheckTimeout() const override;
|
||||
|
||||
void getMetrics(std::vector<PerfMetric>&) override;
|
||||
};
|
||||
|
||||
struct WorkloadProcess;
|
||||
struct ClientWorkload : TestWorkload {
|
||||
WorkloadProcess* impl;
|
||||
|
@ -223,6 +280,7 @@ public:
|
|||
bool dumpAfterTest;
|
||||
bool clearAfterTest;
|
||||
bool useDB;
|
||||
bool runFailureWorkloads = true;
|
||||
double startDelay;
|
||||
int phases;
|
||||
Standalone<VectorRef<VectorRef<KeyValueRef>>> options;
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include <toml.hpp>
|
||||
|
||||
#include "flow/ActorCollection.h"
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "fdbrpc/sim_validation.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbclient/ClusterInterface.h"
|
||||
|
@ -271,104 +272,185 @@ Standalone<VectorRef<KeyValueRef>> checkAllOptionsConsumed(VectorRef<KeyValueRef
|
|||
return unconsumed;
|
||||
}
|
||||
|
||||
struct CompoundWorkload : TestWorkload {
|
||||
std::vector<Reference<TestWorkload>> workloads;
|
||||
CompoundWorkload::CompoundWorkload(WorkloadContext& wcx) : TestWorkload(wcx) {}
|
||||
|
||||
CompoundWorkload(WorkloadContext& wcx) : TestWorkload(wcx) {}
|
||||
CompoundWorkload* add(Reference<TestWorkload>&& w) {
|
||||
workloads.push_back(std::move(w));
|
||||
return this;
|
||||
}
|
||||
CompoundWorkload* CompoundWorkload::add(Reference<TestWorkload>&& w) {
|
||||
workloads.push_back(std::move(w));
|
||||
return this;
|
||||
}
|
||||
|
||||
std::string description() const override {
|
||||
std::string d;
|
||||
for (int w = 0; w < workloads.size(); w++)
|
||||
d += workloads[w]->description() + (w == workloads.size() - 1 ? "" : ";");
|
||||
return d;
|
||||
std::string CompoundWorkload::description() const {
|
||||
std::vector<std::string> names;
|
||||
names.reserve(workloads.size());
|
||||
for (auto const& w : workloads) {
|
||||
names.push_back(w->description());
|
||||
}
|
||||
Future<Void> setup(Database const& cx) override {
|
||||
std::vector<Future<Void>> all;
|
||||
all.reserve(workloads.size());
|
||||
for (int w = 0; w < workloads.size(); w++)
|
||||
all.push_back(workloads[w]->setup(cx));
|
||||
return waitForAll(all);
|
||||
return fmt::format("{}", fmt::join(std::move(names), ";"));
|
||||
}
|
||||
Future<Void> CompoundWorkload::setup(Database const& cx) {
|
||||
std::vector<Future<Void>> all;
|
||||
all.reserve(workloads.size());
|
||||
for (int w = 0; w < workloads.size(); w++)
|
||||
all.push_back(workloads[w]->setup(cx));
|
||||
auto done = waitForAll(all);
|
||||
if (failureInjection.empty()) {
|
||||
return done;
|
||||
}
|
||||
Future<Void> start(Database const& cx) override {
|
||||
std::vector<Future<Void>> all;
|
||||
all.reserve(workloads.size());
|
||||
auto wCount = std::make_shared<unsigned>(0);
|
||||
for (int i = 0; i < workloads.size(); i++) {
|
||||
std::string workloadName = workloads[i]->description();
|
||||
++(*wCount);
|
||||
TraceEvent("WorkloadRunStatus")
|
||||
.detail("Name", workloadName)
|
||||
.detail("Count", *wCount)
|
||||
.detail("Phase", "Start");
|
||||
all.push_back(fmap(
|
||||
[workloadName, wCount](Void value) {
|
||||
--(*wCount);
|
||||
TraceEvent("WorkloadRunStatus")
|
||||
.detail("Name", workloadName)
|
||||
.detail("Remaining", *wCount)
|
||||
.detail("Phase", "End");
|
||||
return Void();
|
||||
},
|
||||
workloads[i]->start(cx)));
|
||||
std::vector<Future<Void>> res;
|
||||
res.reserve(failureInjection.size());
|
||||
for (auto& f : failureInjection) {
|
||||
res.push_back(f->setupInjectionWorkload(cx, done));
|
||||
}
|
||||
return waitForAll(res);
|
||||
}
|
||||
|
||||
// Starts every regular workload and then every failure-injection workload, and returns a
// future that waits for all of them.
//
// A "WorkloadRunStatus" trace event is logged when each workload is started ("Count" is
// the number started and not yet finished) and again when it finishes ("Remaining").
Future<Void> CompoundWorkload::start(Database const& cx) {
    // Copied into every completion callback so they all update the same counter.
    auto active = std::make_shared<unsigned>(0);

    std::vector<Future<Void>> started;
    started.reserve(workloads.size() + failureInjection.size());

    // Only invoked synchronously within this function, so reference captures are fine.
    auto launch = [&started, &cx, &active](TestWorkload& w) {
        auto name = w.description();
        ++(*active);
        TraceEvent("WorkloadRunStatus").detail("Name", name).detail("Count", *active).detail("Phase", "Start");
        started.push_back(fmap(
            [name, active](Void) {
                --(*active);
                TraceEvent("WorkloadRunStatus")
                    .detail("Name", name)
                    .detail("Remaining", *active)
                    .detail("Phase", "End");
                return Void();
            },
            w.start(cx)));
    };

    for (auto& regular : workloads) {
        launch(*regular);
    }
    for (auto& injector : failureInjection) {
        launch(*injector);
    }
    return waitForAll(started);
}
|
||||
|
||||
// Runs check() on every regular workload and every failure-injection workload and
// combines the individual verdicts with allTrue().
//
// A "WorkloadCheckStatus" trace event is logged when each check is started ("Count" is the
// number started and not yet finished) and again when it finishes ("Remaining").
Future<bool> CompoundWorkload::check(Database const& cx) {
    std::vector<Future<bool>> all;
    all.reserve(workloads.size() + failureInjection.size());
    // Copied into every completion callback so they all update the same counter.
    auto wCount = std::make_shared<unsigned>(0);
    // Only invoked synchronously within this function, so reference captures are fine.
    auto starter = [&](TestWorkload& workload) -> Future<bool> {
        ++(*wCount);
        std::string workloadName = workload.description();
        TraceEvent("WorkloadCheckStatus")
            .detail("Name", workloadName)
            .detail("Count", *wCount)
            .detail("Phase", "Start");
        return fmap(
            [workloadName, wCount](bool ret) {
                --(*wCount);
                TraceEvent("WorkloadCheckStatus")
                    .detail("Name", workloadName)
                    .detail("Remaining", *wCount)
                    .detail("Phase", "End");
                // Pass the workload's own verdict through. Returning a constant `true`
                // here would make allTrue() below succeed even when a check failed.
                return ret;
            },
            workload.check(cx));
    };
    for (auto& workload : workloads) {
        all.push_back(starter(*workload));
    }
    for (auto& workload : failureInjection) {
        all.push_back(starter(*workload));
    }
    return allTrue(all);
}
|
||||
|
||||
ACTOR Future<std::vector<PerfMetric>> getMetricsCompoundWorkload(CompoundWorkload* self) {
|
||||
state std::vector<Future<std::vector<PerfMetric>>> results;
|
||||
for (int w = 0; w < self->workloads.size(); w++) {
|
||||
std::vector<PerfMetric> p;
|
||||
results.push_back(self->workloads[w]->getMetrics());
|
||||
}
|
||||
wait(waitForAll(results));
|
||||
std::vector<PerfMetric> res;
|
||||
for (int i = 0; i < results.size(); ++i) {
|
||||
auto const& p = results[i].get();
|
||||
for (auto const& m : p) {
|
||||
res.push_back(m.withPrefix(self->workloads[i]->description() + "."));
|
||||
}
|
||||
return waitForAll(all);
|
||||
}
|
||||
Future<bool> check(Database const& cx) override {
|
||||
std::vector<Future<bool>> all;
|
||||
all.reserve(workloads.size());
|
||||
auto wCount = std::make_shared<unsigned>(0);
|
||||
for (int i = 0; i < workloads.size(); i++) {
|
||||
++(*wCount);
|
||||
std::string workloadName = workloads[i]->description();
|
||||
TraceEvent("WorkloadCheckStatus")
|
||||
.detail("Name", workloadName)
|
||||
.detail("Count", *wCount)
|
||||
.detail("Phase", "Start");
|
||||
all.push_back(fmap(
|
||||
[workloadName, wCount](bool ret) {
|
||||
--(*wCount);
|
||||
TraceEvent("WorkloadCheckStatus")
|
||||
.detail("Name", workloadName)
|
||||
.detail("Remaining", *wCount)
|
||||
.detail("Phase", "End");
|
||||
return true;
|
||||
},
|
||||
workloads[i]->check(cx)));
|
||||
}
|
||||
return allTrue(all);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
ACTOR static Future<std::vector<PerfMetric>> getMetrics(CompoundWorkload* self) {
|
||||
state std::vector<Future<std::vector<PerfMetric>>> results;
|
||||
for (int w = 0; w < self->workloads.size(); w++) {
|
||||
std::vector<PerfMetric> p;
|
||||
results.push_back(self->workloads[w]->getMetrics());
|
||||
}
|
||||
wait(waitForAll(results));
|
||||
std::vector<PerfMetric> res;
|
||||
for (int i = 0; i < results.size(); ++i) {
|
||||
auto const& p = results[i].get();
|
||||
for (auto const& m : p) {
|
||||
res.push_back(m.withPrefix(self->workloads[i]->description() + "."));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
void CompoundWorkload::addFailureInjection(WorkloadRequest& work) {
|
||||
if (!work.runFailureWorkloads || !FLOW_KNOBS->ENABLE_SIMULATION_IMPROVEMENTS) {
|
||||
return;
|
||||
}
|
||||
|
||||
Future<std::vector<PerfMetric>> getMetrics() override { return getMetrics(this); }
|
||||
double getCheckTimeout() const override {
|
||||
double m = 0;
|
||||
for (int w = 0; w < workloads.size(); w++)
|
||||
m = std::max(workloads[w]->getCheckTimeout(), m);
|
||||
return m;
|
||||
// Some common workloads won't work with failure injection workloads
|
||||
for (auto const& w : workloads) {
|
||||
auto desc = w->description();
|
||||
if (desc == "ChangeConfig") {
|
||||
return;
|
||||
} else if (desc == "SaveAndKill") {
|
||||
return;
|
||||
}
|
||||
}
|
||||
auto& factories = IFailureInjectorFactory::factories();
|
||||
DeterministicRandom random(sharedRandomNumber);
|
||||
for (auto& factory : factories) {
|
||||
auto workload = factory->create(*this);
|
||||
while (workload->add(random, work, *this)) {
|
||||
failureInjection.push_back(workload);
|
||||
workload = factory->create(*this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void getMetrics(std::vector<PerfMetric>&) override { ASSERT(false); }
|
||||
};
|
||||
// Asynchronous metrics collection; the work is done by the getMetricsCompoundWorkload actor.
Future<std::vector<PerfMetric>> CompoundWorkload::getMetrics() {
    return getMetricsCompoundWorkload(this);
}
|
||||
|
||||
// Returns the largest check timeout requested by any entry of `workloads`, or 0 when
// there are none.
double CompoundWorkload::getCheckTimeout() const {
    double longest = 0;
    for (auto const& w : workloads) {
        longest = std::max(w->getCheckTimeout(), longest);
    }
    return longest;
}
|
||||
|
||||
// The synchronous metrics overload is not supported on a compound workload; calling it
// trips an assertion. Use the Future-returning getMetrics() instead.
void CompoundWorkload::getMetrics(std::vector<PerfMetric>&) {
    ASSERT(false);
}
|
||||
|
||||
// Forwards the workload context to the TestWorkload base; no other state is set up here.
FailureInjectionWorkload::FailureInjectionWorkload(WorkloadContext const& wcx)
  : TestWorkload(wcx) {}
|
||||
|
||||
// Default policy for deciding whether this workload joins `workload`.
//
// Counts the workloads (regular and failure-injection) that already share this
// workload's description(). The workload is rejected when three or more exist or when the
// request does not use a database; otherwise one value is drawn from `random` and the
// workload is accepted when that value is below 0.1 / (1 + count). On acceptance,
// initFailureInjectionMode() is called with the count before returning true.
bool FailureInjectionWorkload::add(DeterministicRandom& random,
                                   const WorkloadRequest& work,
                                   const CompoundWorkload& workload) {
    auto desc = description();
    auto sameDescription = [&desc](auto const& w) { return w->description() == desc; };

    unsigned alreadyAdded = std::count_if(workload.workloads.begin(), workload.workloads.end(), sameDescription);
    alreadyAdded +=
        std::count_if(workload.failureInjection.begin(), workload.failureInjection.end(), sameDescription);

    // These guards come first so that `random` is only consulted when the workload is
    // still eligible.
    if (!(alreadyAdded < 3) || !work.useDatabase) {
        return false;
    }
    if (0.1 / (1 + alreadyAdded) > random.random01()) {
        initFailureInjectionMode(random, alreadyAdded);
        return true;
    }
    return false;
}
|
||||
|
||||
// Default hook: nothing to configure. Subclasses override this to set themselves up when
// add() accepts them.
void FailureInjectionWorkload::initFailureInjectionMode(DeterministicRandom& random, unsigned count) {
}
|
||||
|
||||
// Kicks off this workload's setup() and passes the resulting future, with `done`, to
// holdWhile().
Future<Void> FailureInjectionWorkload::setupInjectionWorkload(const Database& cx, Future<Void> done) {
    return holdWhile(setup(cx), done);
}
|
||||
|
||||
// Kicks off this workload's start() and passes the resulting future, with `done`, to
// holdWhile().
Future<Void> FailureInjectionWorkload::startInjectionWorkload(const Database& cx, Future<Void> done) {
    return holdWhile(start(cx), done);
}
|
||||
|
||||
// Kicks off this workload's check() and passes the resulting future, with `done`, to
// holdWhile().
Future<bool> FailureInjectionWorkload::checkInjectionWorkload(const Database& cx, Future<bool> done) {
    return holdWhile(check(cx), done);
}
|
||||
|
||||
ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
|
||||
Reference<IClusterConnectionRecord> ccr,
|
||||
|
@ -422,10 +504,6 @@ ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
|
|||
fprintf(stderr, "ERROR: No options were provided for workload.\n");
|
||||
throw test_specification_invalid();
|
||||
}
|
||||
if (work.options.size() == 1) {
|
||||
Reference<TestWorkload> res = wait(getWorkloadIface(work, ccr, work.options[0], dbInfo));
|
||||
return res;
|
||||
}
|
||||
|
||||
wcx.clientId = work.clientId;
|
||||
wcx.clientCount = work.clientCount;
|
||||
|
@ -440,6 +518,7 @@ ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
|
|||
for (int i = 0; i < work.options.size(); i++) {
|
||||
compound->add(ifaces[i].getValue());
|
||||
}
|
||||
compound->addFailureInjection(work);
|
||||
return compound;
|
||||
}
|
||||
|
||||
|
@ -736,7 +815,7 @@ ACTOR Future<Void> testerServerCore(TesterInterface interf,
|
|||
state PromiseStream<Future<Void>> addWorkload;
|
||||
state Future<Void> workerFatalError = actorCollection(addWorkload.getFuture());
|
||||
|
||||
TraceEvent("StartingTesterServerCore", interf.id());
|
||||
TraceEvent("StartingTesterServerCore", interf.id()).log();
|
||||
loop choose {
|
||||
when(wait(workerFatalError)) {}
|
||||
when(WorkloadRequest work = waitNext(interf.recruitments.getFuture())) {
|
||||
|
@ -883,6 +962,7 @@ ACTOR Future<DistributedTestResults> runWorkload(Database cx,
|
|||
WorkloadRequest req;
|
||||
req.title = spec.title;
|
||||
req.useDatabase = spec.useDB;
|
||||
req.runFailureWorkloads = spec.runFailureWorkloads;
|
||||
req.timeout = spec.timeout;
|
||||
req.databasePingDelay = spec.useDB ? spec.databasePingDelay : 0.0;
|
||||
req.options = spec.options;
|
||||
|
@ -976,6 +1056,7 @@ ACTOR Future<Void> changeConfiguration(Database cx, std::vector<TesterInterface>
|
|||
state TestSpec spec;
|
||||
Standalone<VectorRef<KeyValueRef>> options;
|
||||
spec.title = LiteralStringRef("ChangeConfig");
|
||||
spec.runFailureWorkloads = false;
|
||||
options.push_back_deep(options.arena(),
|
||||
KeyValueRef(LiteralStringRef("testName"), LiteralStringRef("ChangeConfig")));
|
||||
options.push_back_deep(options.arena(), KeyValueRef(LiteralStringRef("configMode"), configMode));
|
||||
|
@ -1022,6 +1103,7 @@ ACTOR Future<Void> checkConsistency(Database cx,
|
|||
}
|
||||
spec.title = LiteralStringRef("ConsistencyCheck");
|
||||
spec.databasePingDelay = databasePingDelay;
|
||||
spec.runFailureWorkloads = false;
|
||||
spec.timeout = 32000;
|
||||
options.push_back_deep(options.arena(),
|
||||
KeyValueRef(LiteralStringRef("testName"), LiteralStringRef("ConsistencyCheck")));
|
||||
|
@ -1317,6 +1399,8 @@ std::map<std::string, std::function<void(const std::string& value, TestSpec* spe
|
|||
if (value == "false")
|
||||
spec->restorePerpetualWiggleSetting = false;
|
||||
} },
|
||||
{ "runFailureWorkloads",
|
||||
[](const std::string& value, TestSpec* spec) { spec->runFailureWorkloads = (value == "true"); } },
|
||||
};
|
||||
|
||||
std::vector<TestSpec> readTests(std::ifstream& ifs) {
|
||||
|
@ -1541,6 +1625,24 @@ ACTOR Future<Void> monitorServerDBInfo(Reference<AsyncVar<Optional<ClusterContro
|
|||
}
|
||||
}
|
||||
|
||||
// Reads the database configuration and copies its replication settings (storage policy,
// tlog policy, tlog write anti-quorum, remote tlog policy, usable regions) into
// g_simulator. Asserts that the network is simulated. When the read throws, waits on
// tr.onError() and tries again.
ACTOR Future<Void> initializeSimConfig(Database db) {
    state Transaction tr(db);
    ASSERT(g_network->isSimulated());
    loop {
        try {
            DatabaseConfiguration conf = wait(getDatabaseConfiguration(&tr));
            g_simulator->storagePolicy = conf.storagePolicy;
            g_simulator->tLogPolicy = conf.tLogPolicy;
            g_simulator->tLogWriteAntiQuorum = conf.tLogWriteAntiQuorum;
            g_simulator->remoteTLogPolicy = conf.getRemoteTLogPolicy();
            g_simulator->usableRegions = conf.usableRegions;
            return Void();
        } catch (Error& err) {
            wait(tr.onError(err));
        }
    }
}
|
||||
|
||||
/**
|
||||
* \brief Test orchestrator: sends test specification to testers in the right order and collects the results.
|
||||
*
|
||||
|
@ -1621,6 +1723,7 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
|
|||
|
||||
// Change the configuration (and/or create the database) if necessary
|
||||
printf("startingConfiguration:%s start\n", startingConfiguration.toString().c_str());
|
||||
fmt::print("useDB: {}\n", useDB);
|
||||
printSimulatedTopology();
|
||||
if (useDB && startingConfiguration != StringRef()) {
|
||||
try {
|
||||
|
@ -1655,6 +1758,9 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
|
|||
}
|
||||
|
||||
wait(waitForAll(tenantFutures));
|
||||
if (g_network->isSimulated()) {
|
||||
wait(initializeSimConfig(cx));
|
||||
}
|
||||
}
|
||||
|
||||
if (useDB && waitForQuiescenceBegin) {
|
||||
|
@ -1674,6 +1780,7 @@ ACTOR Future<Void> runTests(Reference<AsyncVar<Optional<struct ClusterController
|
|||
if (perpetualWiggleEnabled) { // restore the enabled perpetual storage wiggle setting
|
||||
printf("Set perpetual_storage_wiggle=1 ...\n");
|
||||
Version cVer = wait(setPerpetualStorageWiggle(cx, true, LockAware::True));
|
||||
(void)cVer;
|
||||
printf("Set perpetual_storage_wiggle=1 Done.\n");
|
||||
}
|
||||
}
|
||||
|
@ -1822,6 +1929,7 @@ ACTOR Future<Void> runTests(Reference<IClusterConnectionRecord> connRecord,
|
|||
TestSpec spec;
|
||||
Standalone<VectorRef<KeyValueRef>> options;
|
||||
spec.title = LiteralStringRef("ConsistencyCheck");
|
||||
spec.runFailureWorkloads = false;
|
||||
spec.databasePingDelay = 0;
|
||||
spec.timeout = 0;
|
||||
spec.waitForQuiescenceBegin = false;
|
||||
|
|
|
@ -28,19 +28,19 @@
|
|||
#include "fdbserver/Status.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
struct DiskFailureInjectionWorkload : TestWorkload {
|
||||
struct DiskFailureInjectionWorkload : FailureInjectionWorkload {
|
||||
bool enabled;
|
||||
double testDuration;
|
||||
double startDelay;
|
||||
bool throttleDisk;
|
||||
int workersToThrottle;
|
||||
double stallInterval;
|
||||
double stallPeriod;
|
||||
double throttlePeriod;
|
||||
bool corruptFile;
|
||||
int workersToCorrupt;
|
||||
double percentBitFlips;
|
||||
double periodicBroadcastInterval;
|
||||
double testDuration = 60.0;
|
||||
double startDelay = 0.0;
|
||||
bool throttleDisk = false;
|
||||
int workersToThrottle = 3;
|
||||
double stallInterval = 0.0;
|
||||
double stallPeriod = 60.0;
|
||||
double throttlePeriod = 60.0;
|
||||
bool corruptFile = false;
|
||||
int workersToCorrupt = 1;
|
||||
double percentBitFlips = 10;
|
||||
double periodicBroadcastInterval = 5.0;
|
||||
std::vector<NetworkAddress> chosenWorkers;
|
||||
std::vector<Future<Void>> clients;
|
||||
// Verification Mode: We run the workload indefinitely in this mode.
|
||||
|
@ -48,22 +48,27 @@ struct DiskFailureInjectionWorkload : TestWorkload {
|
|||
// that we haven't lost the chaos event. testDuration is ignored in this mode
|
||||
bool verificationMode;
|
||||
|
||||
DiskFailureInjectionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
// Constructor that takes NoOptions instead of reading the workload's options; this is the
// form FailureInjectorFactory<W>::create() calls. Members with default member initializers
// keep those defaults.
DiskFailureInjectionWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {
    // `enabled` and `verificationMode` are declared without initializers, so give them
    // defined values here instead of leaving them indeterminate. initFailureInjectionMode()
    // assigns `enabled` again once the workload is selected.
    enabled = false;
    verificationMode = false;
}
|
||||
|
||||
DiskFailureInjectionWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
|
||||
enabled = !clientId; // only do this on the "first" client
|
||||
startDelay = getOption(options, LiteralStringRef("startDelay"), 0.0);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), 60.0);
|
||||
verificationMode = getOption(options, LiteralStringRef("verificationMode"), false);
|
||||
throttleDisk = getOption(options, LiteralStringRef("throttleDisk"), false);
|
||||
workersToThrottle = getOption(options, LiteralStringRef("workersToThrottle"), 3);
|
||||
stallInterval = getOption(options, LiteralStringRef("stallInterval"), 0.0);
|
||||
stallPeriod = getOption(options, LiteralStringRef("stallPeriod"), 60.0);
|
||||
throttlePeriod = getOption(options, LiteralStringRef("throttlePeriod"), 60.0);
|
||||
corruptFile = getOption(options, LiteralStringRef("corruptFile"), false);
|
||||
workersToCorrupt = getOption(options, LiteralStringRef("workersToCorrupt"), 1);
|
||||
percentBitFlips = getOption(options, LiteralStringRef("percentBitFlips"), 10.0);
|
||||
periodicBroadcastInterval = getOption(options, LiteralStringRef("periodicBroadcastInterval"), 5.0);
|
||||
startDelay = getOption(options, LiteralStringRef("startDelay"), startDelay);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), testDuration);
|
||||
verificationMode = getOption(options, LiteralStringRef("verificationMode"), verificationMode);
|
||||
throttleDisk = getOption(options, LiteralStringRef("throttleDisk"), throttleDisk);
|
||||
workersToThrottle = getOption(options, LiteralStringRef("workersToThrottle"), workersToThrottle);
|
||||
stallInterval = getOption(options, LiteralStringRef("stallInterval"), stallInterval);
|
||||
stallPeriod = getOption(options, LiteralStringRef("stallPeriod"), stallPeriod);
|
||||
throttlePeriod = getOption(options, LiteralStringRef("throttlePeriod"), throttlePeriod);
|
||||
corruptFile = getOption(options, LiteralStringRef("corruptFile"), corruptFile);
|
||||
workersToCorrupt = getOption(options, LiteralStringRef("workersToCorrupt"), workersToCorrupt);
|
||||
percentBitFlips = getOption(options, LiteralStringRef("percentBitFlips"), percentBitFlips);
|
||||
periodicBroadcastInterval =
|
||||
getOption(options, LiteralStringRef("periodicBroadcastInterval"), periodicBroadcastInterval);
|
||||
}
|
||||
|
||||
// Called when this workload was selected for failure injection: only the client whose
// clientId is 0 is enabled.
void initFailureInjectionMode(DeterministicRandom& random, unsigned count) override {
    enabled = (clientId == 0);
}
|
||||
|
||||
std::string description() const override {
|
||||
if (g_simulator == g_network)
|
||||
return "DiskFailureInjection";
|
||||
|
@ -275,3 +280,4 @@ struct DiskFailureInjectionWorkload : TestWorkload {
|
|||
}
|
||||
};
|
||||
WorkloadFactory<DiskFailureInjectionWorkload> DiskFailureInjectionWorkloadFactory("DiskFailureInjection");
|
||||
FailureInjectorFactory<DiskFailureInjectionWorkload> DiskFailureInjectionWorkloadFailureInjectionFactory;
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbclient/ManagementAPI.actor.h"
|
||||
#include "flow/FaultInjection.h"
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
static std::set<int> const& normalAttritionErrors() {
|
||||
|
@ -60,50 +61,99 @@ ACTOR Future<bool> ignoreSSFailuresForDuration(Database cx, double duration) {
|
|||
}
|
||||
}
|
||||
|
||||
struct MachineAttritionWorkload : TestWorkload {
|
||||
struct MachineAttritionWorkload : FailureInjectionWorkload {
|
||||
bool enabled;
|
||||
int machinesToKill, machinesToLeave, workersToKill, workersToLeave;
|
||||
double testDuration, suspendDuration, liveDuration;
|
||||
bool reboot;
|
||||
bool killDc;
|
||||
bool killMachine;
|
||||
bool killDatahall;
|
||||
bool killProcess;
|
||||
bool killZone;
|
||||
bool killSelf;
|
||||
int machinesToKill = 2, machinesToLeave = 1, workersToKill = 2, workersToLeave = 1;
|
||||
double testDuration = 10.0, suspendDuration = 1.0, liveDuration = 5.0;
|
||||
bool iterate = false;
|
||||
bool reboot = false;
|
||||
bool killDc = false;
|
||||
bool killMachine = false;
|
||||
bool killDatahall = false;
|
||||
bool killProcess = false;
|
||||
bool killZone = false;
|
||||
bool killSelf = false;
|
||||
std::vector<std::string> targetIds;
|
||||
bool replacement;
|
||||
bool waitForVersion;
|
||||
bool allowFaultInjection;
|
||||
Future<bool> ignoreSSFailures;
|
||||
bool replacement = false;
|
||||
bool waitForVersion = false;
|
||||
bool allowFaultInjection = true;
|
||||
Future<bool> ignoreSSFailures = true;
|
||||
double maxRunDuration = 60.0, backoff = 1.5;
|
||||
|
||||
// This is set in setup from the list of workers when the cluster is started
|
||||
std::vector<LocalityData> machines;
|
||||
|
||||
MachineAttritionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
// Constructor that takes NoOptions instead of reading the workload's options. Starts from
// the default member initializers and then switches on `iterate` and uses a suspend
// duration of 10. As in the option-reading constructor, the workload is only enabled on
// the first client, in simulation, with fault injection activated.
MachineAttritionWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {
    iterate = true;
    suspendDuration = 10.0;
    enabled = !clientId && g_network->isSimulated() && faultInjectionActivated;
}
|
||||
|
||||
MachineAttritionWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
|
||||
// only do this on the "first" client, and only when in simulation and only when fault injection is enabled
|
||||
enabled = !clientId && g_network->isSimulated() && faultInjectionActivated;
|
||||
machinesToKill = getOption(options, LiteralStringRef("machinesToKill"), 2);
|
||||
machinesToLeave = getOption(options, LiteralStringRef("machinesToLeave"), 1);
|
||||
workersToKill = getOption(options, LiteralStringRef("workersToKill"), 2);
|
||||
workersToLeave = getOption(options, LiteralStringRef("workersToLeave"), 1);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
|
||||
suspendDuration = getOption(options, LiteralStringRef("suspendDuration"), 1.0);
|
||||
liveDuration = getOption(options, LiteralStringRef("liveDuration"), 5.0);
|
||||
reboot = getOption(options, LiteralStringRef("reboot"), false);
|
||||
machinesToKill = getOption(options, LiteralStringRef("machinesToKill"), machinesToKill);
|
||||
machinesToLeave = getOption(options, LiteralStringRef("machinesToLeave"), machinesToLeave);
|
||||
workersToKill = getOption(options, LiteralStringRef("workersToKill"), workersToKill);
|
||||
workersToLeave = getOption(options, LiteralStringRef("workersToLeave"), workersToLeave);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), testDuration);
|
||||
suspendDuration = getOption(options, LiteralStringRef("suspendDuration"), suspendDuration);
|
||||
liveDuration = getOption(options, LiteralStringRef("liveDuration"), liveDuration);
|
||||
reboot = getOption(options, LiteralStringRef("reboot"), reboot);
|
||||
killDc = getOption(
|
||||
options, LiteralStringRef("killDc"), g_network->isSimulated() && deterministicRandom()->random01() < 0.25);
|
||||
killMachine = getOption(options, LiteralStringRef("killMachine"), false);
|
||||
killDatahall = getOption(options, LiteralStringRef("killDatahall"), false);
|
||||
killProcess = getOption(options, LiteralStringRef("killProcess"), false);
|
||||
killZone = getOption(options, LiteralStringRef("killZone"), false);
|
||||
killSelf = getOption(options, LiteralStringRef("killSelf"), false);
|
||||
killMachine = getOption(options, LiteralStringRef("killMachine"), killMachine);
|
||||
killDatahall = getOption(options, LiteralStringRef("killDatahall"), killDatahall);
|
||||
killProcess = getOption(options, LiteralStringRef("killProcess"), killProcess);
|
||||
killZone = getOption(options, LiteralStringRef("killZone"), killZone);
|
||||
killSelf = getOption(options, LiteralStringRef("killSelf"), killSelf);
|
||||
targetIds = getOption(options, LiteralStringRef("targetIds"), std::vector<std::string>());
|
||||
replacement =
|
||||
getOption(options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5);
|
||||
waitForVersion = getOption(options, LiteralStringRef("waitForVersion"), false);
|
||||
allowFaultInjection = getOption(options, LiteralStringRef("allowFaultInjection"), true);
|
||||
ignoreSSFailures = true;
|
||||
waitForVersion = getOption(options, LiteralStringRef("waitForVersion"), waitForVersion);
|
||||
allowFaultInjection = getOption(options, LiteralStringRef("allowFaultInjection"), allowFaultInjection);
|
||||
}
|
||||
|
||||
// Decides whether this attrition workload joins `workload`.
//
// With n workloads of the same description already present (regular or
// failure-injection), the workload is accepted with probability 1 / (2 + n), and only
// when the request uses a database. On acceptance the injection parameters are drawn via
// initializeForInjection(). The "AddingFailureInjection" trace event is logged regardless
// of the outcome.
bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload) override {
    auto desc = this->description();
    auto sameDescription = [&desc](auto const& w) { return w->description() == desc; };

    unsigned alreadyAdded = std::count_if(workload.workloads.begin(), workload.workloads.end(), sameDescription);
    alreadyAdded +=
        std::count_if(workload.failureInjection.begin(), workload.failureInjection.end(), sameDescription);

    // `random` is only consulted when the request uses a database.
    bool selected = false;
    if (work.useDatabase) {
        selected = random.random01() < 1.0 / (2.0 + alreadyAdded);
    }
    if (selected) {
        initializeForInjection(random);
    }

    TraceEvent("AddingFailureInjection")
        .detail("Reboot", reboot)
        .detail("Replacement", replacement)
        .detail("AllowFaultInjection", allowFaultInjection)
        .detail("KillDC", killDc)
        .detail("KillDataHall", killDatahall)
        .detail("KillZone", killZone);
    return selected;
}
|
||||
|
||||
// Draws the failure-injection parameters for this instance from `random`.
//
// The calls to random.random01() happen in a fixed order, and the short-circuit
// conditions decide whether a value is drawn at all, so statement order here determines
// the sequence of values consumed from `random`.
void initializeForInjection(DeterministicRandom& random) {
    reboot = random.random01() < 0.25;
    replacement = random.random01() < 0.25;
    allowFaultInjection = random.random01() < 0.5;
    suspendDuration = 10.0 * random.random01();
    if (g_network->isSimulated()) {
        // Collect the distinct datacenter / data hall / zone ids of all simulated processes.
        std::set<Optional<StringRef>> dataCenters;
        std::set<Optional<StringRef>> dataHalls;
        std::set<Optional<StringRef>> zones;
        for (auto process : g_simulator->getAllProcesses()) {
            dataCenters.emplace(process->locality.dcId().castTo<StringRef>());
            dataHalls.emplace(process->locality.dataHallId().castTo<StringRef>());
            zones.emplace(process->locality.zoneId().castTo<StringRef>());
        }
        // NOTE(review): the threshold was previously spelled
        // `dataHalls.size() < 0 ? 0.1 : 0.25`. size() is unsigned, so that comparison was
        // always false and 0.25 was always the value used. The constant is written out to
        // keep behavior identical and drop the tautological comparison. If `> 0` was the
        // intent, make that a separate, deliberate change.
        killDc = !dataCenters.empty() && random.random01() > 0.25;
        killDatahall = !dataHalls.empty() && killDc && random.random01() < 0.5;
        killZone = !zones.empty() && random.random01() < 0.2;
    }
}
|
||||
|
||||
static std::vector<ISimulator::ProcessInfo*> getServers() {
|
||||
|
@ -198,7 +248,8 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
}
|
||||
deterministicRandom()->randomShuffle(workers);
|
||||
wait(delay(self->liveDuration));
|
||||
// if a specific kill is requested, it must be accompanied by a set of target IDs otherwise no kills will occur
|
||||
// if a specific kill is requested, it must be accompanied by a set of target IDs otherwise no kills will
|
||||
// occur
|
||||
if (self->killDc) {
|
||||
TraceEvent("Assassination").detail("TargetDataCenterIds", describe(self->targetIds));
|
||||
sendRebootRequests(workers,
|
||||
|
@ -275,140 +326,150 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
ACTOR static Future<Void> machineKillWorker(MachineAttritionWorkload* self, double meanDelay, Database cx) {
|
||||
ASSERT(g_network->isSimulated());
|
||||
state double delayBeforeKill;
|
||||
state double suspendDuration = self->suspendDuration;
|
||||
state double startTime = now();
|
||||
|
||||
if (self->killDc) {
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
wait(delay(delayBeforeKill));
|
||||
|
||||
// decide on a machine to kill
|
||||
ASSERT(self->machines.size());
|
||||
Optional<Standalone<StringRef>> target = self->machines.back().dcId();
|
||||
|
||||
ISimulator::KillType kt = ISimulator::Reboot;
|
||||
if (!self->reboot) {
|
||||
int killType = deterministicRandom()->randomInt(0, 3); // FIXME: enable disk stalls
|
||||
if (killType == 0)
|
||||
kt = ISimulator::KillInstantly;
|
||||
else if (killType == 1)
|
||||
kt = ISimulator::InjectFaults;
|
||||
else if (killType == 2)
|
||||
kt = ISimulator::RebootAndDelete;
|
||||
else
|
||||
kt = ISimulator::FailDisk;
|
||||
}
|
||||
TraceEvent("Assassination")
|
||||
.detail("TargetDatacenter", target)
|
||||
.detail("Reboot", self->reboot)
|
||||
.detail("KillType", kt);
|
||||
|
||||
g_simulator->killDataCenter(target, kt);
|
||||
} else if (self->killDatahall) {
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
wait(delay(delayBeforeKill));
|
||||
|
||||
// It only makes sense to kill a single data hall.
|
||||
ASSERT(self->targetIds.size() == 1);
|
||||
auto target = self->targetIds.front();
|
||||
|
||||
auto kt = ISimulator::KillInstantly;
|
||||
TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt);
|
||||
|
||||
g_simulator->killDataHall(target, kt);
|
||||
} else {
|
||||
state int killedMachines = 0;
|
||||
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {
|
||||
TraceEvent("WorkerKillBegin")
|
||||
.detail("KilledMachines", killedMachines)
|
||||
.detail("MachinesToKill", self->machinesToKill)
|
||||
.detail("MachinesToLeave", self->machinesToLeave)
|
||||
.detail("Machines", self->machines.size());
|
||||
CODE_PROBE(true, "Killing a machine");
|
||||
|
||||
loop {
|
||||
if (self->killDc) {
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
wait(delay(delayBeforeKill));
|
||||
TraceEvent("WorkerKillAfterDelay").log();
|
||||
|
||||
if (self->waitForVersion) {
|
||||
state Transaction tr(cx);
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
wait(success(tr.getReadVersion()));
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decide on a machine to kill
|
||||
state LocalityData targetMachine = self->machines.back();
|
||||
if (BUGGIFY_WITH_PROB(0.01)) {
|
||||
CODE_PROBE(true, "Marked a zone for maintenance before killing it");
|
||||
wait(success(
|
||||
setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)));
|
||||
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
||||
CODE_PROBE(true, "Disable DD for all storage server failures");
|
||||
self->ignoreSSFailures =
|
||||
uncancellable(ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5));
|
||||
}
|
||||
ASSERT(self->machines.size());
|
||||
Optional<Standalone<StringRef>> target = self->machines.back().dcId();
|
||||
|
||||
ISimulator::KillType kt = ISimulator::Reboot;
|
||||
if (!self->reboot) {
|
||||
int killType = deterministicRandom()->randomInt(0, 3); // FIXME: enable disk stalls
|
||||
if (killType == 0)
|
||||
kt = ISimulator::KillInstantly;
|
||||
else if (killType == 1)
|
||||
kt = ISimulator::InjectFaults;
|
||||
else if (killType == 2)
|
||||
kt = ISimulator::RebootAndDelete;
|
||||
else
|
||||
kt = ISimulator::FailDisk;
|
||||
}
|
||||
TraceEvent("Assassination")
|
||||
.detail("TargetMachine", targetMachine.toString())
|
||||
.detail("ZoneId", targetMachine.zoneId())
|
||||
.detail("TargetDatacenter", target)
|
||||
.detail("Reboot", self->reboot)
|
||||
.detail("KilledMachines", killedMachines)
|
||||
.detail("MachinesToKill", self->machinesToKill)
|
||||
.detail("MachinesToLeave", self->machinesToLeave)
|
||||
.detail("Machines", self->machines.size())
|
||||
.detail("Replace", self->replacement);
|
||||
|
||||
if (self->reboot) {
|
||||
if (deterministicRandom()->random01() > 0.5) {
|
||||
g_simulator->rebootProcess(targetMachine.zoneId(), deterministicRandom()->random01() > 0.5);
|
||||
} else {
|
||||
g_simulator->killZone(targetMachine.zoneId(), ISimulator::Reboot);
|
||||
}
|
||||
} else {
|
||||
auto randomDouble = deterministicRandom()->random01();
|
||||
TraceEvent("WorkerKill")
|
||||
.detail("MachineCount", self->machines.size())
|
||||
.detail("RandomValue", randomDouble);
|
||||
if (randomDouble < 0.33) {
|
||||
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
|
||||
g_simulator->killZone(targetMachine.zoneId(), ISimulator::RebootAndDelete);
|
||||
} else {
|
||||
auto kt = ISimulator::KillInstantly;
|
||||
if (self->allowFaultInjection) {
|
||||
if (randomDouble < 0.50) {
|
||||
kt = ISimulator::InjectFaults;
|
||||
}
|
||||
// FIXME: enable disk stalls
|
||||
/*
|
||||
if( randomDouble < 0.56 ) {
|
||||
kt = ISimulator::InjectFaults;
|
||||
} else if( randomDouble < 0.66 ) {
|
||||
kt = ISimulator::FailDisk;
|
||||
}
|
||||
*/
|
||||
}
|
||||
g_simulator->killZone(targetMachine.zoneId(), kt);
|
||||
}
|
||||
}
|
||||
|
||||
killedMachines++;
|
||||
if (self->replacement) {
|
||||
// Replace by reshuffling, since we always pick from the back.
|
||||
deterministicRandom()->randomShuffle(self->machines);
|
||||
} else {
|
||||
self->machines.pop_back();
|
||||
}
|
||||
|
||||
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
|
||||
.detail("KillType", kt);
|
||||
|
||||
g_simulator->killDataCenter(target, kt);
|
||||
} else if (self->killDatahall) {
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
||||
wait(delay(delayBeforeKill));
|
||||
|
||||
// It only makes sense to kill a single data hall.
|
||||
ASSERT(self->targetIds.size() == 1);
|
||||
auto target = self->targetIds.front();
|
||||
|
||||
auto kt = ISimulator::KillInstantly;
|
||||
TraceEvent("Assassination").detail("TargetDataHall", target).detail("KillType", kt);
|
||||
|
||||
g_simulator->killDataHall(target, kt);
|
||||
} else {
|
||||
state int killedMachines = 0;
|
||||
while (killedMachines < self->machinesToKill && self->machines.size() > self->machinesToLeave) {
|
||||
TraceEvent("WorkerKillBegin")
|
||||
.detail("KilledMachines", killedMachines)
|
||||
.detail("MachinesToKill", self->machinesToKill)
|
||||
.detail("MachinesToLeave", self->machinesToLeave)
|
||||
.detail("Machines", self->machines.size());
|
||||
CODE_PROBE(true, "Killing a machine");
|
||||
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
wait(delay(delayBeforeKill));
|
||||
TraceEvent("WorkerKillAfterDelay").log();
|
||||
|
||||
if (self->waitForVersion) {
|
||||
state Transaction tr(cx);
|
||||
loop {
|
||||
try {
|
||||
tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
|
||||
tr.setOption(FDBTransactionOptions::LOCK_AWARE);
|
||||
wait(success(tr.getReadVersion()));
|
||||
break;
|
||||
} catch (Error& e) {
|
||||
wait(tr.onError(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// decide on a machine to kill
|
||||
state LocalityData targetMachine = self->machines.back();
|
||||
if (BUGGIFY_WITH_PROB(0.01)) {
|
||||
CODE_PROBE(true, "Marked a zone for maintenance before killing it");
|
||||
wait(success(
|
||||
setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)));
|
||||
} else if (BUGGIFY_WITH_PROB(0.005)) {
|
||||
CODE_PROBE(true, "Disable DD for all storage server failures");
|
||||
self->ignoreSSFailures =
|
||||
uncancellable(ignoreSSFailuresForDuration(cx, deterministicRandom()->random01() * 5));
|
||||
}
|
||||
|
||||
TraceEvent("Assassination")
|
||||
.detail("TargetMachine", targetMachine.toString())
|
||||
.detail("ZoneId", targetMachine.zoneId())
|
||||
.detail("Reboot", self->reboot)
|
||||
.detail("KilledMachines", killedMachines)
|
||||
.detail("MachinesToKill", self->machinesToKill)
|
||||
.detail("MachinesToLeave", self->machinesToLeave)
|
||||
.detail("Machines", self->machines.size())
|
||||
.detail("Replace", self->replacement);
|
||||
|
||||
if (self->reboot) {
|
||||
if (deterministicRandom()->random01() > 0.5) {
|
||||
g_simulator->rebootProcess(targetMachine.zoneId(), deterministicRandom()->random01() > 0.5);
|
||||
} else {
|
||||
g_simulator->killZone(targetMachine.zoneId(), ISimulator::Reboot);
|
||||
}
|
||||
} else {
|
||||
auto randomDouble = deterministicRandom()->random01();
|
||||
TraceEvent("WorkerKill")
|
||||
.detail("MachineCount", self->machines.size())
|
||||
.detail("RandomValue", randomDouble);
|
||||
if (randomDouble < 0.33) {
|
||||
TraceEvent("RebootAndDelete").detail("TargetMachine", targetMachine.toString());
|
||||
g_simulator->killZone(targetMachine.zoneId(), ISimulator::RebootAndDelete);
|
||||
} else {
|
||||
auto kt = ISimulator::KillInstantly;
|
||||
if (self->allowFaultInjection) {
|
||||
if (randomDouble < 0.50) {
|
||||
kt = ISimulator::InjectFaults;
|
||||
}
|
||||
// FIXME: enable disk stalls
|
||||
/*
|
||||
if( randomDouble < 0.56 ) {
|
||||
kt = ISimulator::InjectFaults;
|
||||
} else if( randomDouble < 0.66 ) {
|
||||
kt = ISimulator::FailDisk;
|
||||
}
|
||||
*/
|
||||
}
|
||||
g_simulator->killZone(targetMachine.zoneId(), kt);
|
||||
}
|
||||
}
|
||||
|
||||
killedMachines++;
|
||||
if (self->replacement) {
|
||||
// Replace by reshuffling, since we always pick from the back.
|
||||
deterministicRandom()->randomShuffle(self->machines);
|
||||
} else {
|
||||
self->machines.pop_back();
|
||||
}
|
||||
|
||||
wait(delay(meanDelay - delayBeforeKill) && success(self->ignoreSSFailures));
|
||||
|
||||
delayBeforeKill = deterministicRandom()->random01() * meanDelay;
|
||||
TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill);
|
||||
}
|
||||
}
|
||||
if (!self->iterate || now() - startTime > self->maxRunDuration) {
|
||||
break;
|
||||
} else {
|
||||
wait(delay(suspendDuration));
|
||||
suspendDuration *= self->backoff;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -419,3 +480,4 @@ struct MachineAttritionWorkload : TestWorkload {
|
|||
};
|
||||
|
||||
WorkloadFactory<MachineAttritionWorkload> MachineAttritionWorkloadFactory("Attrition");
|
||||
FailureInjectorFactory<MachineAttritionWorkload> MachineAttritionFailureWorkloadFactory;
|
||||
|
|
|
@ -18,24 +18,48 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbserver/TesterInterface.actor.h"
|
||||
#include "fdbserver/workloads/workloads.actor.h"
|
||||
#include "fdbrpc/simulator.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
struct RandomCloggingWorkload : TestWorkload {
|
||||
struct RandomCloggingWorkload : FailureInjectionWorkload {
|
||||
bool enabled;
|
||||
double testDuration;
|
||||
double scale, clogginess;
|
||||
int swizzleClog;
|
||||
double testDuration = 10.0;
|
||||
double scale = 1.0, clogginess = 1.0;
|
||||
int swizzleClog = 0;
|
||||
bool iterate = false;
|
||||
double maxRunDuration = 60.0, backoff = 1.5, suspend = 10.0;
|
||||
|
||||
RandomCloggingWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
RandomCloggingWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {}
|
||||
|
||||
RandomCloggingWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
|
||||
enabled = !clientId; // only do this on the "first" client
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
|
||||
scale = getOption(options, LiteralStringRef("scale"), 1.0);
|
||||
clogginess = getOption(options, LiteralStringRef("clogginess"), 1.0);
|
||||
swizzleClog = getOption(options, LiteralStringRef("swizzle"), 0);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), testDuration);
|
||||
scale = getOption(options, LiteralStringRef("scale"), scale);
|
||||
clogginess = getOption(options, LiteralStringRef("clogginess"), clogginess);
|
||||
swizzleClog = getOption(options, LiteralStringRef("swizzle"), swizzleClog);
|
||||
}
|
||||
|
||||
// Decides whether this clogging workload should be injected into `workload`, and if so
// randomizes its parameters.
//
// The chance of being added is 0.25 divided by (1 + the number of workloads with the same
// description already present in `workload.workloads` and `workload.failureInjection`).
// Nothing is drawn from `random` when `work.useDatabase` is false.
//
// Returns true when the workload was selected (and its parameters were initialized).
bool add(DeterministicRandom& random, WorkloadRequest const& work, CompoundWorkload const& workload) override {
	const auto ownDescription = description();
	const auto hasSameDescription = [&ownDescription](auto const& other) {
		return other->description() == ownDescription;
	};

	unsigned alreadyAdded =
	    std::count_if(workload.workloads.begin(), workload.workloads.end(), hasSameDescription);
	alreadyAdded +=
	    std::count_if(workload.failureInjection.begin(), workload.failureInjection.end(), hasSameDescription);

	if (!work.useDatabase) {
		return false;
	}
	const bool selected = 0.25 / (1 + alreadyAdded) > random.random01();
	if (!selected) {
		return false;
	}

	// Selected: only client 0 actually performs the clogging; the remaining parameters are
	// drawn in a fixed order so that a given seed always yields the same configuration.
	enabled = this->clientId == 0;
	scale = std::max(random.random01(), 0.1);
	clogginess = std::max(random.random01(), 0.1);
	swizzleClog = random.random01() < 0.3;
	iterate = random.random01() < 0.5;
	return true;
}
|
||||
|
||||
std::string description() const override {
|
||||
|
@ -46,17 +70,31 @@ struct RandomCloggingWorkload : TestWorkload {
|
|||
}
|
||||
Future<Void> setup(Database const& cx) override { return Void(); }
|
||||
Future<Void> start(Database const& cx) override {
|
||||
if (g_simulator == g_network && enabled)
|
||||
return timeout(
|
||||
reportErrors(swizzleClog ? swizzleClogClient(this) : clogClient(this), "RandomCloggingError"),
|
||||
testDuration,
|
||||
Void());
|
||||
else
|
||||
return Void();
|
||||
if (g_network->isSimulated() && enabled) {
|
||||
return _start(this);
|
||||
}
|
||||
return Void();
|
||||
}
|
||||
Future<bool> check(Database const& cx) override { return true; }
|
||||
void getMetrics(std::vector<PerfMetric>& m) override {}
|
||||
|
||||
// Drives the clogging client until `maxRunDuration` has elapsed.
//
// Each round runs either the swizzle or the plain clog client (chosen by `swizzleClog`),
// bounded by `testDuration`. When `iterate` is set and the overall deadline has not fired yet,
// the actor sleeps for `suspend` seconds, multiplies `suspend` by `backoff`, and starts another
// round; otherwise it finishes.
ACTOR static Future<Void> _start(RandomCloggingWorkload* self) {
	state Future<Void> deadline = delay(self->maxRunDuration);
	loop {
		wait(deadline ||
		     timeout(reportErrors(self->swizzleClog ? self->swizzleClogClient(self) : self->clogClient(self),
		                          "RandomCloggingError"),
		             self->testDuration,
		             Void()));
		if (deadline.isReady() || !self->iterate) {
			return Void();
		}
		// Back off before the next round of clogging.
		wait(delay(self->suspend));
		self->suspend *= self->backoff;
	}
}
|
||||
|
||||
ACTOR void doClog(ISimulator::ProcessInfo* machine, double t, double delay = 0.0) {
|
||||
wait(::delay(delay));
|
||||
g_simulator->clogInterface(machine->address.ip, t);
|
||||
|
@ -114,3 +152,4 @@ struct RandomCloggingWorkload : TestWorkload {
|
|||
};
|
||||
|
||||
WorkloadFactory<RandomCloggingWorkload> RandomCloggingWorkloadFactory("RandomClogging");
|
||||
FailureInjectorFactory<RandomCloggingWorkload> RandomCloggingFailureInjectionFactory;
|
||||
|
|
|
@ -29,17 +29,21 @@
|
|||
#include "fdbserver/QuietDatabase.h"
|
||||
#include "flow/actorcompiler.h" // This must be the last #include.
|
||||
|
||||
struct MoveKeysWorkload : TestWorkload {
|
||||
struct MoveKeysWorkload : FailureInjectionWorkload {
|
||||
bool enabled;
|
||||
double testDuration, meanDelay;
|
||||
double maxKeyspace;
|
||||
double testDuration = 10.0, meanDelay = 0.05;
|
||||
double maxKeyspace = 0.1;
|
||||
DatabaseConfiguration configuration;
|
||||
|
||||
MoveKeysWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
MoveKeysWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {
|
||||
enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client
|
||||
meanDelay = getOption(options, LiteralStringRef("meanDelay"), 0.05);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
|
||||
maxKeyspace = getOption(options, LiteralStringRef("maxKeyspace"), 0.1);
|
||||
}
|
||||
|
||||
MoveKeysWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
|
||||
enabled = !clientId && g_network->isSimulated(); // only do this on the "first" client
|
||||
meanDelay = getOption(options, LiteralStringRef("meanDelay"), meanDelay);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), testDuration);
|
||||
maxKeyspace = getOption(options, LiteralStringRef("maxKeyspace"), maxKeyspace);
|
||||
}
|
||||
|
||||
std::string description() const override { return "MoveKeysWorkload"; }
|
||||
|
@ -232,3 +236,4 @@ struct MoveKeysWorkload : TestWorkload {
|
|||
};
|
||||
|
||||
WorkloadFactory<MoveKeysWorkload> MoveKeysWorkloadFactory("RandomMoveKeys");
|
||||
FailureInjectorFactory<MoveKeysWorkload> MoveKeysFailureInjectionFactory;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "flow/DeterministicRandom.h"
|
||||
#include "fdbclient/NativeAPI.actor.h"
|
||||
#include "fdbserver/TesterInterface.actor.h"
|
||||
#include "fdbserver/workloads/workloads.actor.h"
|
||||
|
@ -31,17 +32,25 @@
|
|||
// The workload first clogs network link between the chosen proxy and all tLogs but the unclogTlog;
|
||||
// While the network is still clogged, the workload kills the proxy and clogs the unclogged tlog's interface.
|
||||
// Note: The clogged network link's latency will become "clogDuration".
|
||||
struct RollbackWorkload : TestWorkload {
|
||||
bool enableFailures, multiple, enabled;
|
||||
double meanDelay, clogDuration, testDuration;
|
||||
struct RollbackWorkload : FailureInjectionWorkload {
|
||||
// Workload parameters and their defaults. The options constructor passes these defaults to
// getOption(), so they are the values used when a test file does not set the option.
// `enabled` defaults to false because the NoOptions constructor has an empty body and would
// otherwise leave it uninitialized until initFailureInjectionMode() runs.
bool enableFailures = false, multiple = true, enabled = false;
// `meanDelay` only matters when `multiple` is true. `clogDuration` was previously written as
// `clogDuration = clogDuration = 3.0`, a duplicated-token typo that assigned the member inside
// its own initializer.
double meanDelay = 20.0, clogDuration = 3.0, testDuration = 10.0;
|
||||
|
||||
RollbackWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
|
||||
RollbackWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {}
|
||||
|
||||
RollbackWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
|
||||
enabled = !clientId; // only do this on the "first" client
|
||||
meanDelay = getOption(options, LiteralStringRef("meanDelay"), 20.0); // Only matters if multiple==true
|
||||
clogDuration = getOption(options, LiteralStringRef("clogDuration"), 3.0);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), 10.0);
|
||||
enableFailures = getOption(options, LiteralStringRef("enableFailures"), false);
|
||||
multiple = getOption(options, LiteralStringRef("multiple"), true);
|
||||
meanDelay = getOption(options, LiteralStringRef("meanDelay"), meanDelay); // Only matters if multiple==true
|
||||
clogDuration = getOption(options, LiteralStringRef("clogDuration"), clogDuration);
|
||||
testDuration = getOption(options, LiteralStringRef("testDuration"), testDuration);
|
||||
enableFailures = getOption(options, LiteralStringRef("enableFailures"), enableFailures);
|
||||
multiple = getOption(options, LiteralStringRef("multiple"), multiple);
|
||||
}
|
||||
|
||||
// Configures this workload for use as an injected failure.
//
// Only client 0 is enabled. `multiple` is decided by a coin flip and `enableFailures` is set
// with probability 0.2; both are drawn from `random`, in that order. `count` is not used by
// this override.
void initFailureInjectionMode(DeterministicRandom& random, unsigned count) override {
	enabled = (clientId == 0);

	const bool runMultiple = random.coinflip();
	multiple = runMultiple;

	const bool withFailures = random.random01() < 0.2;
	enableFailures = withFailures;
}
|
||||
|
||||
std::string description() const override { return "RollbackWorkload"; }
|
||||
|
@ -122,3 +131,4 @@ struct RollbackWorkload : TestWorkload {
|
|||
};
|
||||
|
||||
WorkloadFactory<RollbackWorkload> RollbackWorkloadFactory("Rollback");
|
||||
FailureInjectorFactory<RollbackWorkload> RollbackFailureInjectorFactory;
|
||||
|
|
|
@ -240,6 +240,7 @@ void FlowKnobs::initialize(Randomize randomize, IsSimulated isSimulated) {
|
|||
init( SIM_SPEEDUP_AFTER_SECONDS, 450 );
|
||||
init( MAX_TRACE_LINES, 1'000'000 );
|
||||
init( CODE_COV_TRACE_EVENT_SEVERITY, 10 ); // Code coverage TraceEvent severity level
|
||||
init( ENABLE_SIMULATION_IMPROVEMENTS, false ); // Separate normal workloads and failure injection
|
||||
|
||||
//TDMetrics
|
||||
init( MAX_METRICS, 600 );
|
||||
|
|
|
@ -290,6 +290,7 @@ public:
|
|||
int SIM_CONNECT_ERROR_MODE;
|
||||
double SIM_SPEEDUP_AFTER_SECONDS;
|
||||
int MAX_TRACE_LINES;
|
||||
bool ENABLE_SIMULATION_IMPROVEMENTS;
|
||||
|
||||
// Tracefiles
|
||||
int ZERO_LENGTH_FILE_PAD;
|
||||
|
|
Loading…
Reference in New Issue