From e04646e267279257aa480a0c7cc31e17c49d4880 Mon Sep 17 00:00:00 2001 From: Sajjad Rahnama Date: Fri, 23 Jul 2021 16:28:20 -0700 Subject: [PATCH 1/3] Fault Injection Active/Deactivation --- fdbrpc/sim2.actor.cpp | 3 ++- fdbserver/SimulatedCluster.actor.cpp | 3 ++- fdbserver/fdbserver.actor.cpp | 21 +++++++++++++++++-- .../workloads/MachineAttrition.actor.cpp | 5 +++-- flow/FaultInjection.cpp | 7 ++++++- flow/FaultInjection.h | 2 ++ 6 files changed, 34 insertions(+), 7 deletions(-) diff --git a/fdbrpc/sim2.actor.cpp b/fdbrpc/sim2.actor.cpp index fe7ded16e5..33da8e7ed6 100644 --- a/fdbrpc/sim2.actor.cpp +++ b/fdbrpc/sim2.actor.cpp @@ -46,10 +46,11 @@ #include "fdbrpc/Replication.h" #include "fdbrpc/ReplicationUtils.h" #include "fdbrpc/AsyncFileWriteChecker.h" +#include "flow/FaultInjection.h" #include "flow/actorcompiler.h" // This must be the last #include. bool simulator_should_inject_fault(const char* context, const char* file, int line, int error_code) { - if (!g_network->isSimulated()) + if (!g_network->isSimulated() || !faultInjectionActivated) return false; auto p = g_simulator.getCurrentProcess(); diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 5f656f13f1..56264fa61c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -41,6 +41,7 @@ #include "flow/ProtocolVersion.h" #include "flow/network.h" #include "flow/TypeTraits.h" +#include "flow/FaultInjection.h" #include "flow/actorcompiler.h" // This must be the last #include. #undef max @@ -1651,7 +1652,7 @@ void SimulationConfig::setTss(const TestConfig& testConfig) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); set_config(confStr); double tssRandom = deterministicRandom()->random01(); - if (tssRandom > 0.5) { + if (tssRandom > 0.5 || !faultInjectionActivated) { // normal tss mode g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; } else if (tssRandom < 0.25 && !testConfig.isFirstTestInRestart) { diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index a9d6697bc8..bc4b8a951e 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -68,6 +68,7 @@ #include "flow/TLSConfig.actor.h" #include "flow/Tracing.h" #include "flow/UnitTest.h" +#include "flow/FaultInjection.h" #if defined(__linux__) || defined(__FreeBSD__) #include @@ -92,7 +93,7 @@ enum { OPT_DCID, OPT_MACHINE_CLASS, OPT_BUGGIFY, OPT_VERSION, OPT_BUILD_FLAGS, OPT_CRASHONERROR, OPT_HELP, OPT_NETWORKIMPL, OPT_NOBUFSTDOUT, OPT_BUFSTDOUTERR, OPT_TRACECLOCK, OPT_NUMTESTERS, OPT_DEVHELP, OPT_ROLLSIZE, OPT_MAXLOGS, OPT_MAXLOGSSIZE, OPT_KNOB, OPT_UNITTESTPARAM, OPT_TESTSERVERS, OPT_TEST_ON_SERVERS, OPT_METRICSCONNFILE, OPT_METRICSPREFIX, OPT_LOGGROUP, OPT_LOCALITY, OPT_IO_TRUST_SECONDS, OPT_IO_TRUST_WARN_ONLY, OPT_FILESYSTEM, OPT_PROFILER_RSS_SIZE, OPT_KVFILE, - OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, + OPT_TRACE_FORMAT, OPT_WHITELIST_BINPATH, OPT_BLOB_CREDENTIAL_FILE, OPT_CONFIG_PATH, OPT_USE_TEST_CONFIG_DB, OPT_FAULT_INJECTION, }; CSimpleOpt::SOption g_rgOptions[] = { @@ -177,6 +178,8 @@ CSimpleOpt::SOption g_rgOptions[] = { { OPT_BLOB_CREDENTIAL_FILE, "--blob_credential_file", SO_REQ_SEP }, { OPT_CONFIG_PATH, "--config_path", SO_REQ_SEP }, { OPT_USE_TEST_CONFIG_DB, "--use_test_config_db", SO_NONE }, + { OPT_FAULT_INJECTION, "-fi", SO_REQ_SEP }, + { OPT_FAULT_INJECTION, "--fault_injection", SO_REQ_SEP }, #ifndef TLS_DISABLED TLS_OPTION_FLAGS @@ -646,6 +649,7 @@ static void printUsage(const char* name, bool devhelp) { "--kvfile FILE", "Input file (SQLite database file) for use by the 'kvfilegeneratesums' and 'kvfileintegritycheck' roles."); printOptionUsage("-b [on,off], --buggify [on,off]", " Sets Buggify system state, defaults to `off'."); + printOptionUsage("-f [on,off], --fault_injection [on,off]", " Sets fault injection, defaults to `on'."); printOptionUsage("--crash", "Crash on serious errors instead of continuing."); printOptionUsage("-N NETWORKIMPL, --network NETWORKIMPL", " Select network implementation, `net2' (default)," @@ -960,7 +964,7 @@ struct CLIOptions { 8LL << 30; // Nice to maintain the same default value for memLimit and SERVER_KNOBS->SERVER_MEM_LIMIT and // SERVER_KNOBS->COMMIT_BATCHES_MEM_BYTES_HARD_LIMIT uint64_t storageMemLimit = 1LL << 30; - bool buggifyEnabled = false, restarting = false; + bool buggifyEnabled = false, faultInjectionEnabled = true, restarting = false; Optional> zoneId; Optional> dcId; ProcessClass processClass = ProcessClass(ProcessClass::UnsetClass, ProcessClass::CommandLineSource); @@ -1382,6 +1386,17 @@ private: flushAndExit(FDB_EXIT_ERROR); } break; + case OPT_FAULT_INJECTION: + if (!strcmp(args.OptionArg(), "on")) + faultInjectionEnabled = true; + else if (!strcmp(args.OptionArg(), "off")) + faultInjectionEnabled = false; + else { + fprintf(stderr, "ERROR: Unknown fault injection state `%s'\n", args.OptionArg()); + printHelpTeaser(argv[0]); + flushAndExit(FDB_EXIT_ERROR); + } + break; case OPT_CRASHONERROR: g_crashOnError = true; break; @@ -1638,6 +1653,7 @@ int main(int argc, char* argv[]) { setThreadLocalDeterministicRandomSeed(opts.randomSeed); enableBuggify(opts.buggifyEnabled, BuggifyType::General); + enableFaultInjection(opts.faultInjectionEnabled); IKnobCollection::setGlobalKnobCollection(IKnobCollection::Type::SERVER, Randomize::True, @@ -1795,6 +1811,7 @@ int main(int argc, char* argv[]) { .detail("CommandLine", opts.commandLine) .setMaxFieldLength(0) .detail("BuggifyEnabled", opts.buggifyEnabled) + .detail("FaultInjectionEnabled", opts.faultInjectionEnabled) .detail("MemoryLimit", opts.memLimit) .trackLatest("ProgramStart"); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 75c2c248a9..e46c249c6d 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -25,6 +25,7 @@ #include "fdbserver/workloads/workloads.actor.h" #include "fdbrpc/simulator.h" #include "fdbclient/ManagementAPI.actor.h" +#include "flow/FaultInjection.h" #include "flow/actorcompiler.h" // This must be the last #include. static std::set const& normalAttritionErrors() { @@ -78,8 +79,8 @@ struct MachineAttritionWorkload : TestWorkload { std::vector machines; MachineAttritionWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { - enabled = - !clientId && g_network->isSimulated(); // only do this on the "first" client, and only when in simulation + // only do this on the "first" client, and only when in simulation and only when fault injection is enabled + enabled = !clientId && g_network->isSimulated() && faultInjectionActivated; machinesToKill = getOption(options, LiteralStringRef("machinesToKill"), 2); machinesToLeave = getOption(options, LiteralStringRef("machinesToLeave"), 1); workersToKill = getOption(options, LiteralStringRef("workersToKill"), 2); diff --git a/flow/FaultInjection.cpp b/flow/FaultInjection.cpp index 861de1307a..5ba346efc5 100644 --- a/flow/FaultInjection.cpp +++ b/flow/FaultInjection.cpp @@ -20,4 +20,9 @@ #include "flow/FaultInjection.h" -bool (*should_inject_fault)(const char* context, const char* file, int line, int error_code) = 0; \ No newline at end of file +bool (*should_inject_fault)(const char* context, const char* file, int line, int error_code) = 0; +bool faultInjectionActivated = true; + +void enableFaultInjection(bool enabled) { + faultInjectionActivated = enabled; +} diff --git a/flow/FaultInjection.h b/flow/FaultInjection.h index e1f2aa0bb6..fa8f521076 100644 --- a/flow/FaultInjection.h +++ b/flow/FaultInjection.h @@ -32,6 +32,8 @@ #define SHOULD_INJECT_FAULT(context) (should_inject_fault && should_inject_fault(context, __FILE__, __LINE__, 0)) extern bool (*should_inject_fault)(const char* context, const char* file, int line, int error_code); +extern bool faultInjectionActivated; +extern void enableFaultInjection(bool enabled); // Enable fault injection called from fdbserver actor main function #else #define INJECT_FAULT(error_type, context) #endif From d36b5d62dfb0940754754ffbadb926ee12c4c563 Mon Sep 17 00:00:00 2001 From: Sajjad Rahnama Date: Mon, 26 Jul 2021 10:41:17 -0700 Subject: [PATCH 2/3] Fault Injection Active/Deactivation - Edit TSS mode set_config --- fdbserver/SimulatedCluster.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 56264fa61c..f9916cf1bd 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1648,11 +1648,11 @@ void SimulationConfig::setTss(const TestConfig& testConfig) { tssCount = std::max(0, std::min(tssCount, (db.usableRegions * (machine_count / datacenters) - replication_type) / 2)); - if (!testConfig.config.present() && tssCount > 0) { + if (!testConfig.config.present() && tssCount > 0 && faultInjectionActivated) { std::string confStr = format("tss_count:=%d tss_storage_engine:=%d", tssCount, db.storageServerStoreType); set_config(confStr); double tssRandom = deterministicRandom()->random01(); - if (tssRandom > 0.5 || !faultInjectionActivated) { + if (tssRandom > 0.5) { // normal tss mode g_simulator.tssMode = ISimulator::TSSMode::EnabledNormal; } else if (tssRandom < 0.25 && !testConfig.isFirstTestInRestart) { From 9d6402b7595188cd13b3e723cbb910022b349ac9 Mon Sep 17 00:00:00 2001 From: Sajjad Rahnama Date: Mon, 26 Jul 2021 11:04:18 -0700 Subject: [PATCH 3/3] Fault Injection Active/Deactivation - Edit usage in fdbserver --- fdbserver/fdbserver.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index bc4b8a951e..ba75fe296c 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -649,7 +649,7 @@ static void printUsage(const char* name, bool devhelp) { "--kvfile FILE", "Input file (SQLite database file) for use by the 'kvfilegeneratesums' and 'kvfileintegritycheck' roles."); printOptionUsage("-b [on,off], --buggify [on,off]", " Sets Buggify system state, defaults to `off'."); - printOptionUsage("-f [on,off], --fault_injection [on,off]", " Sets fault injection, defaults to `on'."); + printOptionUsage("-fi [on,off], --fault_injection [on,off]", " Sets fault injection, defaults to `on'."); printOptionUsage("--crash", "Crash on serious errors instead of continuing."); printOptionUsage("-N NETWORKIMPL, --network NETWORKIMPL", " Select network implementation, `net2' (default),"