From d1c80659b5b2852fe6b3e26773f2622c48972c62 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Wed, 19 Oct 2022 16:18:00 -0600 Subject: [PATCH] Remember disk corruptions and downgrade trace severity if a corruption was injected --- fdbclient/ManagementAPI.actor.cpp | 11 ++++- fdbrpc/include/fdbrpc/AsyncFileChaos.h | 42 ++++++++++++------- fdbrpc/include/fdbrpc/simulator.h | 4 ++ fdbserver/KeyValueStoreSQLite.actor.cpp | 11 ++++- .../workloads/MachineAttrition.actor.cpp | 4 ++ 5 files changed, 54 insertions(+), 18 deletions(-) diff --git a/fdbclient/ManagementAPI.actor.cpp b/fdbclient/ManagementAPI.actor.cpp index 665fbd9274..2b379d3be5 100644 --- a/fdbclient/ManagementAPI.actor.cpp +++ b/fdbclient/ManagementAPI.actor.cpp @@ -1010,7 +1010,9 @@ ACTOR Future> changeQuorumChecker(Transaction* tr, choose { when(wait(waitForAll(leaderServers))) {} - when(wait(delay(5.0))) { return CoordinatorsResult::COORDINATOR_UNREACHABLE; } + when(wait(delay(5.0))) { + return CoordinatorsResult::COORDINATOR_UNREACHABLE; + } } TraceEvent("ChangeQuorumCheckerSetCoordinatorsKey") .detail("CurrentCoordinators", old.toString()) @@ -1112,7 +1114,9 @@ ACTOR Future changeQuorum(Database cx, Reference lockDatabase(Reference tr, UID id) ACTOR Future lockDatabase(Database cx, UID id) { state Transaction tr(cx); + UID debugID = deterministicRandom()->randomUniqueID(); + TraceEvent("LockDatabaseTransaction", debugID).log(); + tr.debugTransaction(debugID); loop { try { wait(lockDatabase(&tr, id)); diff --git a/fdbrpc/include/fdbrpc/AsyncFileChaos.h b/fdbrpc/include/fdbrpc/AsyncFileChaos.h index 165b297bbf..27c0c1281d 100644 --- a/fdbrpc/include/fdbrpc/AsyncFileChaos.h +++ b/fdbrpc/include/fdbrpc/AsyncFileChaos.h @@ -25,15 +25,18 @@ #include "flow/IAsyncFile.h" #include "flow/network.h" #include "flow/ActorCollection.h" +#include "fdbrpc/simulator.h" // template class AsyncFileChaos final : public IAsyncFile, public ReferenceCounted { private: Reference file; + // since we have to read this often, 
we cache the filename here + std::string filename; bool enabled; public: - explicit AsyncFileChaos(Reference file) : file(file) { + explicit AsyncFileChaos(Reference file) : file(file), filename(file->getFilename()) { // We only allow chaos events on storage files enabled = (file->getFilename().find("storage-") != std::string::npos); } @@ -78,6 +81,7 @@ public: Future write(void const* data, int length, int64_t offset) override { Arena arena; char* pdata = nullptr; + unsigned corruptedBlock = 0; // Check if a bit flip event was injected, if so, copy the buffer contents // with a random bit flipped in a new buffer and use that for the write @@ -90,7 +94,10 @@ public: pdata = (char*)arena.allocate4kAlignedBuffer(length); memcpy(pdata, data, length); // flip a random bit in the copied buffer - pdata[deterministicRandom()->randomInt(0, length)] ^= (1 << deterministicRandom()->randomInt(0, 8)); + auto corruptedPos = deterministicRandom()->randomInt(0, length); + pdata[corruptedPos] ^= (1 << deterministicRandom()->randomInt(0, 8)); + // mark the 4KiB block containing the flipped bit as corrupted (file position / block size) + corruptedBlock = (offset + corruptedPos) / (4 * 1024); // increment the metric for bit flips auto res = g_network->global(INetwork::enChaosMetrics); @@ -102,20 +109,27 @@ public: } } - double diskDelay = getDelay(); - if (diskDelay == 0.0) { - if (pdata) - return holdWhile(arena, file->write(pdata, length, offset)); - - return file->write(data, length, offset); - } - // Wait for diskDelay before submitting the I/O - return mapAsync(Void)>, Void>(delay(diskDelay), [=](Void _) -> Future { - if (pdata) - return holdWhile(arena, file->write(pdata, length, offset)); + return mapAsync(Void)>, Void>(delay(getDelay()), [=](Void _) -> Future { + if (pdata) { + // if (g_network->isSimulated()) + return map(holdWhile(arena, file->write(pdata, length, offset)), [corruptedBlock, this](auto res) { + if (g_network->isSimulated()) { + g_simulator->corruptedBlocks.template emplace(filename, corruptedBlock); + } + return res;
}); + } - return file->write(data, length, offset); + return map(file->write(data, length, offset), [this, pdata, offset, length](auto res) { + if (pdata != nullptr || !g_network->isSimulated()) { + return res; + } + g_simulator->corruptedBlocks.erase( + g_simulator->corruptedBlocks.lower_bound(std::make_pair(filename, offset / 4096)), + g_simulator->corruptedBlocks.upper_bound(std::make_pair(filename, (offset + length) / 4096))); + return res; + }); }); } diff --git a/fdbrpc/include/fdbrpc/simulator.h b/fdbrpc/include/fdbrpc/simulator.h index 3eeb405785..8c4dd55f08 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -26,6 +26,8 @@ #include #include +#include + #include "flow/flow.h" #include "flow/Histogram.h" #include "flow/ProtocolVersion.h" @@ -508,6 +510,8 @@ public: std::unordered_map, PrivateKey> authKeys; + std::set> corruptedBlocks; + flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); }; void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); }; diff --git a/fdbserver/KeyValueStoreSQLite.actor.cpp b/fdbserver/KeyValueStoreSQLite.actor.cpp index 634beb190c..5eee7f39fa 100644 --- a/fdbserver/KeyValueStoreSQLite.actor.cpp +++ b/fdbserver/KeyValueStoreSQLite.actor.cpp @@ -149,6 +149,13 @@ struct PageChecksumCodec { } if (!silent) { + auto severity = SevError; + if (g_network->isSimulated()) { + if (g_simulator->corruptedBlocks.count(std::make_pair(filename, pageNumber - 1))) { + // this corruption was caused by failure injection, so trace it at the downgraded severity + severity = SevWarnAlways; + } + } - TraceEvent trEvent(SevError, "SQLitePageChecksumFailure"); + TraceEvent trEvent(severity, "SQLitePageChecksumFailure"); trEvent.error(checksum_failed()) .detail("CodecPageSize", pageSize) @@ -706,7 +713,7 @@ struct IntKeyCursor { db.checkError("BtreeCloseCursor", sqlite3BtreeCloseCursor(cursor)); } catch (...) { } - delete[](char*) cursor; + delete[] (char*)cursor; } } }; @@ -744,7 +751,7 @@ struct RawCursor { } catch (...)
{ TraceEvent(SevError, "RawCursorDestructionError").log(); } - delete[](char*) cursor; + delete[] (char*)cursor; } } void moveFirst() { diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 34d9205ded..62f6954629 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -116,6 +116,10 @@ struct MachineAttritionWorkload : FailureInjectionWorkload { bool shouldInject(DeterministicRandom& random, const WorkloadRequest& work, const unsigned alreadyAdded) const override { + if (g_network->isSimulated() && !g_simulator->extraDatabases.empty()) { + // Remove this as soon as we track extra databases properly + return false; + } return work.useDatabase && random.random01() < 1.0 / (2.0 + alreadyAdded); }