430 lines
14 KiB
C++
430 lines
14 KiB
C++
/*
|
|
* BlobGranuleValidation.actor.cpp
|
|
*
|
|
* This source file is part of the FoundationDB open source project
|
|
*
|
|
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "fdbserver/BlobGranuleValidation.actor.h"
|
|
#include "fdbserver/Knobs.h"
|
|
#include "flow/actorcompiler.h" // has to be last include
|
|
|
|
ACTOR Future<std::pair<RangeResult, Version>> readFromFDB(Database cx, KeyRange range) {
|
|
state bool first = true;
|
|
state Version v;
|
|
state RangeResult out;
|
|
state Transaction tr(cx);
|
|
state KeyRange currentRange = range;
|
|
loop {
|
|
tr.setOption(FDBTransactionOptions::RAW_ACCESS);
|
|
try {
|
|
state RangeResult r = wait(tr.getRange(currentRange, CLIENT_KNOBS->TOO_MANY));
|
|
Version grv = wait(tr.getReadVersion());
|
|
// need consistent version snapshot of range
|
|
if (first) {
|
|
v = grv;
|
|
first = false;
|
|
} else if (v != grv) {
|
|
// reset the range and restart the read at a higher version
|
|
first = true;
|
|
out = RangeResult();
|
|
currentRange = range;
|
|
tr.reset();
|
|
continue;
|
|
}
|
|
out.arena().dependsOn(r.arena());
|
|
out.append(out.arena(), r.begin(), r.size());
|
|
if (r.more) {
|
|
currentRange = KeyRangeRef(keyAfter(r.back().key), currentRange.end);
|
|
} else {
|
|
break;
|
|
}
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
return std::pair(out, v);
|
|
}
|
|
|
|
// FIXME: typedef this pair type and/or chunk list
|
|
ACTOR Future<std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>> readFromBlob(
|
|
Database cx,
|
|
Reference<BlobConnectionProvider> bstore,
|
|
KeyRange range,
|
|
Version beginVersion,
|
|
Version readVersion,
|
|
Optional<TenantName> tenantName) {
|
|
state RangeResult out;
|
|
state Standalone<VectorRef<BlobGranuleChunkRef>> chunks;
|
|
state Transaction tr(cx, tenantName);
|
|
|
|
loop {
|
|
try {
|
|
Standalone<VectorRef<BlobGranuleChunkRef>> chunks_ =
|
|
wait(tr.readBlobGranules(range, beginVersion, readVersion));
|
|
chunks = chunks_;
|
|
break;
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
|
|
for (const BlobGranuleChunkRef& chunk : chunks) {
|
|
ASSERT(chunk.tenantPrefix.present() == tenantName.present());
|
|
RangeResult chunkRows = wait(readBlobGranule(chunk, range, beginVersion, readVersion, bstore));
|
|
out.arena().dependsOn(chunkRows.arena());
|
|
out.append(out.arena(), chunkRows.begin(), chunkRows.size());
|
|
}
|
|
return std::pair(out, chunks);
|
|
}
|
|
|
|
bool compareFDBAndBlob(RangeResult fdb,
|
|
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob,
|
|
KeyRange range,
|
|
Version v,
|
|
bool debug) {
|
|
bool correct = fdb == blob.first;
|
|
if (!correct) {
|
|
TraceEvent ev(SevError, "GranuleMismatch");
|
|
ev.detail("RangeStart", range.begin)
|
|
.detail("RangeEnd", range.end)
|
|
.detail("Version", v)
|
|
.detail("FDBSize", fdb.size())
|
|
.detail("BlobSize", blob.first.size());
|
|
|
|
if (debug) {
|
|
fmt::print("\nMismatch for [{0} - {1}) @ {2}. F({3}) B({4}):\n",
|
|
range.begin.printable(),
|
|
range.end.printable(),
|
|
v,
|
|
fdb.size(),
|
|
blob.first.size());
|
|
|
|
Optional<KeyValueRef> lastCorrect;
|
|
for (int i = 0; i < std::max(fdb.size(), blob.first.size()); i++) {
|
|
if (i >= fdb.size() || i >= blob.first.size() || fdb[i] != blob.first[i]) {
|
|
TraceEvent ev("GranuleMismatchInfo");
|
|
ev.detail("Idx", i);
|
|
printf(" Found mismatch at %d.\n", i);
|
|
if (lastCorrect.present()) {
|
|
printf(" last correct: %s=%s\n",
|
|
lastCorrect.get().key.printable().c_str(),
|
|
lastCorrect.get().value.printable().c_str());
|
|
ev.detail("LastCorrectKey", lastCorrect.get().key);
|
|
}
|
|
if (i < fdb.size()) {
|
|
printf(" FDB: %s=%s\n", fdb[i].key.printable().c_str(), fdb[i].value.printable().c_str());
|
|
ev.detail("FDBKey", fdb[i].key);
|
|
} else {
|
|
printf(" FDB: <missing>\n");
|
|
ev.detail("FDBKey", "Missing");
|
|
}
|
|
if (i < blob.first.size()) {
|
|
printf(" BLB: %s=%s\n",
|
|
blob.first[i].key.printable().c_str(),
|
|
blob.first[i].value.printable().c_str());
|
|
ev.detail("BlobKey", blob.first[i].key);
|
|
} else {
|
|
printf(" BLB: <missing>\n");
|
|
ev.detail("BlobKey", "Missing");
|
|
}
|
|
if (i < fdb.size() && i < blob.first.size() && fdb[i].key == blob.first[i].key) {
|
|
// value mismatch
|
|
ev.detail("FDBValue", fdb[i].value).detail("BlobValue", blob.first[i].value);
|
|
}
|
|
printf("\n");
|
|
break;
|
|
}
|
|
if (i < fdb.size()) {
|
|
lastCorrect = fdb[i];
|
|
} else {
|
|
lastCorrect = blob.first[i];
|
|
}
|
|
}
|
|
|
|
printGranuleChunks(blob.second);
|
|
}
|
|
}
|
|
return correct;
|
|
}
|
|
|
|
void printGranuleChunks(const Standalone<VectorRef<BlobGranuleChunkRef>>& chunks) {
|
|
printf("Chunks:\n");
|
|
for (auto& chunk : chunks) {
|
|
printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str());
|
|
|
|
printf(" SnapshotFile:\n %s\n",
|
|
chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
|
|
printf(" DeltaFiles:\n");
|
|
for (auto& df : chunk.deltaFiles) {
|
|
printf(" %s\n", df.toString().c_str());
|
|
}
|
|
printf(" Deltas: (%d)", chunk.newDeltas.size());
|
|
if (chunk.newDeltas.size() > 0) {
|
|
fmt::print(" with version [{0} - {1}]",
|
|
chunk.newDeltas[0].version,
|
|
chunk.newDeltas[chunk.newDeltas.size() - 1].version);
|
|
}
|
|
fmt::print(" IncludedVersion: {}\n", chunk.includedVersion);
|
|
}
|
|
printf("\n");
|
|
}
|
|
|
|
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
|
|
// clear key range and check whether it is merged or not, repeatedly
|
|
state Transaction tr(cx);
|
|
state int reClearCount = 1;
|
|
state int reClearInterval = 1; // do quadratic backoff on clear rate, b/c large keys can keep it not write-cold
|
|
loop {
|
|
try {
|
|
Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(range, 2));
|
|
if (ranges.size() == 1) {
|
|
return Void();
|
|
}
|
|
CODE_PROBE(true, "ClearAndAwaitMerge doing clear");
|
|
reClearCount--;
|
|
if (reClearCount <= 0) {
|
|
tr.clear(range);
|
|
wait(tr.commit());
|
|
fmt::print("ClearAndAwaitMerge cleared [{0} - {1}) @ {2}\n",
|
|
range.begin.printable(),
|
|
range.end.printable(),
|
|
tr.getCommittedVersion());
|
|
reClearCount = reClearInterval;
|
|
reClearInterval++;
|
|
}
|
|
wait(delay(30.0)); // sleep a bit before checking on merge again
|
|
tr.reset();
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Standalone<VectorRef<BlobGranuleSummaryRef>>> getSummaries(Database cx,
|
|
KeyRange range,
|
|
Version summaryVersion,
|
|
Optional<TenantName> tenantName) {
|
|
state Transaction tr(cx, tenantName);
|
|
loop {
|
|
try {
|
|
Standalone<VectorRef<BlobGranuleSummaryRef>> summaries =
|
|
wait(tr.summarizeBlobGranules(range, summaryVersion, 1000000));
|
|
|
|
// do some basic validation
|
|
ASSERT(!summaries.empty());
|
|
ASSERT(summaries.front().keyRange.begin == range.begin);
|
|
ASSERT(summaries.back().keyRange.end == range.end);
|
|
|
|
for (int i = 0; i < summaries.size() - 1; i++) {
|
|
ASSERT(summaries[i].keyRange.end == summaries[i + 1].keyRange.begin);
|
|
}
|
|
|
|
return summaries;
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
}
|
|
|
|
ACTOR Future<Void> validateGranuleSummaries(Database cx,
|
|
KeyRange range,
|
|
Optional<TenantName> tenantName,
|
|
Promise<Void> testComplete) {
|
|
state Arena lastSummaryArena;
|
|
state KeyRangeMap<Optional<BlobGranuleSummaryRef>> lastSummary;
|
|
state Version lastSummaryVersion = invalidVersion;
|
|
state Transaction tr(cx, tenantName);
|
|
state int successCount = 0;
|
|
try {
|
|
loop {
|
|
// get grv and get latest summaries
|
|
state Version nextSummaryVersion;
|
|
tr.reset();
|
|
loop {
|
|
try {
|
|
wait(store(nextSummaryVersion, tr.getReadVersion()));
|
|
ASSERT(nextSummaryVersion >= lastSummaryVersion);
|
|
break;
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
|
|
state Standalone<VectorRef<BlobGranuleSummaryRef>> nextSummary;
|
|
try {
|
|
wait(store(nextSummary, getSummaries(cx, range, nextSummaryVersion, tenantName)));
|
|
} catch (Error& e) {
|
|
if (e.code() == error_code_blob_granule_transaction_too_old) {
|
|
ASSERT(lastSummaryVersion == invalidVersion);
|
|
|
|
wait(delay(1.0));
|
|
continue;
|
|
} else {
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
if (lastSummaryVersion != invalidVersion) {
|
|
CODE_PROBE(true, "comparing multiple summaries");
|
|
// diff with last summary ranges to ensure versions never decreased for any range
|
|
for (auto& it : nextSummary) {
|
|
auto lastSummaries = lastSummary.intersectingRanges(it.keyRange);
|
|
for (auto& itLast : lastSummaries) {
|
|
|
|
if (!itLast.cvalue().present()) {
|
|
ASSERT(lastSummaryVersion == invalidVersion);
|
|
continue;
|
|
}
|
|
auto& last = itLast.cvalue().get();
|
|
|
|
ASSERT(it.snapshotVersion >= last.snapshotVersion);
|
|
// same invariant isn't always true for delta version because of force flushing around granule
|
|
// merges
|
|
if (it.keyRange == itLast.range()) {
|
|
ASSERT(it.deltaVersion >= last.deltaVersion);
|
|
if (it.snapshotVersion == last.snapshotVersion) {
|
|
ASSERT(it.snapshotSize == last.snapshotSize);
|
|
}
|
|
if (it.snapshotVersion == last.snapshotVersion && it.deltaVersion == last.deltaVersion) {
|
|
ASSERT(it.snapshotSize == last.snapshotSize);
|
|
ASSERT(it.deltaSize == last.deltaSize);
|
|
} else if (it.snapshotVersion == last.snapshotVersion) {
|
|
ASSERT(it.deltaSize > last.deltaSize);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!testComplete.canBeSet()) {
|
|
return Void();
|
|
}
|
|
}
|
|
|
|
successCount++;
|
|
|
|
lastSummaryArena = nextSummary.arena();
|
|
lastSummaryVersion = nextSummaryVersion;
|
|
lastSummary.insert(range, {});
|
|
for (auto& it : nextSummary) {
|
|
lastSummary.insert(it.keyRange, it);
|
|
}
|
|
|
|
wait(delayJittered(deterministicRandom()->randomInt(1, 10)));
|
|
}
|
|
} catch (Error& e) {
|
|
if (e.code() != error_code_operation_cancelled) {
|
|
TraceEvent(SevError, "UnexpectedErrorValidateGranuleSummaries").error(e);
|
|
}
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
struct feed_cmp_f {
|
|
bool operator()(const std::pair<Key, KeyRange>& lhs, const std::pair<Key, KeyRange>& rhs) const {
|
|
if (lhs.second.begin == rhs.second.begin) {
|
|
return lhs.second.end < rhs.second.end;
|
|
}
|
|
return lhs.second.begin < rhs.second.begin;
|
|
}
|
|
};
|
|
|
|
ACTOR Future<std::vector<std::pair<Key, KeyRange>>> getActiveFeeds(Transaction* tr) {
|
|
RangeResult feedResult = wait(tr->getRange(changeFeedKeys, 10000));
|
|
ASSERT(!feedResult.more);
|
|
std::vector<std::pair<Key, KeyRange>> results;
|
|
for (auto& it : feedResult) {
|
|
Key feedKey = it.key.removePrefix(changeFeedPrefix);
|
|
KeyRange feedRange;
|
|
Version version;
|
|
ChangeFeedStatus status;
|
|
|
|
std::tie(feedRange, version, status) = decodeChangeFeedValue(it.value);
|
|
results.push_back({ feedKey, feedRange });
|
|
}
|
|
|
|
std::sort(results.begin(), results.end(), feed_cmp_f());
|
|
|
|
return results;
|
|
}
|
|
|
|
// TODO: add debug parameter
|
|
// FIXME: this check currently assumes blob granules are the only users of change feeds, and will fail if that is not
|
|
// the case
|
|
ACTOR Future<Void> checkFeedCleanup(Database cx, bool debug) {
|
|
if (SERVER_KNOBS->BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY < 0) {
|
|
// no guarantee of feed cleanup, return
|
|
return Void();
|
|
}
|
|
// big extra timeout just because simulation can take a while to quiesce
|
|
state double checkTimeoutOnceStable = 300.0 + 2 * SERVER_KNOBS->BLOB_WORKER_FORCE_FLUSH_CLEANUP_DELAY;
|
|
state Optional<double> stableTimestamp;
|
|
state Standalone<VectorRef<KeyRangeRef>> lastGranules;
|
|
|
|
state Transaction tr(cx);
|
|
loop {
|
|
try {
|
|
// get set of current granules. if different than last set of granules
|
|
state Standalone<VectorRef<KeyRangeRef>> granules = wait(tr.getBlobGranuleRanges(normalKeys, 10000));
|
|
state std::vector<std::pair<Key, KeyRange>> activeFeeds = wait(getActiveFeeds(&tr));
|
|
|
|
// TODO REMOVE
|
|
if (debug) {
|
|
fmt::print("{0} granules and {1} active feeds found\n", granules.size(), activeFeeds.size());
|
|
}
|
|
/*fmt::print("Granules:\n");
|
|
for (auto& it : granules) {
|
|
fmt::print(" [{0} - {1})\n", it.begin.printable(), it.end.printable());
|
|
}*/
|
|
bool allPresent = granules.size() == activeFeeds.size();
|
|
for (int i = 0; allPresent && i < granules.size(); i++) {
|
|
if (granules[i] != activeFeeds[i].second) {
|
|
if (debug) {
|
|
fmt::print("Feed {0} for [{1} - {2}) still exists despite no granule!\n",
|
|
activeFeeds[i].first.printable(),
|
|
activeFeeds[i].second.begin.printable(),
|
|
activeFeeds[i].second.end.printable());
|
|
}
|
|
allPresent = false;
|
|
break;
|
|
}
|
|
}
|
|
if (allPresent) {
|
|
if (debug) {
|
|
fmt::print("Feed Cleanup Check Complete\n");
|
|
}
|
|
return Void();
|
|
}
|
|
if (granules != lastGranules) {
|
|
stableTimestamp.reset();
|
|
} else if (!stableTimestamp.present()) {
|
|
stableTimestamp = now();
|
|
}
|
|
lastGranules = granules;
|
|
|
|
// ensure this converges within a time window of granules becoming stable
|
|
if (stableTimestamp.present()) {
|
|
ASSERT(now() - stableTimestamp.get() <= checkTimeoutOnceStable);
|
|
}
|
|
|
|
wait(delay(2.0));
|
|
} catch (Error& e) {
|
|
wait(tr.onError(e));
|
|
}
|
|
}
|
|
}
|