/*
|
|
* BlobGranuleCorrectnessWorkload.actor.cpp
|
|
*
|
|
* This source file is part of the FoundationDB open source project
|
|
*
|
|
* Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <cmath>
|
|
#include <map>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "fmt/format.h"
|
|
#include "fdbclient/BlobGranuleReader.actor.h"
|
|
#include "fdbclient/ManagementAPI.actor.h"
|
|
#include "fdbclient/NativeAPI.actor.h"
|
|
#include "fdbclient/SystemData.h"
|
|
#include "fdbclient/TenantManagement.actor.h"
|
|
#include "fdbclient/Tuple.h"
|
|
#include "fdbserver/BlobGranuleServerCommon.actor.h"
|
|
#include "fdbserver/BlobGranuleValidation.actor.h"
|
|
#include "fdbserver/Knobs.h"
|
|
#include "fdbserver/TesterInterface.actor.h"
|
|
#include "fdbserver/workloads/workloads.actor.h"
|
|
#include "flow/Arena.h"
|
|
#include "flow/IRandom.h"
|
|
#include "flow/genericactors.actor.h"
|
|
|
|
#include "flow/actorcompiler.h" // This must be the last #include.
|
|
|
|
#define BGW_DEBUG true
|
|
#define BGW_TUPLE_KEY_SIZE 2
|
|
|
|
// One value written to a single (key, id) pair, plus the versions at which it was
// committed and (optionally) cleared, so readers can compute what should be visible
// at any read version.
struct WriteData {
    Version writeVersion; // version the write committed at
    Version clearVersion; // version the clear of this id committed at
    int32_t val; // value payload, hex-encoded into the row by genVal()
    int16_t valLength; // total encoded value length in bytes

    // start as MAX_VERSION while uncommitted/uncleared so that they're ignored by concurrent readers
    explicit WriteData(int32_t val, int16_t valLength)
      : writeVersion(MAX_VERSION), clearVersion(MAX_VERSION), val(val), valLength(valLength) {}
};
|
|
|
|
// Per-key history: the ordered list of writes issued to this key, and the index of
// the next id to clear (ids below nextClearIdx have been cleared, in order).
struct KeyData {
    int nextClearIdx;
    std::vector<WriteData> writes;
};
|
|
|
|
// Candidate target value sizes (bytes); each directory randomly picks one in its ThreadData ctor.
static std::vector<int> targetValSizes = { 40, 100, 500 };
|
|
|
|
// Per-directory (per-tenant) state shared between this directory's writer and reader
// actors: key/value generation model, randomized workload parameters, and stats.
struct ThreadData : ReferenceCounted<ThreadData>, NonCopyable {
    // directory info
    int32_t directoryID;
    KeyRange directoryRange;
    TenantName tenantName;
    TenantMapEntry tenant;
    Reference<BlobConnectionProvider> bstore;

    // key + value gen data
    // in vector for efficient random selection
    std::vector<uint32_t> usedKeys;
    // by key for tracking data
    std::map<uint32_t, KeyData> keyData;

    // commit versions of every successful write txn, in order; readers pick from these
    std::deque<Version> writeVersions;

    // randomized parameters that can be different per directory
    int targetByteRate;
    bool nextKeySequential;
    int16_t targetValLength;
    double reuseKeyProb;
    int targetIDsPerKey;

    // communication between workers
    Promise<Void> firstWriteSuccessful;
    Version minSuccessfulReadVersion = MAX_VERSION;

    Future<Void> summaryClient;
    Promise<Void> triggerSummaryComplete;

    // stats
    int64_t errors = 0;
    int64_t mismatches = 0;
    int64_t reads = 0;
    int64_t timeTravelReads = 0;
    int64_t timeTravelTooOld = 0;
    int64_t rowsRead = 0;
    int64_t bytesRead = 0;
    int64_t rowsWritten = 0;
    int64_t bytesWritten = 0;

    ThreadData(uint32_t directoryID, int64_t targetByteRate)
      : directoryID(directoryID), targetByteRate(targetByteRate) {
        tenantName = StringRef(std::to_string(directoryID));

        // BUG FIX: inside the constructor body the parameter `targetByteRate` shadows the
        // member, so the original `targetByteRate *= ...` perturbed the (discarded)
        // parameter and the member silently kept the un-randomized value. Apply the
        // randomization to the member explicitly, and print the member below.
        this->targetByteRate *= (0.5 + deterministicRandom()->random01());

        targetValLength = deterministicRandom()->randomChoice(targetValSizes);
        targetValLength *= (0.5 + deterministicRandom()->random01());

        nextKeySequential = deterministicRandom()->random01() < 0.5;
        reuseKeyProb = 0.1 + (deterministicRandom()->random01() * 0.8);
        targetIDsPerKey = 1 + deterministicRandom()->randomInt(10, 100);

        if (BGW_DEBUG) {
            fmt::print("Directory {0} initialized with the following parameters:\n", directoryID);
            fmt::print("  targetByteRate={0}\n", this->targetByteRate);
            fmt::print("  targetValLength={0}\n", targetValLength);
            fmt::print("  nextKeySequential={0}\n", nextKeySequential);
            fmt::print("  reuseKeyProb={0}\n", reuseKeyProb);
            fmt::print("  targetIDsPerKey={0}\n", targetIDsPerKey);
        }
    }

    // Encode (key, id) as a packed 2-element tuple: int64 key + a user-typed (0x41)
    // string of the id zero-padded to 32 chars, so ids sort lexicographically.
    // TODO could make keys variable length?
    Key getKey(uint32_t key, uint32_t id) {
        std::stringstream ss;
        ss << std::setw(32) << std::setfill('0') << id;
        Standalone<StringRef> str(ss.str());
        Tuple::UserTypeStr udt(0x41, str);
        return Tuple::makeTuple((int64_t)key, udt).pack();
    }

    // Verify a granule boundary key `k` (with granule end `e`, previous boundary
    // `lastKey`) respects the configured tuple-truncation of split points.
    void validateGranuleBoundary(Key k, Key e, Key lastKey) {
        if (k == allKeys.begin || k == allKeys.end) {
            return;
        }

        // Fully formed tuples are inserted. The expectation is boundaries should be a
        // sub-tuple of the inserted key.
        Tuple t = Tuple::unpackUserType(k, true);
        if (SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET) {
            Tuple t2;
            try {
                t2 = Tuple::unpackUserType(lastKey);
            } catch (Error& e) {
                // Ignore being unable to parse lastKey as it may be a dummy key.
            }

            if (t2.size() > 0 && t.getInt(0) != t2.getInt(0)) {
                if (t.size() > BGW_TUPLE_KEY_SIZE - SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET) {
                    fmt::print("Tenant: {0}, K={1}, E={2}, LK={3}. {4} != {5}\n",
                               tenant.prefix.printable(),
                               k.printable(),
                               e.printable(),
                               lastKey.printable(),
                               t.getInt(0),
                               t2.getInt(0));
                }
                ASSERT(t.size() <= BGW_TUPLE_KEY_SIZE - SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET);
            }
        }
    }
};
|
|
|
|
// For debugging mismatches on what data should be and why
|
|
// set mismatch to true, dir id and key id to the directory and key id that are wrong, and rv to read version that read
|
|
// the wrong value
|
|
#define DEBUG_MISMATCH false
|
|
#define DEBUG_DIR_ID 0
|
|
#define DEBUG_KEY_ID 0
|
|
#define DEBUG_RV invalidVersion
|
|
|
|
#define DEBUG_KEY_OP(dirId, keyId) BGW_DEBUG&& DEBUG_MISMATCH&& dirId == DEBUG_DIR_ID&& DEBUG_KEY_ID == keyId
|
|
#define DEBUG_READ_OP(dirId, rv) BGW_DEBUG&& DEBUG_MISMATCH&& dirId == DEBUG_DIR_ID&& rv == DEBUG_RV
|
|
|
|
/*
|
|
* This is a stand-alone workload designed to validate blob granule correctness.
|
|
* By enabling distinct ranges and writing to those parts of the key space, we can control what parts of the key space
|
|
* are written to blob, and can validate that the granule data is correct at any desired version.
|
|
*/
|
|
// Workload coordinator: owns this client's per-directory ThreadData instances and
// drives setup, the reader/writer actors, and final verification per tenant.
struct BlobGranuleCorrectnessWorkload : TestWorkload {
    static constexpr auto NAME = "BlobGranuleCorrectnessWorkload";
    bool doSetup; // true only on client 0, which performs one-time database configuration
    double testDuration; // seconds to run the workers ("testDuration" option, default 120)

    // parameters global across all clients
    int64_t targetByteRate; // per-directory write budget, derived from sharedRandomNumber
    bool doMergeCheckAtEnd; // derived from sharedRandomNumber so all clients agree

    std::vector<Reference<ThreadData>> directories; // directories owned by this client
    std::vector<Future<Void>> clients; // writer/reader actors launched in start()
    DatabaseConfiguration config;
|
|
|
|
// Derives per-client workload shape from sharedRandomNumber (so every client agrees
// on the global layout) and instantiates this client's share of the directories,
// each with its own byte-rate budget.
BlobGranuleCorrectnessWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
    doSetup = !clientId; // only do this on the "first" client
    testDuration = getOption(options, "testDuration"_sr, 120.0);

    // randomize global test settings based on shared parameter to get similar workload across tests, but then vary
    // different parameters within those constraints
    int64_t randomness = sharedRandomNumber;

    doMergeCheckAtEnd = randomness % 10 == 0;
    randomness /= 10;

    // randomize between low and high directory count
    int64_t targetDirectories = 1 + (randomness % 8);
    randomness /= 8;

    // this client's share: floor(total/clients), plus one of the remainder if clientId is low enough
    int64_t targetMyDirectories =
        (targetDirectories / clientCount) + ((targetDirectories % clientCount > clientId) ? 1 : 0);

    if (targetMyDirectories > 0) {
        int myDirectories = 1;
        if (targetMyDirectories > 1) {
            // vary the actual count around the target (1 .. 2*target)
            myDirectories = deterministicRandom()->randomInt(1, 2 * targetMyDirectories + 1);
        }

        // anywhere from 2 delta files per second to 1 delta file every 2 seconds, spread across all directories
        int denom = std::min(clientCount, (int)targetDirectories);
        targetByteRate = 2 * SERVER_KNOBS->BG_DELTA_FILE_TARGET_BYTES / (1 + (randomness % 4)) / denom;
        randomness /= 4;

        // either do equal across all of my directories, or skewed
        bool skewed = myDirectories > 1 && deterministicRandom()->random01() < 0.4;
        int skewMultiplier;
        if (skewed) {
            // first directory has 1/2, second has 1/4, third has 1/8, etc...
            skewMultiplier = 2;
            targetByteRate /= 2;
        } else {
            skewMultiplier = 1;
            targetByteRate /= myDirectories;
        }
        for (int i = 0; i < myDirectories; i++) {
            // set up directory with its own randomness
            // directory ids are striped across clients so they never collide
            uint32_t dirId = i * clientCount + clientId;
            if (BGW_DEBUG) {
                fmt::print("Client {0}/{1} creating directory {2}\n", clientId, clientCount, dirId);
            }
            directories.push_back(makeReference<ThreadData>(dirId, targetByteRate));
            targetByteRate /= skewMultiplier;
        }
    }
}
|
|
|
|
// Creates the tenant for one directory and returns its map entry (which carries the
// tenant's key prefix). Asserts creation succeeded.
ACTOR Future<TenantMapEntry> setUpTenant(Database cx, TenantName name) {
    if (BGW_DEBUG) {
        fmt::print("Setting up blob granule range for tenant {0}\n", name.printable());
    }

    Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(cx.getReference(), name));
    ASSERT(entry.present());

    if (BGW_DEBUG) {
        fmt::print(
            "Set up blob granule range for tenant {0}: {1}\n", name.printable(), entry.get().prefix.printable());
    }

    return entry.get();
}
|
|
|
|
// TestWorkload hook: delegates to the _setup actor (config change, tenant creation, blobbification).
Future<Void> setup(Database const& cx) override { return _setup(cx, this); }
|
|
|
|
// Setup actor: enables blob granules (client 0 only), then for each of this client's
// directories creates its tenant, blobbifies its range, and waits for the tenant's
// blob connection (bstore) to load.
ACTOR Future<Void> _setup(Database cx, BlobGranuleCorrectnessWorkload* self) {
    if (self->doSetup) {
        // FIXME: run the actual FDBCLI command instead of copy/pasting its implementation
        wait(success(ManagementAPI::changeConfig(cx.getReference(), "blob_granules_enabled=1", true)));
    }

    if (self->directories.empty()) {
        return Void();
    }

    state int directoryIdx = 0;
    state std::vector<std::pair<TenantName, TenantMapEntry>> tenants;
    state BGTenantMap tenantData(self->dbInfo);
    state Reference<GranuleTenantData> data;
    for (; directoryIdx < self->directories.size(); directoryIdx++) {
        // Set up the blob range first
        TenantMapEntry tenantEntry = wait(self->setUpTenant(cx, self->directories[directoryIdx]->tenantName));

        self->directories[directoryIdx]->tenant = tenantEntry;
        // the directory's raw-keyspace range is the tenant prefix + all normal keys
        self->directories[directoryIdx]->directoryRange =
            KeyRangeRef(tenantEntry.prefix, tenantEntry.prefix.withSuffix(normalKeys.end));
        tenants.push_back({ self->directories[directoryIdx]->tenantName, tenantEntry });
        bool _success = wait(cx->blobbifyRange(self->directories[directoryIdx]->directoryRange));
        ASSERT(_success);
    }
    tenantData.addTenants(tenants);

    // wait for tenant data to be loaded

    for (directoryIdx = 0; directoryIdx < self->directories.size(); directoryIdx++) {
        wait(store(data, tenantData.getDataForGranule(self->directories[directoryIdx]->directoryRange)));
        wait(data->bstoreLoaded.getFuture());
        wait(delay(0));
        self->directories[directoryIdx]->bstore = data->bstore;
    }

    return Void();
}
|
|
|
|
// Fetch a read version, retrying through the transaction's error/backoff machinery.
// handle retries + errors
// It's ok to reset the transaction here because its read version is only used for reading the granule mapping from
// the system keyspace
ACTOR Future<Version> doGrv(Transaction* tr) {
    loop {
        try {
            Version readVersion = wait(tr->getReadVersion());
            return readVersion;
        } catch (Error& e) {
            // let the transaction decide whether the error is retryable
            wait(tr->onError(e));
        }
    }
}
|
|
|
|
// Repeatedly attempts a full-keyspace blob read for one directory until granules for
// the whole range are readable; records the version of the first success as the
// directory's minSuccessfulReadVersion. Used both before the test starts (doSetup)
// and as a final availability check.
ACTOR Future<Void> waitFirstSnapshot(BlobGranuleCorrectnessWorkload* self,
                                     Database cx,
                                     Reference<ThreadData> threadData,
                                     bool doSetup) {
    // read entire keyspace at the start until granules for the entire thing are available
    loop {
        state Transaction tr(cx, threadData->tenantName);
        try {
            Version rv = wait(self->doGrv(&tr));
            state Version readVersion = rv;
            std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob =
                wait(readFromBlob(cx,
                                  threadData->bstore,
                                  normalKeys /* tenant handles range */,
                                  0,
                                  readVersion,
                                  threadData->tenantName));
            fmt::print("Directory {0} got {1} RV {2}\n",
                       threadData->directoryID,
                       doSetup ? "initial" : "final",
                       readVersion);
            threadData->minSuccessfulReadVersion = readVersion;

            return Void();
        } catch (Error& e) {
            if (e.code() == error_code_operation_cancelled) {
                throw e;
            }
            if (e.code() != error_code_blob_granule_transaction_too_old) {
                wait(tr.onError(e));
            } else {
                // granules not yet available at this version; back off and retry
                wait(delay(1.0));
            }
        }
    }
}
|
|
|
|
// Record a mismatch between expected data and what a blob-granule read returned, and
// (when BGW_DEBUG) dump full diagnostics including the chunk/file layout.
// expectedKey/blobKey describe a key mismatch; expectedValue/blobValue a value mismatch.
void logMismatch(Reference<ThreadData> threadData,
                 const Optional<Key>& lastMatching,
                 const Optional<Key>& expectedKey,
                 const Optional<Key>& blobKey,
                 const Optional<Value>& expectedValue,
                 const Optional<Value>& blobValue,
                 uint32_t startKey,
                 uint32_t endKey,
                 Version beginVersion,
                 Version readVersion,
                 const std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>>& blob) {
    threadData->mismatches++;
    if (!BGW_DEBUG) {
        return;
    }

    TraceEvent ev(SevError, "BGMismatch");
    ev.detail("DirectoryID", format("%08x", threadData->directoryID))
        .detail("RangeStart", format("%08x", startKey))
        .detail("RangeEnd", format("%08x", endKey))
        .detail("BeginVersion", beginVersion)
        .detail("Version", readVersion);
    fmt::print("Found mismatch! Request for dir {0} [{1} - {2}) @ {3} - {4}\n",
               format("%08x", threadData->directoryID),
               format("%08x", startKey),
               format("%08x", endKey),
               beginVersion,
               readVersion);
    if (lastMatching.present()) {
        fmt::print("  last correct: {}\n", lastMatching.get().printable());
    }
    if (expectedValue.present() || blobValue.present()) {
        // value mismatch
        ASSERT(blobKey.present());
        ASSERT(blobKey == expectedKey);
        // FIX: this branch is entered when EITHER optional is present, so guard each
        // .get() — unconditionally dereferencing the absent side would crash the
        // diagnostic path itself.
        fmt::print("  Value mismatch for {0}.\n  Expected={1}\n  Actual={2}\n",
                   blobKey.get().printable(),
                   expectedValue.present() ? expectedValue.get().printable() : "<missing>",
                   blobValue.present() ? blobValue.get().printable() : "<missing>");
    } else {
        // key mismatch
        fmt::print("  Expected Key: {0}\n", expectedKey.present() ? expectedKey.get().printable() : "<missing>");
        fmt::print("  Actual Key: {0}\n", blobKey.present() ? blobKey.get().printable() : "<missing>");
    }

    fmt::print("Chunks: {0}\n", blob.second.size());
    for (auto& chunk : blob.second) {
        fmt::print("[{0} - {1})\n", chunk.keyRange.begin.printable(), chunk.keyRange.end.printable());

        fmt::print("  SnapshotFile:\n  {}\n",
                   chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
        fmt::print("  DeltaFiles:\n");
        for (auto& df : chunk.deltaFiles) {
            fmt::print("  {}\n", df.toString());
        }
        fmt::print("  Deltas: ({})", chunk.newDeltas.size());
        if (chunk.newDeltas.size() > 0) {
            fmt::print(" with version [{0} - {1}]",
                       chunk.newDeltas[0].version,
                       chunk.newDeltas[chunk.newDeltas.size() - 1].version);
        }
        fmt::print("  IncludedVersion: {}\n", chunk.includedVersion);
    }
    printf("\n");
}
|
|
|
|
// Generate the value payload for a write: an 8-hex-digit encoding of `val`
// followed by 'x' padding out to exactly `valLen` bytes.
Value genVal(uint32_t val, uint16_t valLen) {
    auto encoded = format("%08x", val);
    std::string payload(valLen, 'x');
    ASSERT(encoded.size() <= payload.size());

    // overwrite the leading padding bytes with the hex-encoded value
    payload.replace(0, encoded.size(), encoded);

    // copy into an arena
    // TODO do this in original arena? a bit more efficient that way
    Arena arena;
    return Standalone<StringRef>(StringRef(arena, payload), arena);
}
|
|
|
|
bool validateValue(const Value& v, uint32_t val, uint16_t valLen) {
|
|
if (v.size() != valLen) {
|
|
return false;
|
|
}
|
|
// check for correct value portion
|
|
auto valFormatted = format("%08x", val);
|
|
ASSERT(valFormatted.size() <= v.size());
|
|
if (v.substr(0, valFormatted.size()) != valFormatted) {
|
|
return false;
|
|
}
|
|
// check for corruption
|
|
for (int i = valFormatted.size(); i < v.size(); i++) {
|
|
if (v[i] != 'x') {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Compare a blob read result against the in-memory write model for key range
// [startKeyInclusive, endKeyExclusive) at (beginVersion, readVersion]. Walks the
// model and the returned rows in lockstep; on any divergence logs the mismatch and
// returns false. Also validates per-chunk collapse behavior and granule boundaries.
bool validateResult(Reference<ThreadData> threadData,
                    std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob,
                    int startKeyInclusive,
                    int endKeyExclusive,
                    Version beginVersion,
                    Version readVersion) {
    auto checkIt = threadData->keyData.lower_bound(startKeyInclusive);
    if (checkIt != threadData->keyData.end() && checkIt->first < startKeyInclusive) {
        checkIt++;
    }
    int resultIdx = 0; // cursor into the rows the blob read returned
    Optional<Key> lastMatching;
    if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
        fmt::print("DBG READ: [{0} - {1}) @ {2} ({3} rows)\n",
                   format("%08x", startKeyInclusive),
                   format("%08x", endKeyExclusive),
                   readVersion,
                   blob.first.size());
    }

    // because each chunk could be separately collapsed or not if we set beginVersion, we have to track it by chunk
    KeyRangeMap<Version> beginVersionByChunk;
    beginVersionByChunk.insert(normalKeys, 0);
    int beginCollapsed = 0;
    int beginNotCollapsed = 0;
    Key lastBeginKey = ""_sr;
    for (auto& chunk : blob.second) {
        // strip the tenant prefix so chunk ranges are comparable to model keys
        KeyRange beginVersionRange;
        if (chunk.tenantPrefix.present()) {
            beginVersionRange = KeyRangeRef(chunk.keyRange.begin.removePrefix(chunk.tenantPrefix.get()),
                                            chunk.keyRange.end.removePrefix(chunk.tenantPrefix.get()));
        } else {
            beginVersionRange = chunk.keyRange;
        }

        if (!chunk.snapshotFile.present()) {
            // collapsed chunk: only valid when the request had a beginVersion
            ASSERT(beginVersion > 0);
            ASSERT(chunk.snapshotVersion == invalidVersion);
            beginCollapsed++;

            beginVersionByChunk.insert(beginVersionRange, beginVersion);
        } else {
            ASSERT(chunk.snapshotVersion != invalidVersion);
            if (beginVersion > 0) {
                beginNotCollapsed++;
            }
        }

        // Validate boundary alignment.
        threadData->validateGranuleBoundary(beginVersionRange.begin, beginVersionRange.end, lastBeginKey);
        lastBeginKey = beginVersionRange.begin;
    }
    CODE_PROBE(beginCollapsed > 0, "BGCorrectness got collapsed request with beginVersion > 0");
    CODE_PROBE(beginNotCollapsed > 0, "BGCorrectness got un-collapsed request with beginVersion > 0");
    CODE_PROBE(beginCollapsed > 0 && beginNotCollapsed > 0,
               "BGCorrectness got both collapsed and uncollapsed in the same request!");

    // walk every model key in range, computing the set of (key, id) rows that should
    // be visible at readVersion and comparing them to the blob rows in order
    while (checkIt != threadData->keyData.end() && checkIt->first < endKeyExclusive) {
        uint32_t key = checkIt->first;
        if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
            fmt::print("DBG READ: Key {0}\n", format("%08x", key));
        }

        // TODO could binary search this to find clearVersion if it gets long
        int idIdx = 0;
        for (; idIdx < checkIt->second.writes.size() && checkIt->second.writes[idIdx].clearVersion <= readVersion;
             idIdx++) {
            // iterate until we find the oldest tag that should have not been cleared
            /*if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
                fmt::print(
                    "DBG READ: Skip ID {0} cleared @ {1}\n", idIdx, checkIt->second.writes[idIdx].clearVersion);
            }*/
        }
        for (; idIdx < checkIt->second.writes.size() && checkIt->second.writes[idIdx].writeVersion <= readVersion;
             idIdx++) {
            Key nextKeyShouldBe = threadData->getKey(key, idIdx);
            // writes older than the chunk's effective begin version are not expected in the result
            Version keyBeginVersion = beginVersionByChunk.rangeContaining(nextKeyShouldBe).cvalue();
            if (keyBeginVersion > checkIt->second.writes[idIdx].writeVersion) {
                if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
                    fmt::print("DBG READ: Skip ID {0} written @ {1} < beginVersion {2}\n",
                               idIdx,
                               checkIt->second.writes[idIdx].clearVersion,
                               keyBeginVersion);
                }
                continue;
            }
            if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
                fmt::print("DBG READ: Checking ID {0} ({1}) written @ {2}\n",
                           format("%08x", idIdx),
                           idIdx,
                           checkIt->second.writes[idIdx].writeVersion);
            }
            if (resultIdx >= blob.first.size()) {
                // missing at end!!
                logMismatch(threadData,
                            lastMatching,
                            nextKeyShouldBe,
                            Optional<Key>(),
                            Optional<Value>(),
                            Optional<Value>(),
                            startKeyInclusive,
                            endKeyExclusive,
                            beginVersion,
                            readVersion,
                            blob);
                return false;
            }

            if (nextKeyShouldBe != blob.first[resultIdx].key) {
                // key mismatch!
                if (DEBUG_READ_OP(threadData->directoryID, readVersion)) {
                    printf("key mismatch!\n");
                }
                logMismatch(threadData,
                            lastMatching,
                            nextKeyShouldBe,
                            blob.first[resultIdx].key,
                            Optional<Value>(),
                            Optional<Value>(),
                            startKeyInclusive,
                            endKeyExclusive,
                            beginVersion,
                            readVersion,
                            blob);
                return false;
            } else if (!validateValue(blob.first[resultIdx].value,
                                      checkIt->second.writes[idIdx].val,
                                      checkIt->second.writes[idIdx].valLength)) {
                logMismatch(threadData,
                            lastMatching,
                            nextKeyShouldBe,
                            blob.first[resultIdx].key,
                            genVal(checkIt->second.writes[idIdx].val, checkIt->second.writes[idIdx].valLength),
                            blob.first[resultIdx].value,
                            startKeyInclusive,
                            endKeyExclusive,
                            beginVersion,
                            readVersion,
                            blob);
                return false;
                // value mismatch for same key
            } else {
                lastMatching = nextKeyShouldBe;
            }
            resultIdx++;
        }
        checkIt++;
    }

    if (resultIdx < blob.first.size()) {
        // blob has extra stuff!!
        logMismatch(threadData,
                    lastMatching,
                    Optional<Key>(),
                    blob.first[resultIdx].key,
                    Optional<Value>(),
                    Optional<Value>(),
                    startKeyInclusive,
                    endKeyExclusive,
                    beginVersion,
                    readVersion,
                    blob);
        return false;
    }

    return true;
}
|
|
|
|
// Reader actor for one directory: once the first snapshot and first write exist, it
// continuously issues blob reads over random key ranges at random (sometimes
// time-travel) versions, validates results against the model, and rate-limits itself
// to ~4x the directory's write byte rate.
ACTOR Future<Void> readWorker(BlobGranuleCorrectnessWorkload* self,
                              Future<Void> firstSnapshot,
                              Database cx,
                              Reference<ThreadData> threadData) {
    state double last = now();
    state double targetBytesReadPerQuery =
        SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES * 2.0 / deterministicRandom()->randomInt(1, 11);

    // read at higher read rate than write rate to validate data
    state double targetReadBytesPerSec = threadData->targetByteRate * 4;
    ASSERT(targetReadBytesPerSec > 0);

    state Version beginVersion;
    state Version readVersion;
    state KeyRange range;

    TraceEvent("BlobGranuleCorrectnessReaderStart").log();
    if (BGW_DEBUG) {
        printf("BGW read thread starting\n");
    }

    // wait for data to read
    wait(firstSnapshot);
    wait(threadData->firstWriteSuccessful.getFuture());

    TraceEvent("BlobGranuleCorrectnessReaderReady").log();
    if (BGW_DEBUG) {
        printf("BGW read thread ready\n");
    }

    loop {
        try {
            // Do 1 read

            // pick key range by doing random start key, and then picking the end key based on that
            int startKeyIdx = deterministicRandom()->randomInt(0, threadData->usedKeys.size());
            state uint32_t startKey = threadData->usedKeys[startKeyIdx];
            auto endKeyIt = threadData->keyData.find(startKey);
            ASSERT(endKeyIt != threadData->keyData.end());

            // sometimes force single key read, for edge case
            state uint32_t endKey;
            if (deterministicRandom()->random01() < 0.01) {
                endKey = startKey + 1;
            } else {
                // walk forward through the model, estimating result size, until roughly
                // targetQueryBytes worth of visible rows are covered
                int targetQueryBytes = (deterministicRandom()->randomInt(1, 20) * targetBytesReadPerQuery) / 10;
                int estimatedQueryBytes = 0;
                for (int i = 0; estimatedQueryBytes < targetQueryBytes && endKeyIt != threadData->keyData.end();
                     i++, endKeyIt++) {
                    // iterate forward until end or target keys have passed
                    estimatedQueryBytes += (1 + endKeyIt->second.writes.size() - endKeyIt->second.nextClearIdx) *
                                           threadData->targetValLength;
                }

                if (endKeyIt == threadData->keyData.end()) {
                    endKey = std::numeric_limits<uint32_t>::max();
                } else {
                    endKey = endKeyIt->first;
                }
            }

            range = KeyRangeRef(threadData->getKey(startKey, 0), threadData->getKey(endKey, 0));

            // pick read version
            ASSERT(threadData->writeVersions.back() >= threadData->minSuccessfulReadVersion);
            size_t readVersionIdx;
            // randomly choose up to date vs time travel read
            if (deterministicRandom()->random01() < 0.5) {
                threadData->reads++;
                readVersionIdx = threadData->writeVersions.size() - 1;
                readVersion = threadData->writeVersions.back();
            } else {
                threadData->timeTravelReads++;
                size_t startIdx = 0;
                loop {
                    // re-draw until we land at or above the oldest known-readable version
                    readVersionIdx = deterministicRandom()->randomInt(startIdx, threadData->writeVersions.size());
                    readVersion = threadData->writeVersions[readVersionIdx];
                    if (readVersion >= threadData->minSuccessfulReadVersion) {
                        break;
                    } else {
                        startIdx = readVersionIdx + 1;
                    }
                }
            }

            // randomly choose begin version or not
            beginVersion = 0;
            if (deterministicRandom()->random01() < 0.5) {
                int startIdx = 0;
                int endIdxExclusive = readVersionIdx + 1;
                // Choose skewed towards later versions. It's ok if beginVersion isn't readable though because it
                // will collapse
                size_t beginVersionIdx = (size_t)std::sqrt(
                    deterministicRandom()->randomInt(startIdx * startIdx, endIdxExclusive * endIdxExclusive));
                beginVersion = threadData->writeVersions[beginVersionIdx];
            }

            std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(
                readFromBlob(cx, threadData->bstore, range, beginVersion, readVersion, threadData->tenantName));
            self->validateResult(threadData, blob, startKey, endKey, beginVersion, readVersion);

            int resultBytes = blob.first.expectedSize();
            threadData->rowsRead += blob.first.size();
            threadData->bytesRead += resultBytes;

            // rate-limit based on how many bytes the read actually returned
            wait(poisson(&last, (resultBytes + 1) / targetReadBytesPerSec));
        } catch (Error& e) {
            if (e.code() == error_code_operation_cancelled) {
                throw;
            }
            if (e.code() == error_code_blob_granule_transaction_too_old) {
                // expected for time-travel reads past the retention window; counted, not fatal
                if (BGW_DEBUG) {
                    fmt::print("ERROR: TTO for [{0} - {1}) @ {2} for tenant {3}\n",
                               range.begin.printable(),
                               range.end.printable(),
                               readVersion,
                               threadData->tenantName.printable());
                }
                threadData->timeTravelTooOld++;
            } else {
                threadData->errors++;
                if (BGW_DEBUG) {
                    printf("BGWorkload got unexpected error %s\n", e.name());
                }
            }
        }
    }
}
|
|
|
|
// Writer actor for one directory: each iteration picks a batch of keys (mixing new
// keys and reuse of existing ones), decides per key whether to write a new id or
// clear an old one, commits the batch in one transaction, and only then publishes
// the commit version into the model so concurrent readers never see uncommitted data.
ACTOR Future<Void> writeWorker(BlobGranuleCorrectnessWorkload* self,
                               Future<Void> firstSnapshot,
                               Database cx,
                               Reference<ThreadData> threadData) {

    state double last = now();
    state int keysPerQuery = 100;
    // state int targetBytesPerQuery = threadData->targetValLength * keysPerQuery;
    // state double targetTps = (1.0 * threadData->targetByteRate) / targetBytesPerQuery;
    state uint32_t nextVal = 0;

    TraceEvent("BlobGranuleCorrectnessWriterStart").log();

    wait(firstSnapshot);

    TraceEvent("BlobGranuleCorrectnessWriterReady").log();

    loop {
        state Transaction tr(cx, threadData->tenantName);

        // pick rows to write and clear, generate values for writes
        state std::vector<std::tuple<uint32_t, uint32_t, uint32_t, uint16_t>> keyAndIdToWrite;
        state std::vector<std::pair<uint32_t, uint32_t>> keyAndIdToClear;

        state int queryKeys =
            keysPerQuery * (0.1 + deterministicRandom()->random01() * 1.8); // 10% to 190% of target keys per query
        for (int i = 0; i < queryKeys; i++) {
            uint32_t key;
            if (threadData->keyData.empty() || deterministicRandom()->random01() > threadData->reuseKeyProb) {
                // new key
                if (threadData->nextKeySequential) {
                    key = threadData->usedKeys.size();
                } else {
                    // draw random keys until an unused one is found (max is reserved as sentinel)
                    key = std::numeric_limits<uint32_t>::max();
                    while (key == std::numeric_limits<uint32_t>::max() ||
                           threadData->keyData.find(key) != threadData->keyData.end()) {
                        key = deterministicRandom()->randomUInt32();
                    }
                }

                // add new key to data structures
                threadData->usedKeys.push_back(key);
                threadData->keyData.insert({ key, KeyData() });
            } else {
                int keyIdx = deterministicRandom()->randomInt(0, threadData->usedKeys.size());
                key = threadData->usedKeys[keyIdx];
            }

            auto keyIt = threadData->keyData.find(key);
            ASSERT(keyIt != threadData->keyData.end());

            int unclearedIds = keyIt->second.writes.size() - keyIt->second.nextClearIdx;
            // if we are at targetIDs, 50% chance of adding one or clearing. If we are closer to 0, higher chance of
            // adding one, if we are closer to 2x target IDs, higher chance of clearing one
            double probAddId = (threadData->targetIDsPerKey * 2.0 - unclearedIds) / threadData->targetIDsPerKey;
            if (deterministicRandom()->random01() < probAddId ||
                keyIt->second.nextClearIdx == keyIt->second.writes.size()) {
                int32_t val = nextVal++;
                int16_t valLen = (0.5 + deterministicRandom()->random01()) * threadData->targetValLength;
                if (valLen < 10) {
                    valLen = 10;
                }

                uint32_t nextId = keyIt->second.writes.size();
                keyIt->second.writes.push_back(WriteData(val, valLen));

                keyAndIdToWrite.push_back(std::tuple(key, nextId, val, valLen));
            } else {
                uint32_t idToClear = keyIt->second.nextClearIdx++;
                keyAndIdToClear.push_back(std::pair(key, idToClear));
            }
        }

        state int64_t txnBytes;
        loop {
            try {
                // write rows in txn
                for (auto& it : keyAndIdToWrite) {
                    Value v = self->genVal(std::get<2>(it), std::get<3>(it));
                    tr.set(threadData->getKey(std::get<0>(it), std::get<1>(it)), v);
                }
                for (auto& it : keyAndIdToClear) {
                    tr.clear(singleKeyRange(threadData->getKey(it.first, it.second)));
                }
                txnBytes = tr.getSize();
                wait(tr.commit());
                break;
            } catch (Error& e) {
                wait(tr.onError(e));
            }
        }

        Version commitVersion = tr.getCommittedVersion();

        // once txn is committed, update write map

        for (auto& it : keyAndIdToWrite) {
            uint32_t key = std::get<0>(it);
            uint32_t id = std::get<1>(it);
            auto keyIt = threadData->keyData.find(key);
            ASSERT(keyIt != threadData->keyData.end());

            keyIt->second.writes[id].writeVersion = commitVersion;
            if (DEBUG_KEY_OP(threadData->directoryID, key)) {
                fmt::print("DBG: {0} WRITE {1} = {2}:{3}\n",
                           commitVersion,
                           format("%08x/%08x/%08x", threadData->directoryID, key, id),
                           std::get<2>(it),
                           std::get<3>(it));
            }
        }

        for (auto& it : keyAndIdToClear) {
            auto keyIt = threadData->keyData.find(it.first);
            ASSERT(keyIt != threadData->keyData.end());
            keyIt->second.writes[it.second].clearVersion = commitVersion;
            if (DEBUG_KEY_OP(threadData->directoryID, it.first)) {
                fmt::print("DBG: {0} CLEAR {1}\n",
                           commitVersion,
                           format("%08x/%08x/%08x", threadData->directoryID, it.first, it.second));
            }
        }

        threadData->writeVersions.push_back(commitVersion);

        if (threadData->firstWriteSuccessful.canBeSet()) {
            threadData->firstWriteSuccessful.send(Void());
        }

        threadData->rowsWritten += queryKeys;
        threadData->bytesWritten += txnBytes;

        // wait
        wait(poisson(&last, (txnBytes + 1.0) / threadData->targetByteRate));
    }
}
|
|
|
|
// Launch one writer and one reader per directory (both gated on that directory's
// first snapshot being readable), plus a granule-summary validator, then let them
// run for the configured test duration.
Future<Void> start(Database const& cx) override {
    clients.reserve(3 * directories.size());
    for (auto& dir : directories) {
        // Wait for blob worker to initialize snapshot before starting test for that range
        Future<Void> firstSnapshot = waitFirstSnapshot(this, cx, dir, true);
        dir->summaryClient = validateGranuleSummaries(cx, normalKeys, dir->tenantName, dir->triggerSummaryComplete);
        Future<Void> writer = writeWorker(this, firstSnapshot, cx, dir);
        clients.push_back(timeout(writer, testDuration, Void()));
        Future<Void> reader = readWorker(this, firstSnapshot, cx, dir);
        clients.push_back(timeout(reader, testDuration, Void()));
    }
    return delay(testDuration);
}
|
|
|
|
// Verify that getBlobGranuleRanges through a tenant-scoped transaction returns a
// contiguous, prefix-free partition of the tenant's normal keyspace.
ACTOR Future<Void> checkTenantRanges(BlobGranuleCorrectnessWorkload* self,
                                     Database cx,
                                     Reference<ThreadData> threadData) {
    // check that reading ranges with tenant name gives valid result of ranges just for tenant, with no tenant
    // prefix
    loop {
        state Transaction tr(cx, threadData->tenantName);
        try {
            Standalone<VectorRef<KeyRangeRef>> ranges = wait(tr.getBlobGranuleRanges(normalKeys, 1000000));
            ASSERT(ranges.size() >= 1 && ranges.size() < 1000000);
            ASSERT(ranges.front().begin == normalKeys.begin);
            ASSERT(ranges.back().end == normalKeys.end);
            // ranges must tile the keyspace with no gaps or overlaps
            for (int i = 0; i < ranges.size() - 1; i++) {
                ASSERT(ranges[i].end == ranges[i + 1].begin);
            }
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}
|
|
|
|
// Final per-directory verification: checks that all committed data is readable from blob
// granules at the last write version, that granules are available at a later version, that
// tenant-scoped range listing is sane, and (on client 0) that a full clear merges back down.
// Prints a summary of per-directory stats and asserts overall success.
ACTOR Future<bool> checkDirectory(Database cx,
                                  BlobGranuleCorrectnessWorkload* self,
                                  Reference<ThreadData> threadData) {

	// Unblock the summary validation client so it can wind down before we wait on it below.
	if (threadData->triggerSummaryComplete.canBeSet()) {
		threadData->triggerSummaryComplete.send(Void());
	}
	state bool result = true;
	// BUGFIX: initialize to 0. The empty-writeVersions branch below never assigns this,
	// yet it is unconditionally printed later, so it previously read an indeterminate value.
	state int finalRowsValidated = 0;
	if (threadData->writeVersions.empty()) {
		// never had a successful write during the test, likely due to many chaos events. Just wait for granules to
		// become available and call that a pass, since writer is stopped and will never guarantee anything is
		// written
		if (BGW_DEBUG) {
			fmt::print("Directory {0} doing final availability check\n", threadData->directoryID);
		}
		wait(self->waitFirstSnapshot(self, cx, threadData, false));
	} else {
		// otherwise, read at last write version and ensure everything becomes available and matches
		// it's possible that waitFirstSnapshot finished but then writer never wrote anything before test timed out
		state Version readVersion = threadData->writeVersions.back();
		if (BGW_DEBUG) {
			fmt::print("Directory {0} doing final data check @ {1}\n", threadData->directoryID, readVersion);
		}
		std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> blob = wait(readFromBlob(
		    cx, threadData->bstore, normalKeys /*tenant handles range*/, 0, readVersion, threadData->tenantName));
		result = self->validateResult(threadData, blob, 0, std::numeric_limits<uint32_t>::max(), 0, readVersion);
		finalRowsValidated = blob.first.size();

		// then if we are still good, do another check at a higher version (not checking data) to ensure availability
		// of empty versions
		if (result) {
			if (BGW_DEBUG) {
				fmt::print("Directory {0} doing final availability check after data check\n",
				           threadData->directoryID);
			}
			wait(self->waitFirstSnapshot(self, cx, threadData, false));
		}
	}
	// read granule ranges with tenant and validate
	if (BGW_DEBUG) {
		fmt::print("Directory {0} checking tenant ranges\n", threadData->directoryID);
	}
	wait(self->checkTenantRanges(self, cx, threadData));

	// Fold in the error counters accumulated by the read/write workers during the test.
	state bool initialCheck = result;
	result &= threadData->mismatches == 0 && (threadData->timeTravelTooOld == 0);

	fmt::print("Blob Granule Workload Directory {0} {1}:\n", threadData->directoryID, result ? "passed" : "failed");
	fmt::print(" Final granule check {0}successful\n", initialCheck ? "" : "un");
	fmt::print(" {} Rows read in final check\n", finalRowsValidated);
	fmt::print(" {} mismatches\n", threadData->mismatches);
	fmt::print(" {} time travel too old\n", threadData->timeTravelTooOld);
	fmt::print(" {} errors\n", threadData->errors);
	fmt::print(" {} rows written\n", threadData->rowsWritten);
	fmt::print(" {} bytes written\n", threadData->bytesWritten);
	fmt::print(" {} unique keys\n", threadData->usedKeys.size());
	fmt::print(" {} real-time reads\n", threadData->reads);
	fmt::print(" {} time travel reads\n", threadData->timeTravelReads);
	fmt::print(" {} rows read\n", threadData->rowsRead);
	fmt::print(" {} bytes read\n", threadData->bytesRead);
	// FIXME: add above as details to trace event

	TraceEvent("BlobGranuleWorkloadChecked").detail("Directory", threadData->directoryID).detail("Result", result);

	// For some reason simulation is still passing when this fails?.. so assert for now
	ASSERT(result);

	if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && self->doMergeCheckAtEnd) {
		CODE_PROBE(true, "BGCorrectness clearing database and awaiting merge");
		wait(clearAndAwaitMerge(cx, threadData->directoryRange));
	}

	// validate that summary completes without error
	wait(threadData->summaryClient);

	return result;
}
|
|
|
|
// Check phase driver: runs checkDirectory for each directory (and, on client 0, a change
// feed cleanup check), then returns true only if every directory check passed.
ACTOR Future<bool> _check(Database cx, BlobGranuleCorrectnessWorkload* self) {
	// check error counts, and do an availability check at the end
	state std::vector<Future<bool>> dirChecks;
	// Only one client needs to verify feed cleanup; others get an already-ready future.
	state Future<Void> feedCleanupCheck =
	    self->clientId == 0 ? checkFeedCleanup(cx, BGW_DEBUG) : Future<Void>(Void());

	dirChecks.reserve(self->directories.size());
	for (auto& dir : self->directories) {
		dirChecks.push_back(self->checkDirectory(cx, self, dir));
	}

	state bool success = true;
	state int idx = 0;
	for (; idx < dirChecks.size(); idx++) {
		bool dirOk = wait(dirChecks[idx]);
		success &= dirOk;
	}
	wait(feedCleanupCheck);
	return success;
}
|
|
|
|
// Workload check phase entry point; delegates to the _check actor above.
Future<bool> check(Database const& cx) override { return _check(cx, this); }
|
|
// No perf metrics reported; per-directory stats are printed and traced in checkDirectory().
void getMetrics(std::vector<PerfMetric>& m) override {}
|
|
};
|
|
|
|
// Global factory instance — presumably registers the workload with the test framework at
// static-init time, following the standard WorkloadFactory pattern (verify against workloads.actor.h).
WorkloadFactory<BlobGranuleCorrectnessWorkload> BlobGranuleCorrectnessWorkloadFactory;
|