Add logic to disable splitting within a truncated tuple, and validate it in a test (#10106)

Josh Slocum 2023-05-03 10:23:46 -05:00 committed by GitHub
parent 7e872c4a59
commit 22155c84f4
7 changed files with 265 additions and 8 deletions
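In short: blob granule split keys can be aligned ("truncated") back to a tuple boundary via BG_KEY_TUPLE_TRUNCATE_OFFSET, and this change adds a knob, BG_ENABLE_SPLIT_TRUNCATED, controlling whether the blob manager may still split inside such a truncated tuple. A minimal sketch of the alignment idea, using the Tuple API (illustrative values, not code from this commit):

    // With bg_key_tuple_truncate_offset = 1, a proposed split key is aligned
    // back to the boundary of its first tuple element.
    Key proposed = Tuple::makeTuple(7, 1234).pack();
    Key aligned = Tuple::makeTuple(7).pack(); // truncated to the first element
    ASSERT(aligned <= proposed); // every key sharing the first element aligns to this same boundary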

fdbclient/ServerKnobs.cpp

@@ -1085,6 +1085,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BG_CONSISTENCY_CHECK_ENABLED, true ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_ENABLED = false;
init( BG_CONSISTENCY_CHECK_TARGET_SPEED_KB, 1000 ); if (randomize && BUGGIFY) BG_CONSISTENCY_CHECK_TARGET_SPEED_KB *= (deterministicRandom()->randomInt(2, 50) / 10);
init( BG_KEY_TUPLE_TRUNCATE_OFFSET, 0 );
init( BG_ENABLE_SPLIT_TRUNCATED, false ); if (randomize && BUGGIFY) BG_ENABLE_SPLIT_TRUNCATED = true;
init( BG_ENABLE_READ_DRIVEN_COMPACTION, true ); if (randomize && BUGGIFY) BG_ENABLE_READ_DRIVEN_COMPACTION = false;
init( BG_RDC_BYTES_FACTOR, 2 ); if (randomize && BUGGIFY) BG_RDC_BYTES_FACTOR = deterministicRandom()->randomInt(1, 10);
init( BG_RDC_READ_FACTOR, 3 ); if (randomize && BUGGIFY) BG_RDC_READ_FACTOR = deterministicRandom()->randomInt(1, 10);
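The new knob defaults to off, with BUGGIFY flipping it on so simulation exercises both code paths. For manual testing, server knobs can generally be overridden on the fdbserver command line; a sketch, assuming the standard --knob_ flag prefix:

    fdbserver ... --knob_bg_enable_split_truncated=true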

fdbclient/ServerKnobs.h

@@ -1092,6 +1092,7 @@ public:
int BG_MERGE_CANDIDATE_THRESHOLD_SECONDS;
int BG_MERGE_CANDIDATE_DELAY_SECONDS;
int BG_KEY_TUPLE_TRUNCATE_OFFSET;
bool BG_ENABLE_SPLIT_TRUNCATED;
bool BG_ENABLE_READ_DRIVEN_COMPACTION;
int BG_RDC_BYTES_FACTOR;
int BG_RDC_READ_FACTOR;

fdbserver/BlobManager.actor.cpp

@@ -578,12 +578,14 @@ static void alignKeyBoundary(Reference<BlobManagerData> bmData,
alignedKey = alignedKey.withPrefix(tenantData->entry.prefix, keys.arena());
}
// Only add the alignedKey if it's larger than the last key. If it's the same, drop the split.
// Only add the alignedKey if it's larger than the last key. Otherwise, drop the split unless splitting within a truncated tuple is allowed.
if (alignedKey <= keys.back()) {
// Set split boundary.
BlobGranuleMergeBoundary boundary = { /*buddy=*/true };
boundaries[key] = boundary;
keys.push_back_deep(keys.arena(), key);
if (SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED) {
// Set split boundary.
BlobGranuleMergeBoundary boundary = { /*buddy=*/true };
boundaries[key] = boundary;
keys.push_back_deep(keys.arena(), key);
} // else drop the split
} else {
keys.push_back_deep(keys.arena(), alignedKey);
}
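A condensed, annotated restatement of the new boundary decision above (a sketch, not a verbatim excerpt):

    // alignedKey is the proposed split key truncated to the configured tuple offset.
    if (alignedKey <= keys.back()) {
        // The aligned key falls at or before the previous boundary: the proposed
        // split would land inside a truncated tuple.
        if (SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED) {
            // Split anyway, recording a "buddy" merge boundary so the resulting
            // granules stay merge partners.
            BlobGranuleMergeBoundary boundary = { /*buddy=*/true };
            boundaries[key] = boundary;
            keys.push_back_deep(keys.arena(), key);
        }
        // else: drop the split, keeping the whole truncated tuple in one granule.
    } else {
        keys.push_back_deep(keys.arena(), alignedKey); // normal aligned split
    }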
@@ -1596,8 +1598,6 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
// FIXME: only need to align proposedSplitKey in the middle
state BlobGranuleSplitPoints finalSplit = wait(alignKeys(bmData, granuleRange, newRanges));
ASSERT(finalSplit.keys.size() > 2);
if (BM_DEBUG) {
fmt::print("Aligned split ({0}):\n", finalSplit.keys.size());
for (auto& it : finalSplit.keys) {
@@ -1605,6 +1605,8 @@ ACTOR Future<Void> reevaluateInitialSplit(Reference<BlobManagerData> bmData,
}
}
ASSERT(finalSplit.keys.size() > 2);
// Check the lock to see if it still holds the specified epoch and seqno, and that there are no files for the
// granule. If either of these is false, some other worker now has the granule. If there are files, it already
// succeeded at a split; if not, and this one fails too, it will retry and get back here.

fdbserver/BlobWorker.actor.cpp

@@ -941,13 +941,15 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
state std::string fileName = randomBGFilename(bwData->id, granuleID, version, ".snapshot");
state Standalone<GranuleSnapshot> snapshot;
state int64_t bytesRead = 0;
state bool canStopEarly =
(SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET == 0 || SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED);
state bool injectTooBig = initialSnapshot && g_network->isSimulated() && BUGGIFY_WITH_PROB(0.1);
wait(delay(0, TaskPriority::BlobWorkerUpdateStorage));
loop {
try {
if (initialSnapshot && snapshot.size() > 1 &&
if (initialSnapshot && snapshot.size() > 1 && canStopEarly &&
(injectTooBig || bytesRead >= 3 * SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES)) {
// throw transaction too old either on injection for simulation, or if snapshot would be too large now
throw transaction_too_old();
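Why the new canStopEarly guard (reasoning inferred from the change): stopping an initial snapshot early throws transaction_too_old so the granule gets re-split before snapshotting. With tuple truncation in effect and intra-tuple splits disallowed, that re-split can be dropped entirely, so the worker must finish the snapshot instead of retrying. The condition, annotated:

    bool canStopEarly =
        SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET == 0 // no tuple alignment in play, or
        || SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED;     // intra-tuple splits are allowed anyway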

fdbserver/workloads/BlobGranuleMergeBoundariesWorkload.actor.cpp

@@ -0,0 +1,229 @@
/*
* BlobGranuleMergeBoundariesWorkload.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2023 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/FDBTypes.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbclient/TenantManagement.actor.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include <cstring>
#include <limits>
#include "flow/actorcompiler.h" // This must be the last #include.
/*
* Verifies split behavior within a tuple prefix when bg_key_tuple_truncate_offset=1: writes several normal
* granules' worth of data to one tuple prefix, and validates that splitting within the tuple prefix is
* allowed or disallowed according to the configured BG_ENABLE_SPLIT_TRUNCATED behavior.
*/
struct BlobGranuleMergeBoundariesWorkload : TestWorkload {
static constexpr auto NAME = "BlobGranuleMergeBoundaries";
int targetGranules;
bool initAfter;
int nodeCount;
int targetValueLen;
Optional<TenantName> tenantName;
Optional<Reference<Tenant>> tenant;
BlobGranuleMergeBoundariesWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
targetGranules = 3 + sharedRandomNumber % 6;
sharedRandomNumber /= 6;
initAfter = (sharedRandomNumber % 4) == 0;
sharedRandomNumber /= 4;
targetValueLen = 100 * (1 + sharedRandomNumber % 10);
sharedRandomNumber /= 10;
int64_t targetBytes = targetGranules * SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES;
targetBytes = std::max<int64_t>(
1000000, targetBytes); // write at least 1 MB to avoid very small granule/byte sample issues
nodeCount = (int)(targetBytes / targetValueLen);
tenantName = "bgMergeBoundsTenant"_sr;
// FIXME: maybe enable for completeness at some point? We probably will never convert non-empty ranges to blob
// after 7.3
initAfter = false;
TraceEvent("BlobGranuleMergeBoundariesWorkloadInit")
.detail("TargetGranules", targetGranules)
.detail("InitAfter", initAfter)
.detail("TargetValSize", targetValueLen)
.detail("TargetBytes", targetBytes)
.detail("GranuleSize", SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_BYTES)
.detail("NodeCount", nodeCount);
}
Future<Void> setup(Database const& cx) override { return _setup(cx, this); }
ACTOR Future<Void> setUpBlobRange(Database cx, BlobGranuleMergeBoundariesWorkload* self) {
bool success = wait(cx->blobbifyRange(normalKeys, self->tenant));
ASSERT(success);
return Void();
}
// Functions required by `bulkSetup()`
// key is always a 2-tuple with the same first element and a different last element
Key keyForIndex(int n) { return Tuple::makeTuple(7, n).pack(); }
Value value(int n) {
// FIXME: shared with BlobGranuleCorrectnessWorkload
int valLen = deterministicRandom()->randomInt(1, 2 * targetValueLen);
valLen = std::max(10, valLen);
std::string v(valLen, 'z');
auto valFormatted = format("%08x", n);
ASSERT(valFormatted.size() <= v.size());
for (int i = 0; i < valFormatted.size(); i++) {
v[i] = valFormatted[i];
}
// copy into an arena
// TODO do this in original arena? a bit more efficient that way
Arena a;
return Standalone<StringRef>(StringRef(a, v), a);
}
Standalone<KeyValueRef> operator()(int n) { return KeyValueRef(keyForIndex(n), value(n)); }
ACTOR Future<Void> _setup(Database cx, BlobGranuleMergeBoundariesWorkload* self) {
if (self->clientId != 0) {
return Void();
}
TraceEvent("BlobGranuleMergeBoundariesInit")
.detail("TargetGranules", self->targetGranules)
.detail("InitAfter", self->initAfter);
// set up blob granules
wait(success(ManagementAPI::changeConfig(cx.getReference(), "blob_granules_enabled=1", true)));
Optional<TenantMapEntry> entry = wait(TenantAPI::createTenant(cx.getReference(), self->tenantName.get()));
ASSERT(entry.present());
self->tenant = makeReference<Tenant>(cx, self->tenantName.get());
if (!self->initAfter) {
wait(self->setUpBlobRange(cx, self));
TraceEvent("BlobGranuleMergeBoundariesSetupVerifying");
loop {
Version checkVersion = wait(cx->verifyBlobRange(normalKeys, latestVersion, self->tenant));
if (checkVersion != -1) {
break;
}
TraceEvent("BlobGranuleMergeBoundariesSetupVerifyRetrying");
wait(delay(1.0));
}
}
TraceEvent("BlobGranuleMergeBoundariesLoading");
// we only have one client and bulk setup divides the writes amongst them, so multiply node count by client
// count
wait(bulkSetup(cx,
self,
self->nodeCount * self->clientCount,
Promise<double>(),
true,
0.0,
1e12,
std::vector<uint64_t>(),
Promise<std::vector<std::pair<uint64_t, double>>>(),
0,
0.1,
0,
0,
{ self->tenant.get() }));
TraceEvent("BlobGranuleMergeBoundariesLoadingComplete");
if (self->initAfter) {
wait(self->setUpBlobRange(cx, self));
}
TraceEvent("BlobGranuleMergeBoundariesSetupComplete");
return Void();
}
Future<Void> start(Database const& cx) override {
// no test phase
return Void();
}
Future<bool> check(Database const& cx) override { return _check(cx, this); }
ACTOR Future<bool> _check(Database cx, BlobGranuleMergeBoundariesWorkload* self) {
if (self->clientId != 0) {
return true;
}
state Key tuplePrefix = Tuple::makeTuple(7).pack();
// FIXME: checking normalKeys finds another empty granule, that's metadata overhead we should fix at some point
state KeyRange tupleRange(KeyRangeRef(tuplePrefix, strinc(tuplePrefix)));
TraceEvent("BlobGranuleMergeBoundariesCheckStart").detail("Range", tupleRange);
loop {
Version checkVersion = wait(cx->verifyBlobRange(tupleRange, latestVersion, self->tenant));
if (checkVersion != -1) {
TraceEvent("BlobGranuleMergeBoundariesCheckRead").detail("CheckVersion", checkVersion);
break;
}
TraceEvent("BlobGranuleMergeBoundariesCheckRetrying");
wait(delay(1.0));
}
state Transaction tr(cx, self->tenant);
loop {
try {
Standalone<VectorRef<KeyRangeRef>> granules = wait(tr.getBlobGranuleRanges(tupleRange, 1000000));
TraceEvent("BlobGranuleMergeBoundariesCheckGranules")
.detail("GranuleCount", granules.size())
.detail("EnableSplitTruncated", SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED)
.detail("TruncateOffset", SERVER_KNOBS->BG_KEY_TUPLE_TRUNCATE_OFFSET);
if (SERVER_KNOBS->BG_ENABLE_SPLIT_TRUNCATED) {
// Test the test: ensure enough data was written that, with intra-tuple splitting enabled, we produce
// multiple granules, which makes the single-granule assertion below meaningful
// FIXME: sometimes behind granule resnapshotting means we still only have one granule so we can't
// assert > 1
ASSERT(granules.size() >= 1);
} else {
ASSERT(granules.size() == 1);
}
break;
} catch (Error& e) {
wait(tr.onError(e));
}
}
TraceEvent("BlobGranuleMergeBoundariesCheckDone");
return true;
}
void getMetrics(std::vector<PerfMetric>& m) override {}
};
WorkloadFactory<BlobGranuleMergeBoundariesWorkload> BlobGranuleMergeBoundariesWorkloadFactory;
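All of the workload's keys share a single tuple prefix, which is exactly what drives the alignment path in the blob manager. A sketch of the key shape, reusing the Tuple calls from keyForIndex above (index values illustrative):

    Key k0 = Tuple::makeTuple(7, 0).pack();   // first workload key
    Key kN = Tuple::makeTuple(7, 999).pack(); // a later workload key
    Key prefix = Tuple::makeTuple(7).pack();  // the shared first element
    ASSERT(k0.startsWith(prefix) && kN.startsWith(prefix));
    // With bg_key_tuple_truncate_offset = 1, every proposed split between these keys
    // aligns back to `prefix`, so the check phase expects exactly one granule unless
    // BG_ENABLE_SPLIT_TRUNCATED is set.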

tests/CMakeLists.txt

@@ -244,6 +244,7 @@ if(WITH_PYTHON)
endif()
add_fdb_test(TEST_FILES rare/BlobGranuleRanges.toml)
add_fdb_test(TEST_FILES rare/BlobGranuleMergeBoundaries.toml)
add_fdb_test(TEST_FILES rare/CheckRelocation.toml)
add_fdb_test(TEST_FILES rare/ClogTlog.toml)
add_fdb_test(TEST_FILES rare/ClogUnclog.toml)

tests/rare/BlobGranuleMergeBoundaries.toml

@@ -0,0 +1,21 @@
# This test is lower value than the other blob granule tests; it's essentially a unit test, so run it less frequently
testPriority = '10'
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
tenantModes = ['required']
[[knobs]]
bg_key_tuple_truncate_offset = 1
bg_metadata_source = "tenant"
enable_rest_kms_communication = true
[[test]]
testTitle = 'BlobGranuleMergeBoundaries'
[[test.workload]]
testName = 'BlobGranuleMergeBoundaries'
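
To run just this test in simulation, the usual invocation should work (paths assumed relative to a build directory):

    fdbserver -r simulation -f tests/rare/BlobGranuleMergeBoundaries.toml

Note that bg_key_tuple_truncate_offset is pinned to 1 here, while bg_enable_split_truncated is left to the BUGGIFY randomization in ServerKnobs.cpp, so the check phase must accept both the single-granule and multi-granule outcomes.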