Merge branch 'main' of github.com:sfc-gh-nwijetunga/foundationdb into nim/tenant-encryption-property

* 'main' of github.com:sfc-gh-nwijetunga/foundationdb: (42 commits)
  Get ShardedRocks ready for simulation test. (#7679)
  fixing specific unit test
  formatting
  addressing review comments
  Fix incorrect deserialization of FdbClientLogEvents::Event (#7707)
  Fix a crash bug during CC shutdown process (#7705)
  addressing review comments
  remove runAfter
  Add comments explaining the use of the TransactionState tenant() and hasTenant() functions
  Make sure resumeFromDataMoves() starts after resumeFromShards().
  Fix: during recovery, it was possible for tenant operations to briefly fail because the tenant mode is not known
  formatting
  Fixed granule purging bug and improved debugging for purging
  making purge failures fail test
  cleanup and polish
  Bug fix and cleanup
  First version of key-sorted delta files
  Added full granule read unit test
  Completed delta format unit test
  delta file test and delta generation
  ...
Nim Wijetunga 2022-07-27 08:13:30 -07:00
commit 50391c35b1
38 changed files with 2746 additions and 596 deletions

View File

@ -194,7 +194,7 @@ class BaseInfo(object):
if protocol_version >= PROTOCOL_VERSION_6_3:
self.dc_id = bb.get_bytes_with_length()
if protocol_version >= PROTOCOL_VERSION_7_1:
if bb.get_bytes(1):
if bb.get_bool():
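# get_bytes(1) returned a one-byte bytes object, which is always truthy in Python,
# so the optional tenant field was read unconditionally; get_bool() reads the flag's actual value.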
self.tenant = bb.get_bytes_with_length()
class GetVersionInfo(BaseInfo):

File diff suppressed because it is too large.

View File

@ -3234,13 +3234,26 @@ TenantInfo TransactionState::getTenantInfo() {
} else if (!t.present()) {
return TenantInfo();
} else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) {
throw tenants_disabled();
// If we are running provisional proxies, we allow a tenant request to go through since we don't know the tenant
// mode. Such a transaction would not be allowed to commit without enabling provisional commits because either
// the commit proxies will be provisional or the read version will be too old.
if (!cx->clientInfo->get().grvProxies.empty() && !cx->clientInfo->get().grvProxies[0].provisional) {
throw tenants_disabled();
} else {
ASSERT(!useProvisionalProxies);
}
}
ASSERT(tenantId != TenantInfo::INVALID_TENANT);
return TenantInfo(t.get(), tenantId);
}
// Returns the tenant used in this transaction. If the tenant is unset and raw access isn't specified, then the default
// tenant from DatabaseContext is applied to this transaction (note: the default tenant is typically unset, but in
// simulation could be something different).
//
// This function should not be called in the transaction constructor or in the setOption function to allow a user the
// opportunity to set raw access.
Optional<TenantName> const& TransactionState::tenant() {
if (tenantSet) {
return tenant_;
@ -3253,6 +3266,9 @@ Optional<TenantName> const& TransactionState::tenant() {
}
}
// Returns true if the tenant has been set, but does not cause default tenant resolution. This is useful in setOption
// (where we do not want to call tenant()) if we want to enforce that an option not be set on a Tenant transaction (e.g.
// for raw access).
bool TransactionState::hasTenant() const {
return tenantSet && tenant_.present();
}
@ -6570,6 +6586,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, Optional<Strin
case FDBTransactionOptions::USE_PROVISIONAL_PROXIES:
validateOptionValueNotPresent(value);
if (trState->hasTenant()) {
Error e = invalid_option();
TraceEvent(SevWarn, "TenantTransactionUseProvisionalProxies").error(e).detail("Tenant", trState->tenant());
throw e;
}
trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES;
trState->useProvisionalProxies = UseProvisionalProxies::True;
break;
@ -9388,11 +9409,20 @@ Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> resu
Reference<DatabaseContext>::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped);
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingChangeFeeds(
Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
Version v = invalidVersion;
for (auto& it : feedMetadataVersions) {
if (it.second > v && it.first.intersects(range)) {
v = it.second;
}
}
return v;
}
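For intuition, the scan above just takes the maximum recorded version over the ranges that intersect the query range. A minimal standalone sketch of the same logic, with a toy Interval type and plain ints standing in for KeyRangeRef and Version (not flow code):

#include <cassert>
#include <utility>
#include <vector>

// Toy stand-ins: an int version and a half-open [begin, end) interval.
struct Interval {
    int begin, end;
    bool intersects(const Interval& o) const { return begin < o.end && o.begin < end; }
};

// Mirrors OverlappingChangeFeedsInfo::getFeedMetadataVersion: the max recorded
// version among ranges intersecting the query range, else -1 (invalidVersion stand-in).
int getFeedMetadataVersion(const std::vector<std::pair<Interval, int>>& feedMetadataVersions,
                           const Interval& range) {
    int v = -1;
    for (const auto& it : feedMetadataVersions) {
        if (it.second > v && it.first.intersects(range)) {
            v = it.second;
        }
    }
    return v;
}

int main() {
    std::vector<std::pair<Interval, int>> m = { { { 0, 10 }, 100 }, { { 10, 20 }, 200 } };
    assert(getFeedMetadataVersion(m, { 5, 12 }) == 200); // intersects both; max wins
    assert(getFeedMetadataVersion(m, { 25, 30 }) == -1); // no intersection
    return 0;
}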
ACTOR Future<OverlappingChangeFeedsReply> singleLocationOverlappingChangeFeeds(Database cx,
Reference<LocationInfo> location,
KeyRangeRef range,
Version minVersion) {
state OverlappingChangeFeedsRequest req;
req.range = range;
req.minVersion = minVersion;
@ -9404,16 +9434,16 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> singleLocationOverlappingC
TaskPriority::DefaultPromiseEndpoint,
AtMostOnce::False,
cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
return rep.rangeIds;
return rep;
}
bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) {
return i.rangeId < j.rangeId;
return i.feedId < j.feedId;
}
ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
ACTOR Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
KeyRangeRef range,
Version minVersion) {
state Database cx(db);
state Span span("NAPI:GetOverlappingChangeFeeds"_loc);
@ -9439,19 +9469,33 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
throw all_alternatives_failed();
}
state std::vector<Future<std::vector<OverlappingChangeFeedEntry>>> allOverlappingRequests;
state std::vector<Future<OverlappingChangeFeedsReply>> allOverlappingRequests;
for (auto& it : locations) {
allOverlappingRequests.push_back(
singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion));
}
wait(waitForAll(allOverlappingRequests));
std::vector<OverlappingChangeFeedEntry> result;
for (auto& it : allOverlappingRequests) {
result.insert(result.end(), it.get().begin(), it.get().end());
OverlappingChangeFeedsInfo result;
std::unordered_map<KeyRef, OverlappingChangeFeedEntry> latestFeedMetadata;
for (int i = 0; i < locations.size(); i++) {
result.arena.dependsOn(allOverlappingRequests[i].get().arena);
result.arena.dependsOn(locations[i].range.arena());
result.feedMetadataVersions.push_back(
{ locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion });
for (auto& it : allOverlappingRequests[i].get().feeds) {
auto res = latestFeedMetadata.insert({ it.feedId, it });
if (!res.second) {
CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version");
if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) {
res.first->second = it;
}
}
}
}
for (auto& it : latestFeedMetadata) {
result.feeds.push_back(result.arena, it.second);
}
std::sort(result.begin(), result.end(), compareChangeFeedResult);
result.resize(std::unique(result.begin(), result.end()) - result.begin());
return result;
} catch (Error& e) {
if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) {
@ -9464,8 +9508,7 @@ ACTOR Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeedsA
}
}
Future<std::vector<OverlappingChangeFeedEntry>> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range,
Version minVersion) {
Future<OverlappingChangeFeedsInfo> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) {
return getOverlappingChangeFeedsActor(Reference<DatabaseContext>::addRef(this), range, minVersion);
}
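The dedup loop above keeps, per feed id, the entry with the highest feedMetadataVersion seen across the per-shard replies. A self-contained sketch of that insert-or-replace idiom, using std::string and a hypothetical Entry type in place of the flow types:

#include <cassert>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for OverlappingChangeFeedEntry.
struct Entry {
    std::string feedId;
    long feedMetadataVersion = 0;
};

// Keep the newest entry per feed id, as the dedup loop does across shard replies.
void addLatest(std::unordered_map<std::string, Entry>& latest, const Entry& e) {
    auto res = latest.insert({ e.feedId, e });
    if (!res.second && res.first->second.feedMetadataVersion < e.feedMetadataVersion) {
        // Same feed reported by another shard: keep the higher metadata version.
        res.first->second = e;
    }
}

int main() {
    std::unordered_map<std::string, Entry> latest;
    addLatest(latest, { "feedA", 5 });
    addLatest(latest, { "feedA", 9 });
    addLatest(latest, { "feedB", 3 });
    assert(latest.size() == 2);
    assert(latest["feedA"].feedMetadataVersion == 9);
    return 0;
}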
@ -9589,7 +9632,7 @@ ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
state bool loadedTenantPrefix = false;
// FIXME: implement force
if (!force) {
if (force) {
throw unsupported_operation();
}

View File

@ -701,8 +701,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( FETCH_BLOCK_BYTES, 2e6 );
init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6;
init( FETCH_KEYS_PARALLELISM, 2 );
init( FETCH_KEYS_PARALLELISM_FULL, 10 );
init( FETCH_KEYS_LOWER_PRIORITY, 0 );
init( FETCH_CHANGEFEED_PARALLELISM, 2 );
init( FETCH_CHANGEFEED_PARALLELISM, 4 );
init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 );
init( BUGGIFY_BLOCK_BYTES, 10000 );
init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS );
@ -907,11 +908,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
// BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant"
init( BG_RANGE_SOURCE, "tenant" );
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
init( BG_METADATA_SOURCE, "knobs" );
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (simulationMediumShards || (randomize && BUGGIFY) ) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_CHUNKS, 100 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNKS = 1 << deterministicRandom()->randomInt(0, 8);
init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000;
init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8));
init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 );
init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 );
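// Derived (non-BUGGIFY) defaults: BG_DELTA_BYTES_BEFORE_COMPACT = 10000000/2 = 5000000; BG_DELTA_FILE_TARGET_BYTES = 5000000/10 = 500000.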
init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7));
init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15);
init( BG_MAX_MERGE_FANIN, 10 ); if( randomize && BUGGIFY ) BG_MAX_MERGE_FANIN = deterministicRandom()->randomInt(2, 15);
init( BG_HOT_SNAPSHOT_VERSIONS, 5000000 );

View File

@ -46,6 +46,7 @@ struct GranuleSnapshot : VectorRef<KeyValueRef> {
}
};
// Deltas in version order
struct GranuleDeltas : VectorRef<MutationsAndVersionRef> {
constexpr static FileIdentifier file_identifier = 8563013;

View File

@ -27,11 +27,15 @@
#include "flow/CompressionUtils.h"
Value serializeChunkedSnapshot(Standalone<GranuleSnapshot> snapshot,
int chunks,
int chunkSize,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = Optional<BlobGranuleCipherKeysCtx>());
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = {});
// FIXME: support sorted and chunked delta files
Value serializeChunkedDeltaFile(Standalone<GranuleDeltas> deltas,
const KeyRangeRef& fileRange,
int chunkSize,
Optional<CompressionFilter> compressFilter,
Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx = {});
ErrorOr<RangeResult> loadAndMaterializeBlobGranules(const Standalone<VectorRef<BlobGranuleChunkRef>>& files,
const KeyRangeRef& keyRange,

View File

@ -207,6 +207,16 @@ struct KeyRangeLocationInfo {
: tenantEntry(tenantEntry), range(range), locations(locations) {}
};
struct OverlappingChangeFeedsInfo {
Arena arena;
VectorRef<OverlappingChangeFeedEntry> feeds;
// Would prefer to use a KeyRangeMap here, but it complicates the copy/move constructors
std::vector<std::pair<KeyRangeRef, Version>> feedMetadataVersions;
// For a feed that wasn't present in this reply, returns the metadata version at which it would have been fetched.
Version getFeedMetadataVersion(const KeyRangeRef& feedRange) const;
};
class DatabaseContext : public ReferenceCounted<DatabaseContext>, public FastAllocated<DatabaseContext>, NonCopyable {
public:
static DatabaseContext* allocateOnForeignThread() {
@ -361,7 +371,7 @@ public:
int replyBufferSize = -1,
bool canReadPopped = true);
Future<std::vector<OverlappingChangeFeedEntry>> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion);
Future<Void> popChangeFeedMutations(Key rangeID, Version version);
Future<Key> purgeBlobGranules(KeyRange keyRange,

View File

@ -659,6 +659,7 @@ public:
int FETCH_BLOCK_BYTES;
int FETCH_KEYS_PARALLELISM_BYTES;
int FETCH_KEYS_PARALLELISM;
int FETCH_KEYS_PARALLELISM_FULL;
int FETCH_KEYS_LOWER_PRIORITY;
int FETCH_CHANGEFEED_PARALLELISM;
int SERVE_FETCH_CHECKPOINT_PARALLELISM;
@ -887,8 +888,9 @@ public:
std::string BG_METADATA_SOURCE;
int BG_SNAPSHOT_FILE_TARGET_BYTES;
int BG_SNAPSHOT_FILE_TARGET_CHUNKS;
int BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES;
int BG_DELTA_FILE_TARGET_BYTES;
int BG_DELTA_FILE_TARGET_CHUNK_BYTES;
int BG_DELTA_BYTES_BEFORE_COMPACT;
int BG_MAX_SPLIT_FANOUT;
int BG_MAX_MERGE_FANIN;

View File

@ -970,39 +970,51 @@ struct FetchCheckpointKeyValuesRequest {
};
struct OverlappingChangeFeedEntry {
Key rangeId;
KeyRange range;
KeyRef feedId;
KeyRangeRef range;
Version emptyVersion;
Version stopVersion;
Version feedMetadataVersion;
bool operator==(const OverlappingChangeFeedEntry& r) const {
return rangeId == r.rangeId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion;
return feedId == r.feedId && range == r.range && emptyVersion == r.emptyVersion &&
stopVersion == r.stopVersion && feedMetadataVersion == r.feedMetadataVersion;
}
OverlappingChangeFeedEntry() {}
OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion)
: rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {}
OverlappingChangeFeedEntry(KeyRef const& feedId,
KeyRangeRef const& range,
Version emptyVersion,
Version stopVersion,
Version feedMetadataVersion)
: feedId(feedId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion),
feedMetadataVersion(feedMetadataVersion) {}
OverlappingChangeFeedEntry(Arena& arena, const OverlappingChangeFeedEntry& rhs)
: feedId(arena, rhs.feedId), range(arena, rhs.range), emptyVersion(rhs.emptyVersion),
stopVersion(rhs.stopVersion), feedMetadataVersion(rhs.feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeId, range, emptyVersion, stopVersion);
serializer(ar, feedId, range, emptyVersion, stopVersion, feedMetadataVersion);
}
};
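The new two-argument constructor is the usual flow idiom for deep-copying a Ref type into a destination arena so the reply's memory can be released. A rough standalone analogue, with a toy Arena (a deque of strings, not flow's block allocator) and a hypothetical FeedRef:

#include <deque>
#include <string>
#include <string_view>

// Toy Arena: owns copies of byte strings. A deque gives the property we need
// here: storage for copied bytes stays stable as more copies are added.
struct Arena {
    std::deque<std::string> blocks;
    std::string_view copy(std::string_view bytes) { return blocks.emplace_back(bytes); }
};

// Hypothetical Ref type: borrows its id bytes, like the KeyRef inside the entry.
struct FeedRef {
    std::string_view id;
    long version = 0;
    FeedRef() = default;
    FeedRef(std::string_view id, long version) : id(id), version(version) {}
    // Deep copy into a destination arena, mirroring
    // OverlappingChangeFeedEntry(Arena&, const OverlappingChangeFeedEntry&).
    FeedRef(Arena& a, const FeedRef& rhs) : id(a.copy(rhs.id)), version(rhs.version) {}
};

int main() {
    Arena replyArena, resultArena;
    FeedRef original(replyArena.copy("feedA"), 7);
    FeedRef copied(resultArena, original); // id bytes now owned by resultArena
    return copied.version == 7 ? 0 : 1;
}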
struct OverlappingChangeFeedsReply {
constexpr static FileIdentifier file_identifier = 11815134;
std::vector<OverlappingChangeFeedEntry> rangeIds;
VectorRef<OverlappingChangeFeedEntry> feeds;
bool cached;
Arena arena;
Version feedMetadataVersion;
OverlappingChangeFeedsReply() : cached(false) {}
explicit OverlappingChangeFeedsReply(std::vector<OverlappingChangeFeedEntry> const& rangeIds)
: rangeIds(rangeIds), cached(false) {}
OverlappingChangeFeedsReply() : cached(false), feedMetadataVersion(invalidVersion) {}
explicit OverlappingChangeFeedsReply(VectorRef<OverlappingChangeFeedEntry> const& feeds,
Version feedMetadataVersion)
: feeds(feeds), cached(false), feedMetadataVersion(feedMetadataVersion) {}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, rangeIds, arena);
serializer(ar, feeds, arena, feedMetadataVersion);
}
};

View File

@ -23,6 +23,7 @@
#include "flow/ProtocolVersion.h"
#include <algorithm>
#include <string>
#include <limits>
#pragma once
#include "flow/flow.h"
@ -469,6 +470,8 @@ public:
bool setDiffProtocol; // true if a process with a different protocol version has been started
bool allowStorageMigrationTypeChange = false;
double injectTargetedSSRestartTime = std::numeric_limits<double>::max();
double injectSSDelayTime = std::numeric_limits<double>::max();
flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); };
void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); };

View File

@ -143,30 +143,34 @@ bool compareFDBAndBlob(RangeResult fdb,
}
}
printf("Chunks:\n");
for (auto& chunk : blob.second) {
printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str());
printf(" SnapshotFile:\n %s\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
printf(" DeltaFiles:\n");
for (auto& df : chunk.deltaFiles) {
printf(" %s\n", df.toString().c_str());
}
printf(" Deltas: (%d)", chunk.newDeltas.size());
if (chunk.newDeltas.size() > 0) {
fmt::print(" with version [{0} - {1}]",
chunk.newDeltas[0].version,
chunk.newDeltas[chunk.newDeltas.size() - 1].version);
}
fmt::print(" IncludedVersion: {}\n", chunk.includedVersion);
}
printf("\n");
printGranuleChunks(blob.second);
}
}
return correct;
}
void printGranuleChunks(const Standalone<VectorRef<BlobGranuleChunkRef>>& chunks) {
printf("Chunks:\n");
for (auto& chunk : chunks) {
printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str());
printf(" SnapshotFile:\n %s\n",
chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : "<none>");
printf(" DeltaFiles:\n");
for (auto& df : chunk.deltaFiles) {
printf(" %s\n", df.toString().c_str());
}
printf(" Deltas: (%d)", chunk.newDeltas.size());
if (chunk.newDeltas.size() > 0) {
fmt::print(" with version [{0} - {1}]",
chunk.newDeltas[0].version,
chunk.newDeltas[chunk.newDeltas.size() - 1].version);
}
fmt::print(" IncludedVersion: {}\n", chunk.includedVersion);
}
printf("\n");
}
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range) {
// clear key range and check whether it is merged or not, repeatedly
state Transaction tr(cx);

View File

@ -52,6 +52,7 @@
*/
#define BM_DEBUG false
#define BM_PURGE_DEBUG false
void handleClientBlobRange(KeyRangeMap<bool>* knownBlobRanges,
Arena& ar,
@ -1649,7 +1650,9 @@ ACTOR Future<Void> persistMergeGranulesDone(Reference<BlobManagerData> bmData,
state Key lockKey = blobGranuleLockKeyFor(parentRange);
state Future<Optional<Value>> oldLockFuture = tr->get(lockKey);
wait(updateChangeFeed(tr,
// This has to be a raw (non-RYW) transaction for the change feed destroy mutation to apply properly
// TODO: fix this better! (privatize change feed key clear)
wait(updateChangeFeed(&tr->getTransaction(),
granuleIDToCFKey(parentGranuleIDs[parentIdx]),
ChangeFeedStatus::CHANGE_FEED_DESTROY,
parentRange));
@ -3168,8 +3171,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
Key historyKey,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: init\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: init\n", self->epoch, granuleId.toString());
}
// if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially
@ -3195,8 +3198,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
filesToDelete.emplace_back(fname);
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: deleting {2} files\n",
self->epoch,
granuleId.toString(),
filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {}\n", filename.c_str());
}
@ -3209,8 +3215,9 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
wait(waitForAll(deletions));
// delete metadata in FDB (history entry and file keys)
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: deleting history and file keys\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print(
"BM {0} Fully deleting granule {1}: deleting history and file keys\n", self->epoch, granuleId.toString());
}
state Transaction tr(self->db);
@ -3229,8 +3236,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
}
}
if (BM_DEBUG) {
fmt::print("Fully deleting granule {0}: success\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: success\n", self->epoch, granuleId.toString());
}
TraceEvent("GranuleFullPurge", self->id)
@ -3242,6 +3249,8 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
++self->stats.granulesFullyPurged;
self->stats.filesPurged += filesToDelete.size();
CODE_PROBE(true, "full granule purged");
return Void();
}
@ -3257,8 +3266,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
UID granuleId,
Version purgeVersion,
KeyRange granuleRange) {
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: init\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: init\n", self->epoch, granuleId.toString());
}
state Reference<BlobConnectionProvider> bstore = wait(getBStoreForGranule(self, granuleRange));
@ -3307,8 +3316,11 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
filesToDelete.emplace_back(fname);
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: deleting {2} files\n",
self->epoch,
granuleId.toString(),
filesToDelete.size());
for (auto filename : filesToDelete) {
fmt::print(" - {0}\n", filename);
}
@ -3325,8 +3337,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
wait(waitForAll(deletions));
// delete metadata in FDB (deleted file keys)
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: deleting file keys\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: deleting file keys\n", self->epoch, granuleId.toString());
}
state Transaction tr(self->db);
@ -3345,8 +3357,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
}
}
if (BM_DEBUG) {
fmt::print("Partially deleting granule {0}: success\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Partially deleting granule {1}: success\n", self->epoch, granuleId.toString());
}
TraceEvent("GranulePartialPurge", self->id)
.detail("Epoch", self->epoch)
@ -3357,6 +3369,8 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
++self->stats.granulesPartiallyPurged;
self->stats.filesPurged += filesToDelete.size();
CODE_PROBE(true, " partial granule purged");
return Void();
}
@ -3369,8 +3383,9 @@ ACTOR Future<Void> partiallyDeleteGranule(Reference<BlobManagerData> self,
* processing this purge intent.
*/
ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range, Version purgeVersion, bool force) {
if (BM_DEBUG) {
fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} purgeRange starting for range [{1} - {2}) @ purgeVersion={3}, force={4}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion,
@ -3392,8 +3407,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// track which granules we have already added to traversal
// note: (startKey, startVersion) uniquely identifies a granule
state std::unordered_set<std::pair<const uint8_t*, Version>, boost::hash<std::pair<const uint8_t*, Version>>>
visited;
state std::unordered_set<std::pair<std::string, Version>, boost::hash<std::pair<std::string, Version>>> visited;
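// Keyed by the key's bytes (std::string) rather than its data pointer, so the visited check compares values instead of buffer identity.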
// find all active granules (that comprise the range) and add to the queue
state KeyRangeMap<UID>::Ranges activeRanges = self->workerAssignments.intersectingRanges(range);
@ -3404,8 +3418,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state KeyRangeMap<UID>::iterator activeRange;
for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) {
if (BM_DEBUG) {
fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Checking if active range [{1} - {2}), owned by BW {3}, should be purged\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable(),
activeRange.value().toString());
@ -3413,6 +3428,10 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// assumption: purge boundaries must respect granule boundaries
if (activeRange.begin() < range.begin || activeRange.end() > range.end) {
TraceEvent(SevWarn, "GranulePurgeRangesUnaligned", self->id)
.detail("Epoch", self->epoch)
.detail("PurgeRange", range)
.detail("GranuleRange", activeRange.range());
continue;
}
@ -3422,20 +3441,29 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
loop {
try {
if (BM_DEBUG) {
fmt::print("Fetching latest history entry for range [{0} - {1})\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fetching latest history entry for range [{1} - {2})\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable());
}
// FIXME: doing this serially will likely be too slow for large purges
Optional<GranuleHistory> history = wait(getLatestGranuleHistory(&tr, activeRange.range()));
// TODO: can we tell from the krm that this range is not valid, so that we don't need to do a
// get
if (history.present()) {
if (BM_DEBUG) {
printf("Adding range to history queue\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Adding range to history queue: [{1} - {2}) @ {3} ({4})\n",
self->epoch,
activeRange.begin().printable(),
activeRange.end().printable(),
history.get().version,
(void*)(activeRange.range().begin.begin()));
}
visited.insert({ activeRange.range().begin.begin(), history.get().version });
visited.insert({ activeRange.range().begin.toString(), history.get().version });
historyEntryQueue.push({ activeRange.range(), history.get().version, MAX_VERSION });
} else if (BM_PURGE_DEBUG) {
fmt::print("BM {0} No history for range, ignoring\n", self->epoch);
}
break;
} catch (Error& e) {
@ -3444,8 +3472,12 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
}
if (BM_DEBUG) {
printf("Beginning BFS traversal of history\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Beginning BFS traversal of {1} history items for range [{2} - {3}) \n",
self->epoch,
historyEntryQueue.size(),
range.begin.printable(),
range.end.printable());
}
while (!historyEntryQueue.empty()) {
// process the node at the front of the queue and remove it
@ -3455,8 +3487,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
std::tie(currRange, startVersion, endVersion) = historyEntryQueue.front();
historyEntryQueue.pop();
if (BM_DEBUG) {
fmt::print("Processing history node [{0} - {1}) with versions [{2}, {3})\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Processing history node [{1} - {2}) with versions [{3}, {4})\n",
self->epoch,
currRange.begin.printable(),
currRange.end.printable(),
startVersion,
@ -3481,11 +3514,15 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
}
if (!foundHistory) {
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} No history for this node, skipping\n", self->epoch);
}
continue;
}
if (BM_DEBUG) {
fmt::print("Found history entry for this node. It's granuleID is {0}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Found history entry for this node. It's granuleID is {1}\n",
self->epoch,
currHistoryNode.granuleID.toString());
}
@ -3496,33 +3533,45 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// and so this granule should be partially deleted
// - otherwise, this granule is active, so don't schedule it for deletion
if (force || endVersion <= purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString());
if (BM_PURGE_DEBUG) {
fmt::print(
"BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString());
}
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange });
} else if (startVersion < purgeVersion) {
if (BM_DEBUG) {
fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Granule {1} will be partially deleted\n",
self->epoch,
currHistoryNode.granuleID.toString());
}
toPartiallyDelete.push_back({ currHistoryNode.granuleID, currRange });
}
// add all of the node's parents to the queue
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Checking {1} parents\n", self->epoch, currHistoryNode.parentVersions.size());
}
for (int i = 0; i < currHistoryNode.parentVersions.size(); i++) {
// if we already added this node to queue, skip it; otherwise, mark it as visited
KeyRangeRef parentRange(currHistoryNode.parentBoundaries[i], currHistoryNode.parentBoundaries[i + 1]);
Version parentVersion = currHistoryNode.parentVersions[i];
if (visited.count({ parentRange.begin.begin(), parentVersion })) {
if (BM_DEBUG) {
fmt::print("Already added {0} to queue, so skipping it\n", currHistoryNode.granuleID.toString());
std::string beginStr = parentRange.begin.toString();
if (!visited.insert({ beginStr, parentVersion }).second) {
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Already added [{1} - {2}) @ {3} - {4} to queue, so skipping it\n",
self->epoch,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
startVersion);
}
continue;
}
visited.insert({ parentRange.begin.begin(), parentVersion });
if (BM_DEBUG) {
fmt::print("Adding parent [{0} - {1}) with versions [{2} - {3}) to queue\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Adding parent [{1} - {2}) @ {3} - {4} to queue\n",
self->epoch,
parentRange.begin.printable(),
parentRange.end.printable(),
parentVersion,
@ -3550,10 +3599,19 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// we won't run into any issues with trying to "re-delete" a blob file since deleting
// a file that doesn't exist is considered successful
TraceEvent("PurgeGranulesTraversalComplete", self->id)
.detail("Epoch", self->epoch)
.detail("Range", range)
.detail("PurgeVersion", purgeVersion)
.detail("Force", force)
.detail("VisitedCount", visited.size())
.detail("DeletingFullyCount", toFullyDelete.size())
.detail("DeletingPartiallyCount", toPartiallyDelete.size());
state std::vector<Future<Void>> partialDeletions;
state int i;
if (BM_DEBUG) {
fmt::print("{0} granules to fully delete\n", toFullyDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to fully delete\n", self->epoch, toFullyDelete.size());
}
for (i = toFullyDelete.size() - 1; i >= 0; --i) {
state UID granuleId;
@ -3561,22 +3619,22 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
KeyRange keyRange;
std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_DEBUG) {
fmt::print("About to fully delete granule {0}\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
}
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, range));
}
if (BM_DEBUG) {
fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size());
}
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId;
KeyRange range;
std::tie(granuleId, range) = toPartiallyDelete[i];
if (BM_DEBUG) {
fmt::print("About to partially delete granule {0}\n", granuleId.toString());
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to partially delete granule {1}\n", self->epoch, granuleId.toString());
}
partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion, range));
}
@ -3588,8 +3646,9 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
// another purgeIntent that got written for this table while we were processing this one.
// If that is the case, we should not clear the key. Otherwise, we can just clear the key.
if (BM_DEBUG) {
fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: Successfully purged range [{1} - {2}) at purgeVersion={3}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion);
@ -3601,6 +3660,8 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
.detail("PurgeVersion", purgeVersion)
.detail("Force", force);
CODE_PROBE(true, "range purge complete");
++self->stats.purgesProcessed;
return Void();
}
@ -3651,6 +3712,7 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
// TODO: replace 10000 with a knob
state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000));
if (purgeIntents.size()) {
CODE_PROBE(true, "BM found purges to process");
int rangeIdx = 0;
for (; rangeIdx < purgeIntents.size(); ++rangeIdx) {
Version purgeVersion;
@ -3672,8 +3734,9 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
}
purgeMap.insert(range, std::make_pair(purgeVersion, force));
if (BM_DEBUG) {
fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n",
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} about to purge range [{1} - {2}) @ {3}, force={4}\n",
self->epoch,
range.begin.printable(),
range.end.printable(),
purgeVersion,
@ -3725,9 +3788,11 @@ ACTOR Future<Void> monitorPurgeKeys(Reference<BlobManagerData> self) {
}
}
if (BM_DEBUG) {
printf("Done clearing current set of purge intents.\n");
if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Done clearing current set of purge intents.\n", self->epoch);
}
CODE_PROBE(true, "BM finished processing purge intents");
}
}

View File

@ -602,7 +602,20 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
state std::string fileName = randomBGFilename(bwData->id, granuleID, currentDeltaVersion, ".delta");
state Value serialized = ObjectWriter::toValue(deltasToWrite, Unversioned());
state Optional<BlobGranuleCipherKeysCtx> cipherKeysCtx;
state Optional<BlobGranuleCipherKeysMeta> cipherKeysMeta;
state Arena arena;
// TODO support encryption, figure out proper state stuff
/*if (isBlobFileEncryptionSupported()) {
BlobGranuleCipherKeysCtx ciphKeysCtx = wait(getLatestGranuleCipherKeys(bwData, keyRange, &arena));
cipherKeysCtx = ciphKeysCtx;
cipherKeysMeta = BlobGranuleCipherKeysCtx::toCipherKeysMeta(cipherKeysCtx.get());
}*/
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
state Value serialized = serializeChunkedDeltaFile(
deltasToWrite, keyRange, SERVER_KNOBS->BG_DELTA_FILE_TARGET_CHUNK_BYTES, compressFilter, cipherKeysCtx);
state size_t serializedSize = serialized.size();
// Free up deltasToWrite here to reduce memory
@ -640,7 +653,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
Key dfKey = blobGranuleFileKeyFor(granuleID, currentDeltaVersion, 'D');
// TODO change once we support file multiplexing
Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize);
Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize, cipherKeysMeta);
tr->set(dfKey, dfValue);
if (oldGranuleComplete.present()) {
@ -668,7 +681,7 @@ ACTOR Future<BlobFileIndex> writeDeltaFile(Reference<BlobWorkerData> bwData,
wait(delay(deterministicRandom()->random01()));
}
// FIXME: change when we implement multiplexing
return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize);
return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize, cipherKeysMeta);
} catch (Error& e) {
wait(tr->onError(e));
}
@ -753,8 +766,8 @@ ACTOR Future<BlobFileIndex> writeSnapshot(Reference<BlobWorkerData> bwData,
}
Optional<CompressionFilter> compressFilter = getBlobFileCompressFilter();
state Value serialized =
serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS, compressFilter, cipherKeysCtx);
state Value serialized = serializeChunkedSnapshot(
snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, compressFilter, cipherKeysCtx);
state size_t serializedSize = serialized.size();
// free snapshot to reduce memory
@ -970,6 +983,7 @@ ACTOR Future<BlobFileIndex> compactFromBlob(Reference<BlobWorkerData> bwData,
snapshotF.cipherKeysMeta);
// TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known
// FIXME: get cipher keys for delta as well!
if (chunk.snapshotFile.get().cipherKeysMetaRef.present()) {
ASSERT(isBlobFileEncryptionSupported());
BlobGranuleCipherKeysCtx cipherKeysCtx =
@ -3187,6 +3201,8 @@ ACTOR Future<Void> doBlobGranuleFileRequest(Reference<BlobWorkerData> bwData, Bl
getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &rep.arena);
}
// FIXME: get cipher keys for delta files too!
// new deltas (if version is larger than version of last delta file)
// FIXME: do trivial key bounds here if key range is not fully contained in request key
// range

View File

@ -484,8 +484,11 @@ public:
}
// TODO: unit test needed
ACTOR static Future<Void> resumeFromDataMoves(Reference<DataDistributor> self) {
ACTOR static Future<Void> resumeFromDataMoves(Reference<DataDistributor> self, Future<Void> readyToStart) {
state KeyRangeMap<std::shared_ptr<DataMove>>::iterator it = self->initData->dataMoveMap.ranges().begin();
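// Waiting on readyToStart (rather than chaining with runAfter) guarantees resumeFromShards() completes before any data moves are processed.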
wait(readyToStart);
for (; it != self->initData->dataMoveMap.ranges().end(); ++it) {
const DataMoveMetaData& meta = it.value()->meta;
if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) {
@ -528,8 +531,8 @@ public:
// usage if it turns out to be a problem.
Future<Void> resumeRelocations() {
ASSERT(shardsAffectedByTeamFailure); // has to be allocated
return runAfter(resumeFromShards(Reference<DataDistributor>::addRef(this), g_network->isSimulated()),
resumeFromDataMoves(Reference<DataDistributor>::addRef(this)));
Future<Void> shardsReady = resumeFromShards(Reference<DataDistributor>::addRef(this), g_network->isSimulated());
return resumeFromDataMoves(Reference<DataDistributor>::addRef(this), shardsReady);
}
};

View File

@ -110,9 +110,9 @@ class RocksDBErrorListener : public rocksdb::EventListener {
public:
RocksDBErrorListener(){};
void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status* bg_error) override {
TraceEvent(SevError, "RocksDBBGError")
TraceEvent(SevError, "ShardedRocksDBBGError")
.detail("Reason", getErrorReason(reason))
.detail("RocksDBSeverity", bg_error->severity())
.detail("ShardedRocksDBSeverity", bg_error->severity())
.detail("Status", bg_error->ToString());
std::unique_lock<std::mutex> lock(mutex);
if (!errorPromise.isValid())
@ -186,8 +186,8 @@ std::vector<std::pair<KeyRange, std::string>> decodeShardMapping(const RangeResu
void logRocksDBError(const rocksdb::Status& status, const std::string& method) {
auto level = status.IsTimedOut() ? SevWarn : SevError;
TraceEvent e(level, "RocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity());
TraceEvent e(level, "ShardedRocksDBError");
e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity());
if (status.IsIOError()) {
e.detail("SubCode", status.subcode());
}
@ -219,7 +219,7 @@ const char* ShardOpToString(ShardOp op) {
}
}
void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op));
if (!message.empty()) {
e.detail("Message", message);
@ -230,7 +230,7 @@ void logShardEvent(StringRef name,
ShardOp op,
Severity severity = SevInfo,
const std::string& message = "") {
TraceEvent e(severity, "KVSShardEvent");
TraceEvent e(severity, "ShardedRocksKVSShardEvent");
e.detail("Name", name).detail("Action", ShardOpToString(op)).detail("Begin", range.begin).detail("End", range.end);
if (message != "") {
e.detail("Message", message);
@ -343,7 +343,7 @@ public:
ASSERT(cf);
readRangeOptions.background_purge_on_iterator_cleanup = true;
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
TraceEvent(SevDebug, "ReadIteratorPool")
TraceEvent(SevVerbose, "ShardedRocksReadIteratorPool")
.detail("Path", path)
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
@ -425,7 +425,7 @@ private:
ACTOR Future<Void> flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) {
loop {
wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY));
TraceEvent e("RocksDBFlowLock");
TraceEvent e("ShardedRocksDBFlowLock");
e.detail("ReadAvailable", readLock->available());
e.detail("ReadActivePermits", readLock->activePermits());
e.detail("ReadWaiters", readLock->waiters());
@ -588,13 +588,13 @@ public:
if (rState->closing) {
break;
}
TraceEvent(SevInfo, "KVSPhysialShardMetrics")
TraceEvent(SevInfo, "ShardedRocksKVSPhysialShardMetrics")
.detail("NumActiveShards", shardManager->numActiveShards())
.detail("TotalPhysicalShards", shardManager->numPhysicalShards());
}
} catch (Error& e) {
if (e.code() != error_code_actor_cancelled) {
TraceEvent(SevError, "ShardMetricsLoggerError").errorUnsuppressed(e);
TraceEvent(SevError, "ShardedRocksShardMetricsLoggerError").errorUnsuppressed(e);
}
}
return Void();
@ -602,7 +602,7 @@ public:
rocksdb::Status init() {
// Open instance.
TraceEvent(SevVerbose, "ShardManagerInitBegin", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path);
std::vector<std::string> columnFamilies;
rocksdb::Options options = getOptions();
rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies);
@ -632,6 +632,8 @@ public:
}
if (foundMetadata) {
TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId)
.detail("PhysicalShardCount", handles.size());
for (auto handle : handles) {
if (handle->GetName() == "kvs-metadata") {
metadataShard = std::make_shared<PhysicalShard>(db, "kvs-metadata", handle);
@ -639,7 +641,8 @@ public:
physicalShards[handle->GetName()] = std::make_shared<PhysicalShard>(db, handle->GetName(), handle);
}
columnFamilyMap[handle->GetID()] = handle;
TraceEvent(SevInfo, "ShardedRocskDB").detail("FoundShard", handle->GetName()).detail("Action", "Init");
TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId)
.detail("PhysicalShard", handle->GetName());
}
RangeResult metadata;
readRangeInDb(metadataShard.get(), prefixRange(shardMappingPrefix), UINT16_MAX, UINT16_MAX, &metadata);
@ -647,7 +650,7 @@ public:
std::vector<std::pair<KeyRange, std::string>> mapping = decodeShardMapping(metadata, shardMappingPrefix);
for (const auto& [range, name] : mapping) {
TraceEvent(SevDebug, "ShardedRocksLoadPhysicalShard", this->logId)
TraceEvent(SevVerbose, "ShardedRocksLoadRange", this->logId)
.detail("Range", range)
.detail("PhysicalShard", name);
auto it = physicalShards.find(name);
@ -662,10 +665,10 @@ public:
activePhysicalShardIds.emplace(name);
}
// TODO: remove unused column families.
} else {
// DB is opened with default shard.
ASSERT(handles.size() == 1);
// Add SpecialKeys range. This range should not be modified.
std::shared_ptr<PhysicalShard> defaultShard = std::make_shared<PhysicalShard>(db, "default", handles[0]);
columnFamilyMap[defaultShard->cf->GetID()] = defaultShard->cf;
@ -688,7 +691,7 @@ public:
return status;
}
metadataShard->readIterPool->update();
TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId)
TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId)
.detail("MetadataShardCF", metadataShard->cf->GetID());
}
physicalShards["kvs-metadata"] = metadataShard;
@ -696,7 +699,7 @@ public:
writeBatch = std::make_unique<rocksdb::WriteBatch>();
dirtyShards = std::make_unique<std::set<PhysicalShard*>>();
TraceEvent(SevDebug, "ShardManagerInitEnd", this->logId).detail("DataPath", path);
TraceEvent(SevInfo, "ShardedRocksShardManagerInitEnd", this->logId).detail("DataPath", path);
return status;
}
@ -712,7 +715,7 @@ public:
for (auto it = rangeIterator.begin(); it != rangeIterator.end(); ++it) {
if (it.value() == nullptr) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevVerbose, "ShardedRocksDB")
.detail("Info", "ShardNotFound")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -724,9 +727,10 @@ public:
}
PhysicalShard* addRange(KeyRange range, std::string id) {
TraceEvent(SevVerbose, "ShardedRocksAddRangeBegin", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeBegin", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
// Newly added range should not overlap with any existing range.
auto ranges = dataShardMap.intersectingRanges(range);
@ -750,7 +754,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksAddRangeEnd", this->logId)
TraceEvent(SevInfo, "ShardedRocksAddRangeEnd", this->logId)
.detail("Range", range)
.detail("PhysicalShardID", id);
@ -758,7 +762,7 @@ public:
}
std::vector<std::string> removeRange(KeyRange range) {
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range);
std::vector<std::string> shardIds;
@ -796,6 +800,7 @@ public:
}
continue;
}
// Range modification could result in more than one segment. Remove the original segment key here.
existingShard->dataShards.erase(shardRange.begin.toString());
if (shardRange.begin < range.begin) {
@ -826,7 +831,7 @@ public:
validate();
TraceEvent(SevVerbose, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
TraceEvent(SevInfo, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range);
return shardIds;
}
@ -849,7 +854,7 @@ public:
TraceEvent(SevError, "ShardedRocksDB").detail("Error", "write to non-exist shard").detail("WriteKey", key);
return;
}
TraceEvent(SevVerbose, "ShardManagerPut", this->logId)
TraceEvent(SevVerbose, "ShardedRocksShardManagerPut", this->logId)
.detail("WriteKey", key)
.detail("Value", value)
.detail("MapRange", it.range())
@ -859,7 +864,9 @@ public:
ASSERT(dirtyShards != nullptr);
writeBatch->Put(it.value()->physicalShard->cf, toSlice(key), toSlice(value));
dirtyShards->insert(it.value()->physicalShard);
TraceEvent(SevVerbose, "ShardManagerPutEnd", this->logId).detail("WriteKey", key).detail("Value", value);
TraceEvent(SevVerbose, "ShardedRocksShardManagerPutEnd", this->logId)
.detail("WriteKey", key)
.detail("Value", value);
}
void clear(KeyRef key) {
@ -884,7 +891,7 @@ public:
}
void persistRangeMapping(KeyRangeRef range, bool isAdd) {
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Info", "RangeToPersist")
.detail("BeginKey", range.begin)
.detail("EndKey", range.end);
@ -902,7 +909,7 @@ public:
writeBatch->Put(metadataShard->cf,
getShardMappingKey(it.range().begin, shardMappingPrefix),
it.value()->physicalShard->id);
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -911,7 +918,7 @@ public:
} else {
// Empty range.
writeBatch->Put(metadataShard->cf, getShardMappingKey(it.range().begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("BeginKey", it.range().begin)
.detail("EndKey", it.range().end)
@ -921,7 +928,7 @@ public:
}
} else {
writeBatch->Put(metadataShard->cf, getShardMappingKey(range.begin, shardMappingPrefix), "");
TraceEvent(SevDebug, "ShardedRocksDB")
TraceEvent(SevDebug, "ShardedRocksDB", this->logId)
.detail("Action", "PersistRangeMapping")
.detail("RemoveRange", "True")
.detail("BeginKey", range.begin)
@ -972,7 +979,7 @@ public:
if (!s.ok()) {
logRocksDBError(s, "DestroyDB");
}
TraceEvent("RocksDB").detail("Info", "DBDestroyed");
TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBDestroyed");
}
rocksdb::DB* getDb() const { return db; }
@ -997,9 +1004,9 @@ public:
}
void validate() {
TraceEvent(SevVerbose, "ValidateShardManager", this->logId);
TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId);
for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) {
TraceEvent e(SevVerbose, "ValidateDataShardMap", this->logId);
TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId);
e.detail("Range", s->range());
const DataShard* shard = s->value();
e.detail("ShardAddress", reinterpret_cast<std::uintptr_t>(shard));
@ -1008,6 +1015,13 @@ public:
} else {
e.detail("Shard", "Empty");
}
if (shard != nullptr) {
ASSERT(shard->range == static_cast<KeyRangeRef>(s->range()));
ASSERT(shard->physicalShard != nullptr);
auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString());
ASSERT(it != shard->physicalShard->dataShards.end());
ASSERT(it->second.get() == shard);
}
}
}
@ -1338,7 +1352,7 @@ std::shared_ptr<rocksdb::Statistics> RocksDBMetrics::getStatsObjForRocksDB() {
}
void RocksDBMetrics::logStats(rocksdb::DB* db) {
TraceEvent e("RocksDBMetrics");
TraceEvent e("ShardedRocksDBMetrics");
uint64_t stat;
for (auto& [name, ticker, cumulation] : tickerStats) {
stat = stats->getTickerCount(ticker);
@ -1361,7 +1375,7 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) {
}
void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db) {
TraceEvent e("RocksDBShardMemMetrics");
TraceEvent e("ShardedRocksDBShardMemMetrics");
uint64_t stat;
ASSERT(db != nullptr);
ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat));
@ -1387,7 +1401,7 @@ void RocksDBMetrics::setPerfContext(int index) {
}
void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) {
TraceEvent e("RocksDBPerfContextMetrics");
TraceEvent e("ShardedRocksDBPerfContextMetrics");
e.setMaxEventLength(20000);
for (auto& [name, metric, vals] : perfContextMetrics) {
uint64_t s = 0;
@ -1650,7 +1664,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
return;
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Open");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Open");
a.done.send(Void());
}
@ -1841,7 +1855,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
} else {
a.shardManager->closeAllShards();
}
TraceEvent(SevInfo, "RocksDB").detail("Method", "Close");
TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Close");
a.done.send(Void());
}
};
@ -1908,7 +1922,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), "Reader.Before");
}
if (readBeginTime - a.startTime > readValueTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value request timedout")
.detail("Method", "ReadValueAction")
.detail("Timeout value", readValueTimeout);
@ -1995,7 +2009,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
"Reader.Before"); //.detail("TaskID", g_network->getCurrentTask());
}
if (readBeginTime - a.startTime > readValuePrefixTimeout) {
TraceEvent(SevWarn, "RocksDBError")
TraceEvent(SevWarn, "ShardedRocksDBError")
.detail("Error", "Read value prefix request timedout")
.detail("Method", "ReadValuePrefixAction")
.detail("Timeout value", readValuePrefixTimeout);
@ -2080,7 +2094,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
rocksDBMetrics->getReadRangeQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime);
}
if (readBeginTime - a.startTime > readRangeTimeout) {
TraceEvent(SevWarn, "KVSReadTimeout")
TraceEvent(SevWarn, "ShardedRocksKVSReadTimeout")
.detail("Error", "Read range request timedout")
.detail("Method", "ReadRangeAction")
.detail("Timeout value", readRangeTimeout);
@ -2127,10 +2141,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
}
}
Histogram::getHistogram(
ROCKSDBSTORAGE_HISTOGRAM_GROUP, "ShardedRocksDBNumShardsInRangeRead"_sr, Histogram::Unit::countLinear)
->sample(numShards);
result.more =
(result.size() == a.rowLimit) || (result.size() == -a.rowLimit) || (accumulatedBytes >= a.byteLimit);
if (result.more) {
@ -2184,7 +2194,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
readThreads = createGenericThreadPool();
}
writeThread->addThread(new Writer(id, 0, shardManager.getColumnFamilyMap(), rocksDBMetrics), "fdb-rocksdb-wr");
TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
TraceEvent("ShardedRocksDBReadThreads", id)
.detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM);
for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) {
readThreads->addThread(new Reader(id, i, rocksDBMetrics), "fdb-rocksdb-re");
}
@ -2302,7 +2313,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: reading a non-existent system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2330,7 +2341,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
auto* shard = shardManager.getDataShard(key);
if (shard == nullptr || !shard->physicalShard->initialized()) {
// TODO: reading a non-existent system key range should not cause an error.
TraceEvent(SevWarnAlways, "ShardedRocksDB")
TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id)
.detail("Detail", "Read non-exist key range")
.detail("ReadKey", key);
return Optional<Value>();
@ -2452,7 +2463,7 @@ IKeyValueStore* keyValueStoreShardedRocksDB(std::string const& path,
#ifdef SSD_ROCKSDB_EXPERIMENTAL
return new ShardedRocksDBKeyValueStore(path, logID);
#else
TraceEvent(SevError, "RocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
TraceEvent(SevError, "ShardedRocksDBEngineInitFailure").detail("Reason", "Built without RocksDB");
ASSERT(false);
return nullptr;
#endif // SSD_ROCKSDB_EXPERIMENTAL

View File

@ -280,6 +280,13 @@ class TestConfig {
if (attrib == "blobGranulesEnabled") {
blobGranulesEnabled = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSTargetedRestart") {
injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0;
}
if (attrib == "injectSSDelay") {
injectSSDelay = strcmp(value.c_str(), "true") == 0;
}
}
ifs.close();
@ -327,6 +334,8 @@ public:
bool allowDefaultTenant = true;
bool allowDisablingTenants = true;
bool injectTargetedSSRestart = false;
bool injectSSDelay = false;
ConfigDBType getConfigDBType() const { return configDBType; }
@ -384,7 +393,9 @@ public:
.add("blobGranulesEnabled", &blobGranulesEnabled)
.add("allowDefaultTenant", &allowDefaultTenant)
.add("allowDisablingTenants", &allowDisablingTenants)
.add("randomlyRenameZoneId", &randomlyRenameZoneId);
.add("randomlyRenameZoneId", &randomlyRenameZoneId)
.add("injectTargetedSSRestart", &injectTargetedSSRestart)
.add("injectSSDelay", &injectSSDelay);
try {
auto file = toml::parse(testFile);
if (file.contains("configuration") && toml::find(file, "configuration").is_table()) {
@ -1384,7 +1395,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) {
void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
// Using [0, 6) so the RocksDB storage engines can also be chosen.
// TODO: Figure out what is broken with the RocksDB engine in simulation.
int storage_engine_type = deterministicRandom()->randomInt(0, 4);
int storage_engine_type = deterministicRandom()->randomInt(0, 6);
if (testConfig.storageEngineType.present()) {
storage_engine_type = testConfig.storageEngineType.get();
} else {
@ -1392,7 +1403,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
while (std::find(testConfig.storageEngineExcludeTypes.begin(),
testConfig.storageEngineExcludeTypes.end(),
storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) {
storage_engine_type = deterministicRandom()->randomInt(0, 5);
storage_engine_type = deterministicRandom()->randomInt(0, 6);
}
}
@ -1435,6 +1446,8 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) {
TraceEvent(SevWarnAlways, "RocksDBNonDeterminism")
.detail("Explanation", "The Sharded RocksDB storage engine is threaded and non-deterministic");
noUnseed = true;
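// Assumption: the sharded engine depends on the location-metadata shard encoding, hence forcing the knob on here.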
auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection();
g_knobs.setKnob("shard_encode_location_metadata", KnobValueRef::create(bool{ true }));
break;
}
default:
@ -2364,6 +2377,13 @@ ACTOR void setupAndRun(std::string dataFolder,
testConfig.readFromConfig(testFile);
g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess;
g_simulator.setDiffProtocol = false;
if (testConfig.injectTargetedSSRestart && deterministicRandom()->random01() < 0.25) {
g_simulator.injectTargetedSSRestartTime = 60.0 + 340.0 * deterministicRandom()->random01();
}
if (testConfig.injectSSDelay && deterministicRandom()->random01() < 0.25) {
g_simulator.injectSSDelayTime = 60.0 + 240.0 * deterministicRandom()->random01();
}
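// With probability 0.25 each, the simulator schedules a targeted storage server restart at a
// uniform random time in [60, 400) seconds and/or a storage server delay at a uniform random
// time in [60, 300) seconds.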
// Build simulator allow list
allowList.addTrustedSubnet("0.0.0.0/2"sv);
@ -2377,6 +2397,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// https://github.com/apple/foundationdb/issues/5155
if (std::string_view(testFile).find("restarting") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
// Disable the default tenant in restarting tests for now
// TODO: persist the chosen default tenant in the restartInfo.ini file for the second test
@ -2389,6 +2410,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// Re-enable the backup and restore related simulation tests when the tests are passing again.
if (std::string_view(testFile).find("Backup") != std::string_view::npos) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
// Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate
@ -2402,6 +2424,7 @@ ACTOR void setupAndRun(std::string dataFolder,
// in the build.
if (!rocksDBEnabled) {
testConfig.storageEngineExcludeTypes.push_back(4);
testConfig.storageEngineExcludeTypes.push_back(5);
}
state ProtocolVersion protocolVersion = currentProtocolVersion;

View File

@ -2751,6 +2751,9 @@ ACTOR Future<JsonBuilderObject> lockedStatusFetcher(Reference<AsyncVar<ServerDBI
try {
wait(tr.onError(e));
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;
incomplete_reasons->insert(format("Unable to determine if database is locked (%s).", e.what()));
break;
}

View File

@ -51,6 +51,8 @@ bool compareFDBAndBlob(RangeResult fdb,
Version v,
bool debug);
void printGranuleChunks(const Standalone<VectorRef<BlobGranuleChunkRef>>& chunks);
ACTOR Future<Void> clearAndAwaitMerge(Database cx, KeyRange range);
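// clearAndAwaitMerge is declared here so workloads (e.g. BlobGranuleVerifier below) can clear a
// range and then block until the resulting granules merge back together.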
#include "flow/unactorcompiler.h"

View File

@ -294,8 +294,8 @@ Future<Void> bulkSetup(Database cx,
// Here we wait for data in flight to go to 0 (this will not work on a database with other users)
if (postSetupWarming != 0) {
try {
wait(delay(5.0) >>
waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start
wait(delay(5.0));
wait(waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled)
throw;

File diff suppressed because it is too large

View File

@ -385,7 +385,9 @@ ACTOR Future<Reference<TestWorkload>> getWorkloadIface(WorkloadRequest work,
wcx.sharedRandomNumber = work.sharedRandomNumber;
workload = IWorkloadFactory::create(testName.toString(), wcx);
wait(workload->initialized());
if (workload) {
wait(workload->initialized());
}
auto unconsumedOptions = checkAllOptionsConsumed(workload ? workload->options : VectorRef<KeyValueRef>());
if (!workload || unconsumedOptions.size()) {

View File

@ -237,57 +237,64 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
while (timeTravelIt != timeTravelChecks.end() && currentTime >= timeTravelIt->first) {
state OldRead oldRead = timeTravelIt->second;
timeTravelChecksMemory -= oldRead.oldResult.expectedSize();
// advance the iterator before doing the read, so if it gets an error we don't retry it
timeTravelIt = timeTravelChecks.erase(timeTravelIt);
if (prevPurgeVersion == -1) {
prevPurgeVersion = oldRead.v;
}
// advance the iterator before doing the read, so if it gets an error we don't retry it
try {
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPurgeVersion < maxPurgeVersion) {
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false));
wait(cx->waitPurgeGranulesComplete(purgeKey));
self->purges++;
} else {
doPurging = false;
}
// before doing the read, purge just before the read version
state Version newPurgeVersion = 0;
state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5;
if (doPurging) {
CODE_PROBE(true, "BGV considering purge");
Version maxPurgeVersion = oldRead.v;
for (auto& it : timeTravelChecks) {
maxPurgeVersion = std::min(it.second.v, maxPurgeVersion);
}
if (prevPurgeVersion < maxPurgeVersion) {
CODE_PROBE(true, "BGV doing purge");
newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion);
prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion);
if (BGV_DEBUG) {
fmt::print("BGV Purging @ {0}\n", newPurgeVersion);
}
try {
Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false));
if (BGV_DEBUG) {
fmt::print("BGV Purged @ {0}, waiting\n", newPurgeVersion);
}
wait(cx->waitPurgeGranulesComplete(purgeKey));
} catch (Error& e) {
if (e.code() == error_code_operation_cancelled) {
throw e;
}
// purging shouldn't error, it should retry.
if (BGV_DEBUG) {
fmt::print("Unexpected error {0} purging @ {1}!\n", e.name(), newPurgeVersion);
}
ASSERT(false);
}
CODE_PROBE(true, "BGV purge complete");
if (BGV_DEBUG) {
fmt::print("BGV Purge complete @ {0}\n", newPurgeVersion);
}
self->purges++;
} else {
doPurging = false;
}
}
// do time travel read
try {
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> reReadResult =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, oldRead.v));
if (!compareFDBAndBlob(oldRead.oldResult, reReadResult, oldRead.range, oldRead.v, BGV_DEBUG)) {
self->mismatches++;
}
self->timeTravelReads++;
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
try {
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1));
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
ASSERT(e.code() == error_code_blob_granule_transaction_too_old);
}
}
} catch (Error& e) {
fmt::print("Error TT: {0}\n", e.name());
if (e.code() == error_code_blob_granule_transaction_too_old) {
self->timeTravelTooOld++;
// TODO: add debugging info for when this is a failure
@ -297,6 +304,51 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
oldRead.v);
}
}
// if purged just before read, verify that purge cleaned up data by restarting blob workers and
// reading older than the purge version
if (doPurging) {
wait(self->killBlobWorkers(cx, self));
if (BGV_DEBUG) {
fmt::print("BGV Reading post-purge [{0} - {1}) @ {2}\n",
oldRead.range.begin.printable(),
oldRead.range.end.printable(),
prevPurgeVersion);
}
// ensure that the purge version itself is still readable
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead1 =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion));
if (BGV_DEBUG) {
fmt::print("BGV Post-purge first read:\n");
printGranuleChunks(versionRead1.second);
}
try {
// read at purgeVersion - 1, should NOT be readable
Version minSnapshotVersion = newPurgeVersion;
for (auto& it : versionRead1.second) {
minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion);
}
if (BGV_DEBUG) {
fmt::print("BGV Reading post-purge again [{0} - {1}) @ {2}\n",
oldRead.range.begin.printable(),
oldRead.range.end.printable(),
minSnapshotVersion - 1);
}
std::pair<RangeResult, Standalone<VectorRef<BlobGranuleChunkRef>>> versionRead2 =
wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1));
if (BGV_DEBUG) {
fmt::print("BGV ERROR: data not purged! Read successful!!\n");
printGranuleChunks(versionRead2.second);
}
ASSERT(false);
} catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
ASSERT(e.code() == error_code_blob_granule_transaction_too_old);
CODE_PROBE(true, "BGV verified too old after purge");
}
}
}
// pick a random range
@ -471,6 +523,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload {
// For some reason simulation still passes when this fails... so assert for now
ASSERT(result);
// FIXME: if doPurging was set, possibly do one last purge here, and verify it succeeds with no errors
if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && deterministicRandom()->random01() < 0.1) {
CODE_PROBE(true, "BGV clearing database and awaiting merge");
wait(clearAndAwaitMerge(cx, normalKeys));

View File

@ -0,0 +1,767 @@
/*
* ChangeFeedOperations.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2022 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "fdbclient/FDBOptions.g.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/NativeAPI.actor.h"
#include "fdbclient/SystemData.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "fdbserver/workloads/BulkSetup.actor.h"
#include "flow/Arena.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/Util.h"
#include "flow/serialize.h"
#include <cstring>
#include <limits>
#include "flow/actorcompiler.h" // This must be the last #include.
// enable to debug specific operations for a given change feed
#define DEBUG_KEY ""_sr
#define DEBUG_CF(feedKey) (feedKey.printable() == DEBUG_KEY)
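// Usage sketch (illustrative): point DEBUG_KEY at the key of the feed under investigation, e.g.
// #define DEBUG_KEY "k1"_sr, and the DBG prints throughout this workload fire for that feed only.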
ACTOR Future<Void> doPop(Database cx, Key key, Key feedID, Version version, Version* doneOut) {
wait(cx->popChangeFeedMutations(feedID, version));
if (*doneOut < version) {
*doneOut = version;
}
if (DEBUG_CF(key)) {
fmt::print("DBG) {0} Popped through {1}\n", key.printable(), version);
}
// TODO: could strengthen pop checking by validating that a read immediately after the pop completes has no data
return Void();
}
struct FeedTestData : ReferenceCounted<FeedTestData>, NonCopyable {
Key key;
KeyRange keyRange;
Key feedID;
int nextVal;
Future<Void> liveReader;
bool lastCleared = false;
std::vector<Future<Void>> pops;
Version poppingVersion;
Version poppedVersion;
Optional<Version> stopVersion;
bool destroying;
bool destroyed;
bool complete;
int popWindow;
int popDelayWindow;
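// Pop pacing (derived from the liveReader pop logic below): once writesByVersion reaches
// popWindow + popDelayWindow entries, the reader pops through the popWindow-th oldest write,
// leaving popDelayWindow entries buffered.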
std::deque<std::pair<Version, Optional<Value>>> writesByVersion;
// these were all committed
std::deque<std::pair<Version, Optional<Value>>> pendingCheck;
NotifiedVersion checkVersion;
FeedTestData(Key key, bool doPops)
: key(key), keyRange(KeyRangeRef(key, keyAfter(key))), feedID(key.withPrefix(LiteralStringRef("CF"))), nextVal(0),
lastCleared(false), poppingVersion(0), poppedVersion(0), destroying(false), destroyed(false), complete(false),
checkVersion(0) {
if (doPops) {
popWindow = deterministicRandom()->randomExp(1, 8);
popDelayWindow = deterministicRandom()->randomInt(0, 2) * deterministicRandom()->randomExp(1, 4);
} else {
popWindow = -1;
popDelayWindow = -1;
}
}
Value nextValue() {
std::string v = std::to_string(nextVal);
nextVal++;
return Value(v);
}
void update(Version version, Optional<Value> value) {
if (!stopVersion.present()) {
// if feed is stopped, value should not get read
writesByVersion.push_back({ version, value });
pendingCheck.push_back(writesByVersion.back());
checkVersion.set(version);
}
}
void testComplete() {
complete = true;
checkVersion.set(checkVersion.get() + 1);
}
void pop(Database cx, Version v) {
if (DEBUG_CF(key)) {
fmt::print("DBG) {0} Popping through {1}\n", key.printable(), v);
}
ASSERT(poppingVersion < v);
poppingVersion = v;
while (!writesByVersion.empty() && v > writesByVersion.front().first) {
writesByVersion.pop_front();
}
while (!pendingCheck.empty() && v > pendingCheck.front().first) {
pendingCheck.pop_front();
}
pops.push_back(doPop(cx, key, feedID, v, &poppedVersion));
}
};
static void rollbackFeed(Key key,
std::deque<Standalone<MutationsAndVersionRef>>& buffered,
Version version,
MutationRef rollbackMutation) {
Version rollbackVersion;
BinaryReader br(rollbackMutation.param2, Unversioned());
br >> rollbackVersion;
TraceEvent("ChangeFeedRollback").detail("Key", key).detail("Ver", version).detail("RollbackVer", rollbackVersion);
if (DEBUG_CF(key)) {
fmt::print("DBG) {0} Rolling back {1} -> {2}\n", key.printable(), version, rollbackVersion);
}
while (!buffered.empty() && buffered.back().version > rollbackVersion) {
TraceEvent("ChangeFeedRollbackVer").detail("Ver", buffered.back().version);
buffered.pop_back();
}
}
static void checkNextResult(Key key,
std::deque<Standalone<MutationsAndVersionRef>>& buffered,
std::deque<std::pair<Version, Optional<Value>>>& checkData) {
// First asserts are checking data is in the form the test is supposed to produce
ASSERT(!buffered.empty());
ASSERT(buffered.front().mutations.size() == 1);
ASSERT(buffered.front().mutations[0].param1 == key);
// Below asserts are correctness of change feed invariants.
// Handle case where txn retried and wrote same value twice. checkData's version is the committed one, so the same
// update may appear at an earlier version. This is fine, as long as it then actually appears at the committed
// version
// TODO: could strengthen this check a bit and only allow it to appear at the lower version if the txn retried on
// commit_unknown_result?
if (checkData.front().first < buffered.front().version) {
fmt::print("ERROR. {0} Check version {1} != {2}.\n Check: {3} {4}\n Buffered: {5} {6}\n",
key.printable(),
checkData.front().first,
buffered.front().version,
checkData.front().second.present() ? "SET" : "CLEAR",
checkData.front().second.present() ? checkData.front().second.get().printable()
: keyAfter(key).printable(),
buffered.front().mutations[0].type == MutationRef::SetValue ? "SET" : "CLEAR",
buffered.front().mutations[0].param2.printable());
}
ASSERT(checkData.front().first >= buffered.front().version);
if (checkData.front().second.present()) {
ASSERT(buffered.front().mutations[0].type == MutationRef::SetValue);
ASSERT(buffered.front().mutations[0].param2 == checkData.front().second.get());
} else {
ASSERT(buffered.front().mutations[0].type == MutationRef::ClearRange);
ASSERT(buffered.front().mutations[0].param2 == keyAfter(key));
}
if (checkData.front().first == buffered.front().version) {
checkData.pop_front();
}
buffered.pop_front();
}
ACTOR Future<Void> liveReader(Database cx, Reference<FeedTestData> data, Version begin) {
state Version lastCheckVersion = 0;
state Version nextCheckVersion = 0;
state std::deque<Standalone<MutationsAndVersionRef>> buffered;
state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
state Future<Void> stream =
cx->getChangeFeedStream(results, data->feedID, begin, std::numeric_limits<Version>::max(), data->keyRange);
try {
loop {
if (data->complete && data->pendingCheck.empty()) {
return Void();
}
nextCheckVersion = data->pendingCheck.empty() ? invalidVersion : data->pendingCheck.front().first;
choose {
when(Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture())) {
for (auto& it : res) {
if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
rollbackFeed(data->key, buffered, it.version, it.mutations.back());
} else {
if (it.mutations.size() == 0) {
// FIXME: THIS SHOULD NOT HAPPEN
// FIXME: these are also getting sent past stopVersion!!
} else {
if (data->stopVersion.present()) {
if (it.version > data->stopVersion.get()) {
fmt::print("DBG) {0} Read data with version {1} > stop version {2} ({3})\n",
data->key.printable(),
it.version,
data->stopVersion.get(),
it.mutations.size());
}
ASSERT(it.version <= data->stopVersion.get());
}
buffered.push_back(Standalone<MutationsAndVersionRef>(it));
if (DEBUG_CF(data->key)) {
fmt::print("DBG) {0} Live read through {1} ({2})\n",
data->key.printable(),
it.version,
it.mutations.size());
}
}
}
}
}
when(wait(data->checkVersion.whenAtLeast(lastCheckVersion + 1))) {
// wake loop and start new whenAtLeast whenever checkVersion is set
lastCheckVersion = data->checkVersion.get();
}
when(wait(data->pendingCheck.empty() ? Never()
: results->whenAtLeast(data->pendingCheck.front().first))) {
if (data->pendingCheck.empty() || data->pendingCheck.front().first > nextCheckVersion) {
// pendingCheck wasn't empty before whenAtLeast, and nextCheckVersion was the front version, so if
// either of these is true, the data was popped concurrently and we can move on to checking the
// next value
CODE_PROBE(true, "popped while waiting for whenAtLeast to check next value");
continue;
}
while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
CODE_PROBE(true, "live reader ignoring data that is being popped");
buffered.pop_front();
}
if (buffered.empty()) {
if (data->poppingVersion < data->pendingCheck.front().first) {
fmt::print("DBG) {0} Buffered empty after ready for check, and data not popped! popped "
"{1}, popping {2}, check {3}\n",
data->key.printable(),
data->poppedVersion,
data->poppingVersion,
data->pendingCheck.front().first);
}
ASSERT(data->poppingVersion >= data->pendingCheck.front().first);
data->pendingCheck.pop_front();
} else {
Version v = buffered.front().version;
if (DEBUG_CF(data->key)) {
fmt::print("DBG) {0} Live checking through {1}\n",
data->key.printable(),
data->pendingCheck.front().first);
}
checkNextResult(data->key, buffered, data->pendingCheck);
if (DEBUG_CF(data->key)) {
fmt::print("DBG) {0} Live Checked through {1}\n", data->key.printable(), v);
}
if (data->popDelayWindow >= 0 && data->popWindow >= 0 &&
data->writesByVersion.size() == data->popWindow + data->popDelayWindow) {
data->pop(cx, data->writesByVersion[data->popWindow - 1].first + 1);
ASSERT(data->writesByVersion.size() == data->popDelayWindow);
}
}
}
}
}
} catch (Error& e) {
throw e;
}
}
ACTOR Future<Void> historicReader(Database cx,
Reference<FeedTestData> data,
Version begin,
Version end,
bool skipPopped) {
state std::deque<std::pair<Version, Optional<Value>>> checkData;
state std::deque<Standalone<MutationsAndVersionRef>> buffered;
state Reference<ChangeFeedData> results = makeReference<ChangeFeedData>();
state Future<Void> stream = cx->getChangeFeedStream(results, data->feedID, begin, end, data->keyRange);
state Version poppedVersionAtStart = data->poppedVersion;
if (DEBUG_CF(data->key)) {
fmt::print("DBG) {0} Starting historical read {1} - {2}\n", data->key.printable(), begin, end);
}
// TODO: could CPU-optimize this scan
for (auto& it : data->writesByVersion) {
if (it.first >= end) {
break;
}
if (it.first >= begin) {
checkData.push_back(it);
}
}
try {
loop {
Standalone<VectorRef<MutationsAndVersionRef>> res = waitNext(results->mutations.getFuture());
for (auto& it : res) {
if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) {
rollbackFeed(data->key, buffered, it.version, it.mutations.back());
} else {
if (it.mutations.size() == 0) {
// FIXME: THIS SHOULD NOT HAPPEN
// FIXME: these are also getting sent past stopVersion!!
} else {
if (data->stopVersion.present()) {
ASSERT(it.version <= data->stopVersion.get());
}
buffered.push_back(Standalone<MutationsAndVersionRef>(it));
}
}
}
}
} catch (Error& e) {
if (e.code() != error_code_end_of_stream) {
throw;
}
}
if (skipPopped) {
while (!buffered.empty() && buffered.front().version < data->poppingVersion) {
// ignore data
buffered.pop_front();
}
while (!checkData.empty() && checkData.front().first < data->poppingVersion) {
checkData.pop_front();
}
}
while (!checkData.empty() && !buffered.empty()) {
checkNextResult(data->key, buffered, checkData);
}
// Change feed missing data it should have
ASSERT(checkData.empty());
// Change feed read extra data it shouldn't have
ASSERT(buffered.empty());
// check pop version of cursor
// TODO: this check might not always work if read is for old data and SS is way behind
// FIXME: this check doesn't work for now, probably due to above comment
/*if (data->poppingVersion != 0) {
ASSERT(results->popVersion >= poppedVersionAtStart && results->popVersion <= data->poppingVersion);
}*/
return Void();
}
enum Op {
CREATE_DELETE = 0,
READ = 1,
UPDATE_CLEAR = 2,
STOP = 3,
POP = 4,
OP_COUNT = 5 /* keep this last */
};
struct ChangeFeedOperationsWorkload : TestWorkload {
// test settings
double testDuration;
int operationsPerSecond;
int targetFeeds;
bool clientsDisjointKeyspace;
bool clearKeyWhenDestroy;
double clearFrequency;
int popMode;
int opWeights[Op::OP_COUNT];
int totalOpWeight;
Future<Void> client;
std::unordered_set<Key> usedKeys;
std::vector<Reference<FeedTestData>> data;
ChangeFeedOperationsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
testDuration = getOption(options, "testDuration"_sr, 60.0);
operationsPerSecond = getOption(options, "opsPerSecond"_sr, 100.0);
int64_t rand = wcx.sharedRandomNumber;
targetFeeds = deterministicRandom()->randomExp(1, 1 + rand % 10);
targetFeeds *= (0.8 + (deterministicRandom()->random01() * 0.4));
targetFeeds = std::max(1, targetFeeds / clientCount);
rand /= 10;
clientsDisjointKeyspace = rand % 2;
rand /= 2;
clearKeyWhenDestroy = rand % 2;
rand /= 2;
bool doStops = rand % 2;
rand /= 2;
bool noCreateDelete = rand % 10 == 0;
rand /= 10;
popMode = rand % 3; // 0=none, 1=read-driven, 2=op-driven
rand /= 3;
ASSERT(clientId >= 0);
ASSERT(clientId < clientCount);
ASSERT(clientCount < 255);
clearFrequency = deterministicRandom()->random01();
for (int i = 0; i < Op::OP_COUNT; i++) {
int randWeight = deterministicRandom()->randomExp(0, 5);
ASSERT(randWeight > 0);
opWeights[i] = randWeight;
}
if (!doStops) {
opWeights[Op::STOP] = 0;
}
if (noCreateDelete) {
opWeights[Op::CREATE_DELETE] = 0;
}
if (popMode != 2) {
opWeights[Op::POP] = 0;
}
std::string weightString = "|";
totalOpWeight = 0;
for (int i = 0; i < Op::OP_COUNT; i++) {
totalOpWeight += opWeights[i];
weightString += std::to_string(opWeights[i]) + "|";
}
TraceEvent("ChangeFeedOperationsInit")
.detail("TargetFeeds", targetFeeds)
.detail("DisjointKeyspace", clientsDisjointKeyspace)
.detail("ClearWhenDestroy", clearKeyWhenDestroy)
.detail("DoStops", doStops)
.detail("NoCreateDelete", noCreateDelete)
.detail("Weights", weightString);
}
Key unusedNewRandomKey() {
while (true) {
Key k = newRandomKey();
if (usedKeys.insert(k).second) {
return k;
}
}
}
Key newRandomKey() {
if (clientsDisjointKeyspace) {
double keyspaceRange = (1.0 / clientCount);
double randPartOfRange = deterministicRandom()->random01() * (keyspaceRange - 0.0001);
double randomDouble = clientId * keyspaceRange + 0.0001 + randPartOfRange;
return doubleToTestKey(randomDouble);
} else {
// this is kinda hacky but it guarantees disjoint keys per client
Key ret = doubleToTestKey(deterministicRandom()->random01());
std::string str = ret.toString();
str.back() = (uint8_t)clientId;
return Key(str);
}
}
// Pick a random op, weighted by opWeights
Op pickRandomOp() {
int r = deterministicRandom()->randomInt(0, totalOpWeight);
int i = 0;
while (i < Op::OP_COUNT && (opWeights[i] <= r || opWeights[i] == 0)) {
r -= opWeights[i];
i++;
}
ASSERT(i < Op::OP_COUNT);
return (Op)i;
}
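// Worked example (hypothetical weights {3, 1, 2, 0, 1}): totalOpWeight = 7, and r in [0, 7)
// maps r=0-2 to CREATE_DELETE, r=3 to READ, r=4-5 to UPDATE_CLEAR, and r=6 to POP; STOP has
// weight 0 and is skipped by the opWeights[i] == 0 check.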
ACTOR Future<Void> createNewFeed(Database cx, ChangeFeedOperationsWorkload* self) {
state Transaction tr(cx);
state Key key = self->unusedNewRandomKey();
state Reference<FeedTestData> feedData = makeReference<FeedTestData>(key, self->popMode == 1);
state Value initialValue = feedData->nextValue();
if (DEBUG_CF(key)) {
fmt::print("DBG) Creating {0}\n", key.printable());
}
loop {
try {
tr.set(key, initialValue);
wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_CREATE, feedData->keyRange));
wait(tr.commit());
Version createVersion = tr.getCommittedVersion();
if (DEBUG_CF(key)) {
fmt::print("DBG) Created {0} @ {1}\n", key.printable(), createVersion);
}
feedData->update(createVersion, initialValue);
feedData->liveReader = liveReader(cx, feedData, createVersion);
self->data.push_back(feedData);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
std::string description() const override { return "ChangeFeedOperationsWorkload"; }
Future<Void> setup(Database const& cx) override { return _setup(cx, this); }
ACTOR Future<Void> _setup(Database cx, ChangeFeedOperationsWorkload* self) {
// create initial targetFeeds feeds
TraceEvent("ChangeFeedOperationsSetup").detail("InitialFeeds", self->targetFeeds).log();
state int i;
for (i = 0; i < self->targetFeeds; i++) {
wait(self->createNewFeed(cx, self));
}
TraceEvent("ChangeFeedOperationsSetupComplete");
return Void();
}
Future<Void> start(Database const& cx) override {
client = changeFeedOperationsClient(cx->clone(), this);
return delay(testDuration);
}
Future<bool> check(Database const& cx) override {
client = Future<Void>();
return _check(cx, this);
}
ACTOR Future<Void> checkFeed(Database cx, ChangeFeedOperationsWorkload* self, Reference<FeedTestData> feedData) {
state int popIdx;
feedData->testComplete();
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on live reader\n", feedData->key.printable());
}
// wait on live reader and pops to make sure they complete without error
wait(feedData->liveReader);
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on {1} pops\n", feedData->key.printable(), feedData->pops.size());
}
for (popIdx = 0; popIdx < feedData->pops.size(); popIdx++) {
wait(feedData->pops[popIdx]);
}
// do final check, read everything not popped
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on data check\n", feedData->key.printable(), feedData->pops.size());
}
wait(self->doRead(cx, feedData, feedData->writesByVersion.size()));
// ensure reading [0, poppedVersion) returns no results
if (feedData->poppedVersion > 0) {
if (DEBUG_CF(feedData->key)) {
fmt::print("Final check {0} waiting on read popped check\n", feedData->key.printable());
}
// FIXME: re-enable checking for popped data by changing skipPopped back to false!
wait(historicReader(cx, feedData, 0, feedData->poppedVersion, true));
}
return Void();
}
ACTOR Future<bool> _check(Database cx, ChangeFeedOperationsWorkload* self) {
TraceEvent("ChangeFeedOperationsCheck").detail("FeedCount", self->data.size()).log();
fmt::print("Checking {0} feeds\n", self->data.size()); // TODO REMOVE
state std::vector<Future<Void>> feedChecks;
for (int i = 0; i < self->data.size(); i++) {
if (self->data[i]->destroying) {
continue;
}
if (DEBUG_CF(self->data[i]->key)) {
fmt::print("Final check {0}\n", self->data[i]->key.printable());
}
feedChecks.push_back(self->checkFeed(cx, self, self->data[i]));
}
wait(waitForAll(feedChecks));
// FIXME: check that all destroyed feeds are actually destroyed?
TraceEvent("ChangeFeedOperationsCheckComplete");
return true;
}
void getMetrics(std::vector<PerfMetric>& m) override {}
ACTOR Future<Void> stopFeed(Database cx, Reference<FeedTestData> feedData) {
state Transaction tr(cx);
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Stopping\n", feedData->key.printable());
}
loop {
try {
wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_STOP, feedData->keyRange));
wait(tr.commit());
Version stopVersion = tr.getCommittedVersion();
if (!feedData->stopVersion.present()) {
feedData->stopVersion = stopVersion;
}
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Stopped @ {1}\n", feedData->key.printable(), stopVersion);
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
void popFeed(Database cx, Reference<FeedTestData> feedData) {
if (!feedData->writesByVersion.empty()) {
feedData->pop(cx, feedData->writesByVersion.front().first + 1);
}
}
ACTOR Future<Void> destroyFeed(Database cx, ChangeFeedOperationsWorkload* self, int feedIdx) {
state Reference<FeedTestData> feedData = self->data[feedIdx];
state Transaction tr(cx);
feedData->destroying = true;
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Destroying\n", feedData->key.printable());
}
loop {
try {
wait(
updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_DESTROY, feedData->keyRange));
if (self->clearKeyWhenDestroy) {
tr.clear(feedData->key);
}
wait(tr.commit());
feedData->destroyed = true;
// remove feed from list
ASSERT(self->data[feedIdx]->key == feedData->key);
swapAndPop(&self->data, feedIdx);
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Destroyed @ {1}\n", feedData->key.printable(), tr.getCommittedVersion());
}
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> doRead(Database cx, Reference<FeedTestData> feedData, int targetReadWidth) {
if (feedData->writesByVersion.empty()) {
return Void();
}
Version beginVersion;
Version endVersion;
if (targetReadWidth >= feedData->writesByVersion.size()) {
beginVersion = feedData->writesByVersion.front().first;
endVersion = feedData->writesByVersion.back().first + 1;
} else {
// either up to or including end
int randStart = deterministicRandom()->randomInt(0, feedData->writesByVersion.size() - targetReadWidth);
beginVersion = feedData->writesByVersion[randStart].first;
int end = randStart + targetReadWidth;
if (end == feedData->writesByVersion.size()) {
endVersion = feedData->writesByVersion.back().first + 1;
} else {
// Make sure last included value (end version -1) is a committed version for checking
endVersion = feedData->writesByVersion[end].first + 1;
}
}
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Reading @ {1} - {2}\n", feedData->key.printable(), beginVersion, endVersion);
}
// FIXME: this sometimes reads popped data!
wait(historicReader(cx, feedData, beginVersion, endVersion, true));
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Read complete\n", feedData->key.printable());
}
return Void();
}
ACTOR Future<Void> doUpdateClear(Database cx,
ChangeFeedOperationsWorkload* self,
Reference<FeedTestData> feedData) {
state Transaction tr(cx);
state Optional<Value> updateValue;
// if the value is already cleared, always set; otherwise randomly choose between set and clear
if (feedData->lastCleared || deterministicRandom()->random01() > self->clearFrequency) {
updateValue = feedData->nextValue();
if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Setting {1}\n", feedData->key.printable(), updateValue.get().printable());
}
} else if (DEBUG_CF(feedData->key)) {
fmt::print("DBG) {0} Clearing\n", feedData->key.printable());
}
loop {
try {
if (updateValue.present()) {
tr.set(feedData->key, updateValue.get());
} else {
tr.clear(feedData->key);
}
wait(tr.commit());
Version writtenVersion = tr.getCommittedVersion();
if (DEBUG_CF(feedData->key) && updateValue.present()) {
fmt::print("DBG) {0} Set {1} @ {2}\n",
feedData->key.printable(),
updateValue.get().printable(),
writtenVersion);
}
if (DEBUG_CF(feedData->key) && !updateValue.present()) {
fmt::print("DBG) {0} Cleared @ {1}\n", feedData->key.printable(), writtenVersion);
}
feedData->update(writtenVersion, updateValue);
return Void();
} catch (Error& e) {
wait(tr.onError(e));
}
}
}
ACTOR Future<Void> changeFeedOperationsClient(Database cx, ChangeFeedOperationsWorkload* self) {
state double last = now();
loop {
state Future<Void> waitNextOp = poisson(&last, 1.0 / self->operationsPerSecond);
Op op = self->pickRandomOp();
int feedIdx = deterministicRandom()->randomInt(0, self->data.size());
if (op == Op::CREATE_DELETE) {
// bundle these together so random creates/deletes keep about the target number of feeds
if (deterministicRandom()->random01() < 0.5 || self->data.size() == 1) {
wait(self->createNewFeed(cx, self));
} else {
wait(self->destroyFeed(cx, self, feedIdx));
}
} else if (op == Op::READ) {
// relatively small random read
wait(self->doRead(cx, self->data[feedIdx], deterministicRandom()->randomExp(2, 8)));
} else if (op == Op::UPDATE_CLEAR) {
wait(self->doUpdateClear(cx, self, self->data[feedIdx]));
} else if (op == Op::STOP) {
wait(self->stopFeed(cx, self->data[feedIdx]));
} else if (op == Op::POP) {
self->popFeed(cx, self->data[feedIdx]);
} else {
ASSERT(false);
}
wait(waitNextOp);
}
}
};
WorkloadFactory<ChangeFeedOperationsWorkload> ChangeFeedOperationsWorkloadFactory("ChangeFeedOperations");
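// The "ChangeFeedOperations" name registered here is what test files reference via
// testName = 'ChangeFeedOperations' (see the new fast/ChangeFeedOperations.toml below).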

View File

@ -325,6 +325,7 @@ struct PhysicalShardMoveWorkLoad : TestWorkload {
TraceEvent("TestCancelDataMoveEnd").detail("DataMove", dataMove.toString());
}
TraceEvent("TestMoveShardStartMoveKeys").detail("DataMove", dataMoveId);
wait(moveKeys(cx,
dataMoveId,
keys,

View File

@ -215,7 +215,8 @@ struct SkewedReadWriteWorkload : ReadWriteCommon {
self->startReadWriteClients(cx, clients);
wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void()));
clients.clear();
wait(delay(5.0) >> updateServerShards(cx, self));
wait(delay(5.0));
wait(updateServerShards(cx, self));
}
return Void();

View File

@ -173,6 +173,19 @@ public:
}
bool coinflip() { return (this->random01() < 0.5); }
// Picks a number between 2^minExp and 2^maxExp, uniformly distributed over the exponential
// buckets [2^n, 2^(n+1)). For example, randomExp(0, 4) has a 25% chance of returning 1, a 25%
// chance of returning 2-3, a 25% chance of returning 4-7, and a 25% chance of returning 8-15.
// Similar in expected value to doing 1 << randomInt(minExp, maxExp+1), except the numbers
// returned aren't just powers of 2.
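// Worked example: randomExp(1, 8) picks one of the buckets [2,4), [4,8), ..., [128,256) with
// equal probability, then returns a uniform value from within that bucket.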
int randomExp(int minExp, int maxExp) {
if (minExp == maxExp) { // degenerate case: only one possible value
return 1 << minExp;
}
int val = 1 << this->randomInt(minExp, maxExp);
return this->randomInt(val, val * 2);
}
};
extern FILE* randLog;

View File

@ -174,6 +174,7 @@ public: // introduced features
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, EncryptionAtRest);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, ShardEncodeLocationMetaData);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Tenants);
PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, BlobGranuleFile);
};
template <>

View File

@ -1978,22 +1978,25 @@ Future<decltype(std::declval<Fun>()(std::declval<T>()).getValue())> runAfter(Fut
return res;
}
ACTOR template <class T, class U>
Future<U> runAfter(Future<T> lhs, Future<U> rhs) {
T val1 = wait(lhs);
U res = wait(rhs);
return res;
}
template <class T, class Fun>
auto operator>>=(Future<T> lhs, Fun&& rhs) -> Future<decltype(rhs(std::declval<T>()))> {
return runAfter(lhs, std::forward<Fun>(rhs));
}
/*
* NOTE: This implementation doesn't actually enforce the ACTOR execution order it appears to promise, so it was disabled. See issue #7708.
ACTOR template <class T, class U>
Future<U> runAfter(Future<T> lhs, Future<U> rhs) {
T val1 = wait(lhs);
U res = wait(rhs);
return res;
}
template <class T, class U>
Future<U> operator>>(Future<T> const& lhs, Future<U> const& rhs) {
return runAfter(lhs, rhs);
return runAfter(lhs, rhs);
}
*/
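/*
* Illustrative contrast (assumed semantics; see the bulkSetup and SkewedReadWrite call sites
* changed in this commit): in
*     wait(delay(5.0) >> waitForLowInFlight(cx, workload));
* the rhs actor is created, and starts running, as soon as the expression is evaluated,
* concurrently with the delay. The sequential form
*     wait(delay(5.0));
*     wait(waitForLowInFlight(cx, workload));
* only creates the second actor after the delay completes, which is the ordering those call
* sites actually wanted.
*/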
/*
* IAsyncListener is similar to AsyncVar, but it decouples the input and output, so the translation unit

View File

@ -130,8 +130,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmall.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmallClean.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/BlobGranuleMoveVerifyCycle.toml)
add_fdb_test(TEST_FILES fast/CacheTest.toml)
add_fdb_test(TEST_FILES fast/CloggedSideband.toml)
add_fdb_test(TEST_FILES fast/CompressionUtilsUnit.toml)
@ -140,6 +139,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/CycleAndLock.toml)
add_fdb_test(TEST_FILES fast/CycleTest.toml)
add_fdb_test(TEST_FILES fast/ChangeFeeds.toml)
add_fdb_test(TEST_FILES fast/ChangeFeedOperations.toml)
add_fdb_test(TEST_FILES fast/DataLossRecovery.toml)
add_fdb_test(TEST_FILES fast/EncryptionOps.toml)
# TODO: fix failures and renable the test
@ -199,6 +199,8 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml IGNORE)
add_fdb_test(TEST_FILES fast/StorageServerCheckpointRestore.toml IGNORE)
endif()
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyAtomicOps.toml)
add_fdb_test(TEST_FILES rare/BlobGranuleVerifyCycle.toml)
add_fdb_test(TEST_FILES rare/CheckRelocation.toml)
add_fdb_test(TEST_FILES rare/ClogUnclog.toml)
add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml)

View File

@ -0,0 +1,48 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4]
[[knobs]]
bg_range_source = "blobRangeKeys"
[[test]]
testTitle = 'BlobGranuleMoveVerifyCycle'
[[test.workload]]
testName = 'Cycle'
transactionsPerSecond = 250.0
testDuration = 60.0
expectedRate = 0
[[test.workload]]
testName = 'RandomMoveKeys'
testDuration = 60.0
[[test.workload]]
testName = 'BlobGranuleVerifier'
testDuration = 60.0
[[test.workload]]
testName = 'RandomClogging'
testDuration = 60.0
[[test.workload]]
testName = 'Rollback'
meanDelay = 60.0
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0
[[test.workload]]
testName = 'Attrition'
machinesToKill = 10
machinesToLeave = 3
reboot = true
testDuration = 60.0

View File

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4, 5]

View File

@ -3,7 +3,7 @@ blobGranulesEnabled = true
allowDefaultTenant = false
# FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [3, 4]
storageEngineExcludeTypes = [3, 4, 5]
[[knobs]]
bg_range_source = "blobRangeKeys"

View File

@ -0,0 +1,10 @@
[configuration]
allowDefaultTenant = false
# TODO add failure events, and then add a version that also supports randomMoveKeys
[[test]]
testTitle = 'ChangeFeedOperationsTest'
[[test.workload]]
testName = 'ChangeFeedOperations'

View File

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

View File

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

View File

@ -2,6 +2,8 @@
blobGranulesEnabled = true
allowDefaultTenant = false
allowDisablingTenants = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

View File

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]

View File

@ -1,6 +1,8 @@
[configuration]
blobGranulesEnabled = true
allowDefaultTenant = false
injectTargetedSSRestart = true
injectSSDelay = true
# FIXME: re-enable rocks at some point
storageEngineExcludeTypes = [4, 5]