Merge remote-tracking branch 'apple/main' into vgasiunas-fdbmonitor-in-python

This commit is contained in:
Vaidas Gasiunas 2022-11-07 10:33:06 +01:00
commit 286f9b729a
53 changed files with 806 additions and 299 deletions

View File

@ -459,8 +459,10 @@ int main(int argc, char** argv) {
retCode = 1; retCode = 1;
} }
fprintf(stderr, "Stopping FDB network thread\n");
fdb_check(fdb::network::stop(), "Failed to stop FDB thread"); fdb_check(fdb::network::stop(), "Failed to stop FDB thread");
network_thread.join(); network_thread.join();
fprintf(stderr, "FDB network thread successfully stopped\n");
} catch (const std::exception& err) { } catch (const std::exception& err) {
fmt::print(stderr, "ERROR: {}\n", err.what()); fmt::print(stderr, "ERROR: {}\n", err.what());
retCode = 1; retCode = 1;

View File

@ -142,6 +142,8 @@ Here is a complete list of valid parameters:
*multipart_min_part_size* (or *minps*) - Min part size for multipart uploads. *multipart_min_part_size* (or *minps*) - Min part size for multipart uploads.
*enable_read_cache* (or *erc*) - Whether to enable read block cache.
*read_block_size* (or *rbs*) - Block size in bytes to be used for reads. *read_block_size* (or *rbs*) - Block size in bytes to be used for reads.
*read_ahead_blocks* (or *rab*) - Number of blocks to read ahead of requested offset. *read_ahead_blocks* (or *rab*) - Number of blocks to read ahead of requested offset.

View File

@ -11,6 +11,7 @@ Release Notes
* Released with AVX disabled. * Released with AVX disabled.
* Fixed a transaction log data corruption bug. `(PR #8525) <https://github.com/apple/foundationdb/pull/8525>`_, `(PR #8562) <https://github.com/apple/foundationdb/pull/8562>`_, and `(PR #8647) <https://github.com/apple/foundationdb/pull/8647>`_ * Fixed a transaction log data corruption bug. `(PR #8525) <https://github.com/apple/foundationdb/pull/8525>`_, `(PR #8562) <https://github.com/apple/foundationdb/pull/8562>`_, and `(PR #8647) <https://github.com/apple/foundationdb/pull/8647>`_
* Fixed a rare data race in transaction logs when PEEK_BATCHING_EMPTY_MSG is enabled. `(PR #8660) <https://github.com/apple/foundationdb/pull/8660>`_ * Fixed a rare data race in transaction logs when PEEK_BATCHING_EMPTY_MSG is enabled. `(PR #8660) <https://github.com/apple/foundationdb/pull/8660>`_
* Fixed a heap-use-after-free bug in cluster controller. `(PR #8683) <https://github.com/apple/foundationdb/pull/8683>`_
* Changed consistency check to report all corruptions. `(PR #8571) <https://github.com/apple/foundationdb/pull/8571>`_ * Changed consistency check to report all corruptions. `(PR #8571) <https://github.com/apple/foundationdb/pull/8571>`_
* Fixed a rare storage server crashing bug after recovery. `(PR #8468) <https://github.com/apple/foundationdb/pull/8468>`_ * Fixed a rare storage server crashing bug after recovery. `(PR #8468) <https://github.com/apple/foundationdb/pull/8468>`_
* Added client knob UNLINKONLOAD_FDBCLIB to control deletion of external client libraries. `(PR #8434) <https://github.com/apple/foundationdb/pull/8434>`_ * Added client knob UNLINKONLOAD_FDBCLIB to control deletion of external client libraries. `(PR #8434) <https://github.com/apple/foundationdb/pull/8434>`_

View File

@ -175,11 +175,13 @@ Future<Reference<IAsyncFile>> BackupContainerS3BlobStore::readFile(const std::st
if (usesEncryption()) { if (usesEncryption()) {
f = makeReference<AsyncFileEncrypted>(f, AsyncFileEncrypted::Mode::READ_ONLY); f = makeReference<AsyncFileEncrypted>(f, AsyncFileEncrypted::Mode::READ_ONLY);
} }
f = makeReference<AsyncFileReadAheadCache>(f, if (m_bstore->knobs.enable_read_cache) {
m_bstore->knobs.read_block_size, f = makeReference<AsyncFileReadAheadCache>(f,
m_bstore->knobs.read_ahead_blocks, m_bstore->knobs.read_block_size,
m_bstore->knobs.concurrent_reads_per_file, m_bstore->knobs.read_ahead_blocks,
m_bstore->knobs.read_cache_blocks_per_file); m_bstore->knobs.concurrent_reads_per_file,
m_bstore->knobs.read_cache_blocks_per_file);
}
return f; return f;
} }

View File

@ -76,6 +76,10 @@ BlobCipherMetrics::BlobCipherMetrics()
UID(), UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL, FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE), FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
getBlobMetadataLatency("GetBlobMetadataLatency",
UID(),
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_INTERVAL,
FLOW_KNOBS->ENCRYPT_KEY_CACHE_LOGGING_SAMPLE_SIZE),
counterSets({ CounterSet(cc, "TLog"), counterSets({ CounterSet(cc, "TLog"),
CounterSet(cc, "KVMemory"), CounterSet(cc, "KVMemory"),
CounterSet(cc, "KVRedwood"), CounterSet(cc, "KVRedwood"),

View File

@ -650,12 +650,12 @@ struct IndexedBlobGranuleFile {
IndexBlobGranuleFileChunkRef chunkRef = IndexBlobGranuleFileChunkRef chunkRef =
IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena); IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena);
ChildType child;
ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), IncludeVersion());
dataReader.deserialize(FileIdentifierFor<ChildType>::value, child, childArena);
// TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused? // TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused?
return Standalone<ChildType>(child, childArena);
BinaryReader br(chunkRef.chunkBytes.get(), IncludeVersion());
Standalone<ChildType> child;
br >> child;
return child;
} }
template <class Ar> template <class Ar>
@ -751,7 +751,7 @@ Value serializeChunkedSnapshot(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) { if (currentChunkBytesEstimate >= targetChunkBytes || i == snapshot.size() - 1) {
Value serialized = Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes = Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes); chunks.push_back(chunkBytes);
@ -1020,7 +1020,7 @@ Value serializeChunkedDeltaFile(const Standalone<StringRef>& fileNameRef,
if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) { if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) {
Value serialized = Value serialized =
ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); BinaryWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile()));
Value chunkBytes = Value chunkBytes =
IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena());
chunks.push_back(chunkBytes); chunks.push_back(chunkBytes);

View File

@ -220,6 +220,7 @@ void ClientKnobs::initialize(Randomize randomize) {
init( BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5 ); init( BLOBSTORE_CONCURRENT_WRITES_PER_FILE, 5 );
init( BLOBSTORE_CONCURRENT_READS_PER_FILE, 3 ); init( BLOBSTORE_CONCURRENT_READS_PER_FILE, 3 );
init( BLOBSTORE_ENABLE_READ_CACHE, true );
init( BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024 ); init( BLOBSTORE_READ_BLOCK_SIZE, 1024 * 1024 );
init( BLOBSTORE_READ_AHEAD_BLOCKS, 0 ); init( BLOBSTORE_READ_AHEAD_BLOCKS, 0 );
init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 ); init( BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE, 2 );

View File

@ -658,7 +658,7 @@ bool DatabaseConfiguration::setInternal(KeyRef key, ValueRef value) {
parse((&type), value); parse((&type), value);
blobGranulesEnabled = (type != 0); blobGranulesEnabled = (type != 0);
} else if (ck == "encryption_at_rest_mode"_sr) { } else if (ck == "encryption_at_rest_mode"_sr) {
encryptionAtRestMode = EncryptionAtRestMode::fromValue(value); encryptionAtRestMode = EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(value));
} else { } else {
return false; return false;
} }

View File

@ -18,6 +18,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "flow/Trace.h"
#ifdef ADDRESS_SANITIZER #ifdef ADDRESS_SANITIZER
#include <sanitizer/lsan_interface.h> #include <sanitizer/lsan_interface.h>
#endif #endif
@ -2812,11 +2813,19 @@ void MultiVersionApi::runNetwork() {
}); });
} }
localClient->api->runNetwork(); try {
localClient->api->runNetwork();
} catch (const Error& e) {
closeTraceFile();
throw e;
}
for (auto h : handles) { for (auto h : handles) {
waitThread(h); waitThread(h);
} }
TraceEvent("MultiVersionRunNetworkTerminating");
closeTraceFile();
} }
void MultiVersionApi::stopNetwork() { void MultiVersionApi::stopNetwork() {

View File

@ -6384,8 +6384,11 @@ ACTOR static Future<Void> tryCommit(Reference<TransactionState> trState,
} }
if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) {
wait(store(req.transaction.read_snapshot, readVersion) && state Future<Optional<ClientTrCommitCostEstimation>> commitCostFuture =
store(req.commitCostEstimation, estimateCommitCosts(trState, &req.transaction))); estimateCommitCosts(trState, &req.transaction);
// We need to wait for the read version first so that we can be notified if the database is locked
wait(store(req.transaction.read_snapshot, readVersion));
wait(store(req.commitCostEstimation, commitCostFuture));
} else { } else {
wait(store(req.transaction.read_snapshot, readVersion)); wait(store(req.transaction.read_snapshot, readVersion));
} }

View File

@ -88,6 +88,7 @@ S3BlobStoreEndpoint::BlobKnobs::BlobKnobs() {
concurrent_lists = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_LISTS; concurrent_lists = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_LISTS;
concurrent_reads_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_READS_PER_FILE; concurrent_reads_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_READS_PER_FILE;
concurrent_writes_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_WRITES_PER_FILE; concurrent_writes_per_file = CLIENT_KNOBS->BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
enable_read_cache = CLIENT_KNOBS->BLOBSTORE_ENABLE_READ_CACHE;
read_block_size = CLIENT_KNOBS->BLOBSTORE_READ_BLOCK_SIZE; read_block_size = CLIENT_KNOBS->BLOBSTORE_READ_BLOCK_SIZE;
read_ahead_blocks = CLIENT_KNOBS->BLOBSTORE_READ_AHEAD_BLOCKS; read_ahead_blocks = CLIENT_KNOBS->BLOBSTORE_READ_AHEAD_BLOCKS;
read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE; read_cache_blocks_per_file = CLIENT_KNOBS->BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;
@ -125,6 +126,7 @@ bool S3BlobStoreEndpoint::BlobKnobs::set(StringRef name, int value) {
TRY_PARAM(concurrent_lists, cl); TRY_PARAM(concurrent_lists, cl);
TRY_PARAM(concurrent_reads_per_file, crpf); TRY_PARAM(concurrent_reads_per_file, crpf);
TRY_PARAM(concurrent_writes_per_file, cwpf); TRY_PARAM(concurrent_writes_per_file, cwpf);
TRY_PARAM(enable_read_cache, erc);
TRY_PARAM(read_block_size, rbs); TRY_PARAM(read_block_size, rbs);
TRY_PARAM(read_ahead_blocks, rab); TRY_PARAM(read_ahead_blocks, rab);
TRY_PARAM(read_cache_blocks_per_file, rcb); TRY_PARAM(read_cache_blocks_per_file, rcb);
@ -162,6 +164,7 @@ std::string S3BlobStoreEndpoint::BlobKnobs::getURLParameters() const {
_CHECK_PARAM(concurrent_lists, cl); _CHECK_PARAM(concurrent_lists, cl);
_CHECK_PARAM(concurrent_reads_per_file, crpf); _CHECK_PARAM(concurrent_reads_per_file, crpf);
_CHECK_PARAM(concurrent_writes_per_file, cwpf); _CHECK_PARAM(concurrent_writes_per_file, cwpf);
_CHECK_PARAM(enable_read_cache, erc);
_CHECK_PARAM(read_block_size, rbs); _CHECK_PARAM(read_block_size, rbs);
_CHECK_PARAM(read_ahead_blocks, rab); _CHECK_PARAM(read_ahead_blocks, rab);
_CHECK_PARAM(read_cache_blocks_per_file, rcb); _CHECK_PARAM(read_cache_blocks_per_file, rcb);

View File

@ -115,6 +115,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true; init( ENABLE_DETAILED_TLOG_POP_TRACE, false ); if ( randomize && BUGGIFY ) ENABLE_DETAILED_TLOG_POP_TRACE = true;
init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true; init( PEEK_BATCHING_EMPTY_MSG, false ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG = true;
init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01; init( PEEK_BATCHING_EMPTY_MSG_INTERVAL, 0.001 ); if ( randomize && BUGGIFY ) PEEK_BATCHING_EMPTY_MSG_INTERVAL = 0.01;
init( POP_FROM_LOG_DELAY, 1 ); if ( randomize && BUGGIFY ) POP_FROM_LOG_DELAY = 0;
// disk snapshot max timeout, to be put in TLog, storage and coordinator nodes // disk snapshot max timeout, to be put in TLog, storage and coordinator nodes
init( MAX_FORKED_PROCESS_OUTPUT, 1024 ); init( MAX_FORKED_PROCESS_OUTPUT, 1024 );
@ -295,7 +296,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000; init( DD_STORAGE_WIGGLE_PAUSE_THRESHOLD, 10 ); if( randomize && BUGGIFY ) DD_STORAGE_WIGGLE_PAUSE_THRESHOLD = 1000;
init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 ); init( DD_STORAGE_WIGGLE_STUCK_THRESHOLD, 20 );
init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120; init( DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC, isSimulated ? 2 : 21 * 60 * 60 * 24 ); if(randomize && BUGGIFY) DD_STORAGE_WIGGLE_MIN_SS_AGE_SEC = isSimulated ? 0: 120;
init( DD_TENANT_AWARENESS_ENABLED, false ); init( DD_TENANT_AWARENESS_ENABLED, false ); if(isSimulated) DD_TENANT_AWARENESS_ENABLED = deterministicRandom()->coinflip();
init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_LIST_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_LIST_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL, 2 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_USAGE_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10); init( TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL, 10 ); if( randomize && BUGGIFY ) TENANT_CACHE_STORAGE_QUOTA_REFRESH_INTERVAL = deterministicRandom()->randomInt(1, 10);
@ -407,6 +408,8 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( ROCKSDB_HISTOGRAMS_SAMPLE_RATE, 0.001 ); if( randomize && BUGGIFY ) ROCKSDB_HISTOGRAMS_SAMPLE_RATE = 0; init( ROCKSDB_HISTOGRAMS_SAMPLE_RATE, 0.001 ); if( randomize && BUGGIFY ) ROCKSDB_HISTOGRAMS_SAMPLE_RATE = 0;
init( ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME, 30.0 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME = 0.1; init( ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME, 30.0 ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME = 0.1;
init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip() ? true : false; init( ROCKSDB_READ_RANGE_REUSE_ITERATORS, true ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_ITERATORS = deterministicRandom()->coinflip() ? true : false;
init( ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS, false ); if( randomize && BUGGIFY ) ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS = deterministicRandom()->coinflip() ? true : false;
init( ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT, 200 );
// Set to 0 to disable rocksdb write rate limiting. Rate limiter unit: bytes per second. // Set to 0 to disable rocksdb write rate limiting. Rate limiter unit: bytes per second.
init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 ); init( ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC, 0 );
// If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO. // If true, enables dynamic adjustment of ROCKSDB_WRITE_RATE_LIMITER_BYTES according to the recent demand of background IO.
@ -958,7 +961,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" ); init( KMS_CONNECTOR_TYPE, "RESTKmsConnector" );
// Blob granlues // Blob granlues
init( BG_URL, isSimulated ? "file://fdbblob/" : "" ); // TODO: store in system key space or something, eventually init( BG_URL, isSimulated ? "file://simfdb/fdbblob/" : "" ); // TODO: store in system key space or something, eventually
bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY); bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY);
// BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs"
init( BG_METADATA_SOURCE, "knobs" ); init( BG_METADATA_SOURCE, "knobs" );
@ -1002,6 +1005,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( BLOB_MANIFEST_BACKUP, false ); init( BLOB_MANIFEST_BACKUP, false );
init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 ); init( BLOB_MANIFEST_BACKUP_INTERVAL, isSimulated ? 5.0 : 30.0 );
init( BLOB_FULL_RESTORE_MODE, false ); init( BLOB_FULL_RESTORE_MODE, false );
init( BLOB_MIGRATOR_CHECK_INTERVAL, isSimulated ? 1.0 : 5.0);
init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 ); init( BGCC_TIMEOUT, isSimulated ? 10.0 : 120.0 );
init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 ); init( BGCC_MIN_INTERVAL, isSimulated ? 1.0 : 10.0 );

View File

@ -743,10 +743,10 @@ void ThreadSafeApi::runNetwork() {
Optional<Error> runErr; Optional<Error> runErr;
try { try {
::runNetwork(); ::runNetwork();
} catch (Error& e) { } catch (const Error& e) {
TraceEvent(SevError, "RunNetworkError").error(e); TraceEvent(SevError, "RunNetworkError").error(e);
runErr = e; runErr = e;
} catch (std::exception& e) { } catch (const std::exception& e) {
runErr = unknown_error(); runErr = unknown_error();
TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what()); TraceEvent(SevError, "RunNetworkError").error(unknown_error()).detail("RootException", e.what());
} catch (...) { } catch (...) {
@ -757,9 +757,9 @@ void ThreadSafeApi::runNetwork() {
for (auto& hook : threadCompletionHooks) { for (auto& hook : threadCompletionHooks) {
try { try {
hook.first(hook.second); hook.first(hook.second);
} catch (Error& e) { } catch (const Error& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(e); TraceEvent(SevError, "NetworkShutdownHookError").error(e);
} catch (std::exception& e) { } catch (const std::exception& e) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what()); TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()).detail("RootException", e.what());
} catch (...) { } catch (...) {
TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error()); TraceEvent(SevError, "NetworkShutdownHookError").error(unknown_error());
@ -767,12 +767,10 @@ void ThreadSafeApi::runNetwork() {
} }
if (runErr.present()) { if (runErr.present()) {
closeTraceFile();
throw runErr.get(); throw runErr.get();
} }
TraceEvent("RunNetworkTerminating"); TraceEvent("RunNetworkTerminating");
closeTraceFile();
} }
void ThreadSafeApi::stopNetwork() { void ThreadSafeApi::stopNetwork() {

View File

@ -103,6 +103,7 @@ public:
Counter latestCipherKeyCacheNeedsRefresh; Counter latestCipherKeyCacheNeedsRefresh;
LatencySample getCipherKeysLatency; LatencySample getCipherKeysLatency;
LatencySample getLatestCipherKeysLatency; LatencySample getLatestCipherKeysLatency;
LatencySample getBlobMetadataLatency;
std::array<CounterSet, int(UsageType::MAX)> counterSets; std::array<CounterSet, int(UsageType::MAX)> counterSets;
}; };

View File

@ -235,6 +235,7 @@ public:
int BLOBSTORE_CONCURRENT_LISTS; int BLOBSTORE_CONCURRENT_LISTS;
int BLOBSTORE_CONCURRENT_WRITES_PER_FILE; int BLOBSTORE_CONCURRENT_WRITES_PER_FILE;
int BLOBSTORE_CONCURRENT_READS_PER_FILE; int BLOBSTORE_CONCURRENT_READS_PER_FILE;
int BLOBSTORE_ENABLE_READ_CACHE;
int BLOBSTORE_READ_BLOCK_SIZE; int BLOBSTORE_READ_BLOCK_SIZE;
int BLOBSTORE_READ_AHEAD_BLOCKS; int BLOBSTORE_READ_AHEAD_BLOCKS;
int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE; int BLOBSTORE_READ_CACHE_BLOCKS_PER_FILE;

View File

@ -546,36 +546,37 @@ struct hash<KeyRange> {
enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits<int64_t>::max() }; enum { invalidVersion = -1, latestVersion = -2, MAX_VERSION = std::numeric_limits<int64_t>::max() };
inline Key keyAfter(const KeyRef& key) {
if (key == "\xff\xff"_sr)
return key;
Standalone<StringRef> r;
uint8_t* s = new (r.arena()) uint8_t[key.size() + 1];
if (key.size() > 0) {
memcpy(s, key.begin(), key.size());
}
s[key.size()] = 0;
((StringRef&)r) = StringRef(s, key.size() + 1);
return r;
}
inline KeyRef keyAfter(const KeyRef& key, Arena& arena) { inline KeyRef keyAfter(const KeyRef& key, Arena& arena) {
if (key == "\xff\xff"_sr) // Don't include fdbclient/SystemData.h for the allKeys symbol to avoid a cyclic include
return key; static const auto allKeysEnd = "\xff\xff"_sr;
if (key == allKeysEnd) {
return allKeysEnd;
}
uint8_t* t = new (arena) uint8_t[key.size() + 1]; uint8_t* t = new (arena) uint8_t[key.size() + 1];
memcpy(t, key.begin(), key.size()); if (!key.empty()) {
memcpy(t, key.begin(), key.size());
}
t[key.size()] = 0; t[key.size()] = 0;
return KeyRef(t, key.size() + 1); return KeyRef(t, key.size() + 1);
} }
inline KeyRange singleKeyRange(const KeyRef& a) { inline Key keyAfter(const KeyRef& key) {
return KeyRangeRef(a, keyAfter(a)); Key result;
result.contents() = keyAfter(key, result.arena());
return result;
} }
inline KeyRangeRef singleKeyRange(KeyRef const& key, Arena& arena) { inline KeyRangeRef singleKeyRange(KeyRef const& key, Arena& arena) {
uint8_t* t = new (arena) uint8_t[key.size() + 1]; uint8_t* t = new (arena) uint8_t[key.size() + 1];
memcpy(t, key.begin(), key.size()); if (!key.empty()) {
memcpy(t, key.begin(), key.size());
}
t[key.size()] = 0; t[key.size()] = 0;
return KeyRangeRef(KeyRef(t, key.size()), KeyRef(t, key.size() + 1)); return KeyRangeRef(KeyRef(t, key.size()), KeyRef(t, key.size() + 1));
} }
inline KeyRange singleKeyRange(const KeyRef& a) {
KeyRange result;
result.contents() = singleKeyRange(a, result.arena());
return result;
}
inline KeyRange prefixRange(KeyRef prefix) { inline KeyRange prefixRange(KeyRef prefix) {
Standalone<KeyRangeRef> range; Standalone<KeyRangeRef> range;
KeyRef start = KeyRef(range.arena(), prefix); KeyRef start = KeyRef(range.arena(), prefix);
@ -1494,7 +1495,7 @@ struct EncryptionAtRestMode {
bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); } bool operator==(const EncryptionAtRestMode& e) const { return isEquals(e); }
bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); } bool operator!=(const EncryptionAtRestMode& e) const { return !isEquals(e); }
static EncryptionAtRestMode fromValue(Optional<ValueRef> val) { static EncryptionAtRestMode fromValueRef(Optional<ValueRef> val) {
if (!val.present()) { if (!val.present()) {
return DISABLED; return DISABLED;
} }
@ -1508,6 +1509,14 @@ struct EncryptionAtRestMode {
return static_cast<Mode>(num); return static_cast<Mode>(num);
} }
static EncryptionAtRestMode fromValue(Optional<Value> val) {
if (!val.present()) {
return EncryptionAtRestMode();
}
return EncryptionAtRestMode::fromValueRef(Optional<ValueRef>(val.get().contents()));
}
uint32_t mode; uint32_t mode;
}; };

View File

@ -58,8 +58,8 @@ public:
requests_per_second, list_requests_per_second, write_requests_per_second, read_requests_per_second, requests_per_second, list_requests_per_second, write_requests_per_second, read_requests_per_second,
delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests, delete_requests_per_second, multipart_max_part_size, multipart_min_part_size, concurrent_requests,
concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file, concurrent_uploads, concurrent_lists, concurrent_reads_per_file, concurrent_writes_per_file,
read_block_size, read_ahead_blocks, read_cache_blocks_per_file, max_send_bytes_per_second, enable_read_cache, read_block_size, read_ahead_blocks, read_cache_blocks_per_file,
max_recv_bytes_per_second, sdk_auth; max_send_bytes_per_second, max_recv_bytes_per_second, sdk_auth;
bool set(StringRef name, int value); bool set(StringRef name, int value);
std::string getURLParameters() const; std::string getURLParameters() const;
static std::vector<std::string> getKnobDescriptions() { static std::vector<std::string> getKnobDescriptions() {
@ -86,6 +86,7 @@ public:
"concurrent_lists (or cl) Max concurrent list operations that can be in progress at once.", "concurrent_lists (or cl) Max concurrent list operations that can be in progress at once.",
"concurrent_reads_per_file (or crps) Max concurrent reads in progress for any one file.", "concurrent_reads_per_file (or crps) Max concurrent reads in progress for any one file.",
"concurrent_writes_per_file (or cwps) Max concurrent uploads in progress for any one file.", "concurrent_writes_per_file (or cwps) Max concurrent uploads in progress for any one file.",
"enable_read_cache (or erc) Whether read block caching is enabled.",
"read_block_size (or rbs) Block size in bytes to be used for reads.", "read_block_size (or rbs) Block size in bytes to be used for reads.",
"read_ahead_blocks (or rab) Number of blocks to read ahead of requested offset.", "read_ahead_blocks (or rab) Number of blocks to read ahead of requested offset.",
"read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.", "read_cache_blocks_per_file (or rcb) Size of the read cache for a file in blocks.",

View File

@ -110,6 +110,7 @@ public:
double BLOCKING_PEEK_TIMEOUT; double BLOCKING_PEEK_TIMEOUT;
bool PEEK_BATCHING_EMPTY_MSG; bool PEEK_BATCHING_EMPTY_MSG;
double PEEK_BATCHING_EMPTY_MSG_INTERVAL; double PEEK_BATCHING_EMPTY_MSG_INTERVAL;
double POP_FROM_LOG_DELAY;
// Data distribution queue // Data distribution queue
double HEALTH_POLL_TIME; double HEALTH_POLL_TIME;
@ -334,6 +335,8 @@ public:
double ROCKSDB_HISTOGRAMS_SAMPLE_RATE; double ROCKSDB_HISTOGRAMS_SAMPLE_RATE;
double ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME; double ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME;
bool ROCKSDB_READ_RANGE_REUSE_ITERATORS; bool ROCKSDB_READ_RANGE_REUSE_ITERATORS;
bool ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS;
int ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT;
int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC; int64_t ROCKSDB_WRITE_RATE_LIMITER_BYTES_PER_SEC;
bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE; bool ROCKSDB_WRITE_RATE_LIMITER_AUTO_TUNE;
std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY; std::string DEFAULT_FDB_ROCKSDB_COLUMN_FAMILY;
@ -982,6 +985,7 @@ public:
bool BLOB_MANIFEST_BACKUP; bool BLOB_MANIFEST_BACKUP;
double BLOB_MANIFEST_BACKUP_INTERVAL; double BLOB_MANIFEST_BACKUP_INTERVAL;
bool BLOB_FULL_RESTORE_MODE; bool BLOB_FULL_RESTORE_MODE;
double BLOB_MIGRATOR_CHECK_INTERVAL;
// Blob metadata // Blob metadata
int64_t BLOB_METADATA_CACHE_TTL; int64_t BLOB_METADATA_CACHE_TTL;

View File

@ -734,6 +734,7 @@ public:
// If cancelled, request was or will be delivered zero or more times. // If cancelled, request was or will be delivered zero or more times.
template <class X> template <class X>
Future<REPLY_TYPE(X)> getReply(const X& value) const { Future<REPLY_TYPE(X)> getReply(const X& value) const {
// Ensure the same request isn't used multiple times
ASSERT(!getReplyPromise(value).getFuture().isReady()); ASSERT(!getReplyPromise(value).getFuture().isReady());
if (queue->isRemoteEndpoint()) { if (queue->isRemoteEndpoint()) {
return sendCanceler(getReplyPromise(value), return sendCanceler(getReplyPromise(value),

View File

@ -477,6 +477,7 @@ public:
Optional<Standalone<StringRef>> primaryDcId; Optional<Standalone<StringRef>> primaryDcId;
Reference<IReplicationPolicy> remoteTLogPolicy; Reference<IReplicationPolicy> remoteTLogPolicy;
int32_t usableRegions; int32_t usableRegions;
bool quiesced = false;
std::string disablePrimary; std::string disablePrimary;
std::string disableRemote; std::string disableRemote;
std::string originalRegions; std::string originalRegions;

View File

@ -1410,6 +1410,7 @@ public:
for (auto processInfo : getAllProcesses()) { for (auto processInfo : getAllProcesses()) {
if (currentDcId != processInfo->locality.dcId() || // skip other dc if (currentDcId != processInfo->locality.dcId() || // skip other dc
processInfo->startingClass != ProcessClass::BlobWorkerClass || // skip non blob workers processInfo->startingClass != ProcessClass::BlobWorkerClass || // skip non blob workers
processInfo->failed || // if process was killed but has not yet been removed from the process list
processInfo->locality.machineId() == machineId) { // skip current machine processInfo->locality.machineId() == machineId) { // skip current machine
continue; continue;
} }

View File

@ -462,7 +462,7 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
} }
// FIXME: if one tenant gets an error, don't kill whole process // FIXME: if one tenant gets an error, don't kill whole process
// TODO: add latency metrics state double startTime = now();
loop { loop {
Future<EKPGetLatestBlobMetadataReply> requestFuture; Future<EKPGetLatestBlobMetadataReply> requestFuture;
if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) { if (self->dbInfo.isValid() && self->dbInfo->get().encryptKeyProxy.present()) {
@ -485,6 +485,8 @@ ACTOR Future<Void> loadBlobMetadataForTenants(
ASSERT(dataEntry.begin() == info->second.prefix); ASSERT(dataEntry.begin() == info->second.prefix);
dataEntry.cvalue()->updateBStore(metadata); dataEntry.cvalue()->updateBStore(metadata);
} }
double elapsed = now() - startTime;
BlobCipherMetrics::getInstance()->getBlobMetadataLatency.addMeasurement(elapsed);
return Void(); return Void();
} }
when(wait(self->dbInfo->onChange())) {} when(wait(self->dbInfo->onChange())) {}

View File

@ -4238,7 +4238,13 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
Version purgeVersion, Version purgeVersion,
KeyRange granuleRange, KeyRange granuleRange,
Optional<UID> mergeChildID, Optional<UID> mergeChildID,
bool force) { bool force,
Future<Void> parentFuture) {
// wait for parent to finish first to avoid ordering/orphaning issues
wait(parentFuture);
// yield to avoid a long callstack and to allow this to get cancelled
wait(delay(0));
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule [{1} - {2}): {3} @ {4}{5}\n", fmt::print("BM {0} Fully deleting granule [{1} - {2}): {3} @ {4}{5}\n",
self->epoch, self->epoch,
@ -4296,6 +4302,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
// deleting files before corresponding metadata reduces the # of orphaned files. // deleting files before corresponding metadata reduces the # of orphaned files.
wait(waitForAll(deletions)); wait(waitForAll(deletions));
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
// delete metadata in FDB (history entry and file keys) // delete metadata in FDB (history entry and file keys)
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print( fmt::print(
@ -4331,6 +4342,11 @@ ACTOR Future<Void> fullyDeleteGranule(Reference<BlobManagerData> self,
} }
} }
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Fully deleting granule {1}: success {2}\n", fmt::print("BM {0} Fully deleting granule {1}: success {2}\n",
self->epoch, self->epoch,
@ -4501,7 +4517,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
state std::queue<std::tuple<KeyRange, Version, Version, Optional<UID>>> historyEntryQueue; state std::queue<std::tuple<KeyRange, Version, Version, Optional<UID>>> historyEntryQueue;
// stacks of <granuleId, historyKey> and <granuleId> (and mergeChildID) to track which granules to delete // stacks of <granuleId, historyKey> and <granuleId> (and mergeChildID) to track which granules to delete
state std::vector<std::tuple<UID, Key, KeyRange, Optional<UID>>> toFullyDelete; state std::vector<std::tuple<UID, Key, KeyRange, Optional<UID>, Version>> toFullyDelete;
state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete; state std::vector<std::pair<UID, KeyRange>> toPartiallyDelete;
// track which granules we have already added to traversal // track which granules we have already added to traversal
@ -4737,7 +4753,7 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
fmt::print( fmt::print(
"BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString()); "BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString());
} }
toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange, mergeChildID }); toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange, mergeChildID, startVersion });
} else if (startVersion < purgeVersion) { } else if (startVersion < purgeVersion) {
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print("BM {0} Granule {1} will be partially deleted\n", fmt::print("BM {0} Granule {1} will be partially deleted\n",
@ -4810,36 +4826,65 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
.detail("DeletingFullyCount", toFullyDelete.size()) .detail("DeletingFullyCount", toFullyDelete.size())
.detail("DeletingPartiallyCount", toPartiallyDelete.size()); .detail("DeletingPartiallyCount", toPartiallyDelete.size());
state std::vector<Future<Void>> partialDeletions;
state int i; state int i;
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to fully delete\n", self->epoch, toFullyDelete.size()); fmt::print("BM {0}: {1} granules to fully delete\n", self->epoch, toFullyDelete.size());
} }
// Go backwards through set of granules to guarantee deleting oldest first. This avoids orphaning granules in the // Go backwards through set of granules to guarantee deleting oldest first. This avoids orphaning granules in the
// deletion process // deletion process
// FIXME: could track explicit parent dependencies and parallelize so long as a parent and child aren't running in if (!toFullyDelete.empty()) {
// parallel, but that's non-trivial state std::vector<Future<Void>> fullDeletions;
for (i = toFullyDelete.size() - 1; i >= 0; --i) { KeyRangeMap<std::pair<Version, Future<Void>>> parentDelete;
state UID granuleId; parentDelete.insert(normalKeys, { 0, Future<Void>(Void()) });
Key historyKey;
KeyRange keyRange; std::vector<std::pair<Version, int>> deleteOrder;
Optional<UID> mergeChildId; deleteOrder.reserve(toFullyDelete.size());
std::tie(granuleId, historyKey, keyRange, mergeChildId) = toFullyDelete[i]; for (int i = 0; i < toFullyDelete.size(); i++) {
// FIXME: consider batching into a single txn (need to take care of txn size limit) deleteOrder.push_back({ std::get<4>(toFullyDelete[i]), i });
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
} }
wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, keyRange, mergeChildId, force)); std::sort(deleteOrder.begin(), deleteOrder.end());
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled for (i = 0; i < deleteOrder.size(); i++) {
ASSERT(false); state UID granuleId;
Key historyKey;
KeyRange keyRange;
Optional<UID> mergeChildId;
Version startVersion;
std::tie(granuleId, historyKey, keyRange, mergeChildId, startVersion) =
toFullyDelete[deleteOrder[i].second];
// FIXME: consider batching into a single txn (need to take care of txn size limit)
if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString());
}
std::vector<Future<Void>> parents;
auto parentRanges = parentDelete.intersectingRanges(keyRange);
for (auto& it : parentRanges) {
if (startVersion <= it.cvalue().first) {
fmt::print("ERROR: [{0} - {1}) @ {2} <= [{3} - {4}) @ {5}\n",
keyRange.begin.printable(),
keyRange.end.printable(),
startVersion,
it.begin().printable(),
it.end().printable(),
it.cvalue().first);
}
ASSERT(startVersion > it.cvalue().first);
parents.push_back(it.cvalue().second);
}
Future<Void> deleteFuture = fullyDeleteGranule(
self, granuleId, historyKey, purgeVersion, keyRange, mergeChildId, force, waitForAll(parents));
fullDeletions.push_back(deleteFuture);
parentDelete.insert(keyRange, { startVersion, deleteFuture });
} }
wait(waitForAll(fullDeletions));
} }
if (BM_PURGE_DEBUG) { if (BM_PURGE_DEBUG) {
fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size()); fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size());
} }
state std::vector<Future<Void>> partialDeletions;
for (i = toPartiallyDelete.size() - 1; i >= 0; --i) { for (i = toPartiallyDelete.size() - 1; i >= 0; --i) {
UID granuleId; UID granuleId;
KeyRange keyRange; KeyRange keyRange;
@ -4852,6 +4897,11 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
wait(waitForAll(partialDeletions)); wait(waitForAll(partialDeletions));
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
if (force) { if (force) {
tr.reset(); tr.reset();
tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
@ -4877,6 +4927,11 @@ ACTOR Future<Void> purgeRange(Reference<BlobManagerData> self, KeyRangeRef range
} }
} }
if (BUGGIFY && self->maybeInjectTargetedRestart()) {
wait(delay(0)); // should be cancelled
ASSERT(false);
}
// Now that all the necessary granules and their files have been deleted, we can // Now that all the necessary granules and their files have been deleted, we can
// clear the purgeIntent key to signify that the work is done. However, there could have been // clear the purgeIntent key to signify that the work is done. However, there could have been
// another purgeIntent that got written for this table while we were processing this one. // another purgeIntent that got written for this table while we were processing this one.

View File

@ -18,8 +18,6 @@
* limitations under the License. * limitations under the License.
*/ */
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
#include "flow/FastRef.h" #include "flow/FastRef.h"
#include "flow/IRandom.h" #include "flow/IRandom.h"
@ -35,6 +33,8 @@
#include "fdbserver/WaitFailure.h" #include "fdbserver/WaitFailure.h"
#include "fdbserver/MoveKeys.actor.h" #include "fdbserver/MoveKeys.actor.h"
#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h"
#include "flow/actorcompiler.h" // has to be last include #include "flow/actorcompiler.h" // has to be last include
#include "flow/network.h" #include "flow/network.h"
#include <algorithm> #include <algorithm>
@ -72,7 +72,7 @@ public:
self->blobGranules_ = granules; self->blobGranules_ = granules;
wait(prepare(self, normalKeys)); wait(prepare(self, normalKeys));
wait(advanceVersion(self));
wait(serverLoop(self)); wait(serverLoop(self));
return Void(); return Void();
} }
@ -148,9 +148,78 @@ private:
} }
} }
// Periodically report migration progress until every range has been migrated.
// Polls checkProgress() and sleeps BLOB_MIGRATOR_CHECK_INTERVAL between polls;
// the actor completes as soon as checkProgress() reports that nothing is left.
ACTOR static Future<Void> logProgress(Reference<BlobMigrator> self) {
    loop {
        bool migrationFinished = wait(checkProgress(self));
        if (migrationFinished) {
            return Void();
        }
        // Not done yet: wait one check interval before polling again.
        wait(delay(SERVER_KNOBS->BLOB_MIGRATOR_CHECK_INTERVAL));
    }
}
// Check which key ranges have been migrated. Returns true once all ranges are done.
//
// Reads the server-keys map of the migrator's storage interface: every range that is
// still marked serverKeysTrue is still owned by the migrator, i.e. its migration is
// incomplete. The byte sizes of those ranges are summed and compared with the total
// size to print a progress percentage. Retries through tr.onError() on failure.
ACTOR static Future<bool> checkProgress(Reference<BlobMigrator> self) {
    state Transaction tr(self->db_);
    loop {
        // Options are set on every iteration, including retries after onError().
        tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
        tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr.setOption(FDBTransactionOptions::LOCK_AWARE);
        try {
            // Get key ranges that are still owned by the migrator. Those ranges are
            // incomplete migrations.
            state UID serverID = self->interf_.ssi.id();
            RangeResult ranges = wait(krmGetRanges(&tr, serverKeysPrefixFor(serverID), normalKeys));

            // Sum the size of the incomplete ranges. Each entry is a range boundary, so
            // entry i describes [ranges[i].key, ranges[i + 1].key). Writing the bound as
            // i + 1 < size() keeps the loop safe for an empty result regardless of the
            // signedness of size().
            int64_t incompleted = 0;
            for (int i = 0; i + 1 < ranges.size(); ++i) {
                if (ranges[i].value == serverKeysTrue) {
                    KeyRangeRef range(ranges[i].key, ranges[i + 1].key);
                    int64_t bytes = sizeInBytes(self, range);
                    dprint("   incompleted {}, size: {}\n", range.toString(), bytes);
                    incompleted += bytes;
                }
            }

            // Calculate progress. A total of zero would make the percentage a division
            // by zero, so report it directly from the completion state in that case.
            int64_t total = sizeInBytes(self);
            bool done = incompleted == 0;
            int progress = total > 0 ? static_cast<int>((total - incompleted) * 100 / total) : (done ? 100 : 0);
            dprint("Progress {} :{}%. done {}\n", serverID.toString(), progress, done);
            return done;
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}
// Advance version, so that future commits will have a larger version than the restored data.
//
// Compares the cluster's current raw read version with maxVersion(self). If the cluster
// is not already past that version, writes maxVersion + 1 to minRequiredCommitVersionKey
// and commits. Retries through tr.onError() on failure.
ACTOR static Future<Void> advanceVersion(Reference<BlobMigrator> self) {
    state Transaction tr(self->db_);
    loop {
        // Options are set on every iteration, including retries after onError().
        tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
        tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
        tr.setOption(FDBTransactionOptions::LOCK_AWARE);
        try {
            Version currentVersion = wait(tr.getRawReadVersion());
            Version expectedVersion = maxVersion(self);
            if (currentVersion <= expectedVersion) {
                // The value written is one past the largest restored version; log that
                // same value so the message matches what is actually committed.
                Version targetVersion = expectedVersion + 1;
                tr.set(minRequiredCommitVersionKey, BinaryWriter::toValue(targetVersion, Unversioned()));
                dprint("Advance version from {} to {}\n", currentVersion, targetVersion);
                wait(tr.commit());
            }
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}
// Main server loop // Main server loop
ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) { ACTOR static Future<Void> serverLoop(Reference<BlobMigrator> self) {
self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture())); self->actors_.add(waitFailureServer(self->interf_.ssi.waitFailure.getFuture()));
self->actors_.add(logProgress(self));
self->actors_.add(handleRequest(self)); self->actors_.add(handleRequest(self));
self->actors_.add(handleUnsupportedRequest(self)); self->actors_.add(handleUnsupportedRequest(self));
loop { loop {

View File

@ -4470,9 +4470,10 @@ ACTOR Future<Void> handleRangeAssign(Reference<BlobWorkerData> bwData,
return Void(); return Void();
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_operation_cancelled) { if (e.code() == error_code_operation_cancelled) {
if (!bwData->shuttingDown) { if (!bwData->shuttingDown && !isSelfReassign) {
// the cancelled was because the granule open was cancelled, not because the whole blob // the cancelled was because the granule open was cancelled, not because the whole blob
// worker was. // worker was.
ASSERT(!req.reply.isSet());
req.reply.sendError(granule_assignment_conflict()); req.reply.sendError(granule_assignment_conflict());
} }
throw e; throw e;

View File

@ -25,6 +25,7 @@
#include <set> #include <set>
#include <vector> #include <vector>
#include "fdbclient/FDBTypes.h"
#include "fdbclient/SystemData.h" #include "fdbclient/SystemData.h"
#include "fdbclient/DatabaseContext.h" #include "fdbclient/DatabaseContext.h"
#include "fdbrpc/FailureMonitor.h" #include "fdbrpc/FailureMonitor.h"
@ -32,6 +33,7 @@
#include "fdbserver/BlobGranuleServerCommon.actor.h" #include "fdbserver/BlobGranuleServerCommon.actor.h"
#include "fdbserver/BlobMigratorInterface.h" #include "fdbserver/BlobMigratorInterface.h"
#include "fdbserver/Knobs.h" #include "fdbserver/Knobs.h"
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/ActorCollection.h" #include "flow/ActorCollection.h"
#include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/ClusterConnectionMemoryRecord.h"
#include "fdbclient/NativeAPI.actor.h" #include "fdbclient/NativeAPI.actor.h"
@ -66,6 +68,7 @@
#include "fdbrpc/ReplicationUtils.h" #include "fdbrpc/ReplicationUtils.h"
#include "fdbrpc/sim_validation.h" #include "fdbrpc/sim_validation.h"
#include "fdbclient/KeyBackedTypes.h" #include "fdbclient/KeyBackedTypes.h"
#include "flow/Error.h"
#include "flow/Trace.h" #include "flow/Trace.h"
#include "flow/Util.h" #include "flow/Util.h"
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
@ -389,7 +392,7 @@ ACTOR Future<Void> clusterWatchDatabase(ClusterControllerData* cluster,
wait(delay(0.0)); wait(delay(0.0));
recoveryCore.cancel(); recoveryCore.cancel();
wait(cleanupRecoveryActorCollection(recoveryData, true /* exThrown */)); wait(cleanupRecoveryActorCollection(recoveryData, /*exThrown=*/true));
ASSERT(addActor.isEmpty()); ASSERT(addActor.isEmpty());
CODE_PROBE(err.code() == error_code_tlog_failed, "Terminated due to tLog failure"); CODE_PROBE(err.code() == error_code_tlog_failed, "Terminated due to tLog failure");
@ -3025,6 +3028,18 @@ ACTOR Future<Void> updateClusterId(ClusterControllerData* self) {
} }
} }
// Serves GetEncryptionAtRestMode requests received on the cluster controller interface.
// Requests are handled one at a time: each reply waits on self->encryptionAtRestMode's
// future and then carries that mode back to the requester. A trace event is emitted
// before and after each request.
ACTOR Future<Void> handleGetEncryptionAtRestMode(ClusterControllerData* self, ClusterControllerFullInterface ccInterf) {
    loop {
        state GetEncryptionAtRestModeRequest req = waitNext(ccInterf.getEncryptionAtRestMode.getFuture());
        TraceEvent("HandleGetEncryptionAtRestModeStart").detail("TlogId", req.tlogId);

        // Wait until the mode has been set on the promise.
        EncryptionAtRestMode resolvedMode = wait(self->encryptionAtRestMode.getFuture());

        GetEncryptionAtRestModeResponse response;
        response.mode = resolvedMode;
        req.reply.send(response);

        TraceEvent("HandleGetEncryptionAtRestModeEnd").detail("TlogId", req.tlogId).detail("Mode", response.mode);
    }
}
ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf, ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
Future<Void> leaderFail, Future<Void> leaderFail,
ServerCoordinators coordinators, ServerCoordinators coordinators,
@ -3070,6 +3085,7 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
self.addActor.send(metaclusterMetricsUpdater(&self)); self.addActor.send(metaclusterMetricsUpdater(&self));
self.addActor.send(dbInfoUpdater(&self)); self.addActor.send(dbInfoUpdater(&self));
self.addActor.send(updateClusterId(&self)); self.addActor.send(updateClusterId(&self));
self.addActor.send(handleGetEncryptionAtRestMode(&self, interf));
self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics", self.addActor.send(self.clusterControllerMetrics.traceCounters("ClusterControllerMetrics",
self.id, self.id,
SERVER_KNOBS->STORAGE_LOGGING_DELAY, SERVER_KNOBS->STORAGE_LOGGING_DELAY,
@ -3090,8 +3106,8 @@ ACTOR Future<Void> clusterControllerCore(ClusterControllerFullInterface interf,
endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Stop Received Signal", true); endRole(Role::CLUSTER_CONTROLLER, interf.id(), "Stop Received Signal", true);
} }
// We shut down normally even if there was a serious error (so this fdbserver may be re-elected cluster // We shut down normally even if there was a serious error (so this fdbserver may be re-elected
// controller) // cluster controller)
return Void(); return Void();
} }
when(OpenDatabaseRequest req = waitNext(interf.clientInterface.openDatabase.getFuture())) { when(OpenDatabaseRequest req = waitNext(interf.clientInterface.openDatabase.getFuture())) {
@ -3243,11 +3259,11 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
Reference<AsyncVar<Optional<UID>>> clusterId) { Reference<AsyncVar<Optional<UID>>> clusterId) {
// Defer this wait optimization if cluster configuration has 'Encryption data at-rest' enabled. // Defer this wait optimization if cluster configuration has 'Encryption data at-rest' enabled.
// Encryption depends on availability of EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of encryption keys // Encryption depends on availability of EncryptKeyProxy (EKP) FDB role to enable fetch/refresh of
// created and managed by external KeyManagementService (KMS). // encryption keys created and managed by external KeyManagementService (KMS).
// //
// TODO: Wait optimization is to ensure the worker server on the same process gets registered with the new CC before // TODO: Wait optimization is to ensure the worker server on the same process gets registered with the
// recruitment. Unify the codepath for both Encryption enable vs disable scenarios. // new CC before recruitment. Unify the codepath for both Encryption enable vs disable scenarios.
if (!SERVER_KNOBS->ENABLE_ENCRYPTION) { if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
wait(recoveredDiskFiles); wait(recoveredDiskFiles);
@ -3278,8 +3294,8 @@ ACTOR Future<Void> clusterController(Reference<IClusterConnectionRecord> connRec
namespace { namespace {
// Tests `ClusterControllerData::updateWorkerHealth()` can update `ClusterControllerData::workerHealth` based on // Tests `ClusterControllerData::updateWorkerHealth()` can update `ClusterControllerData::workerHealth`
// `UpdateWorkerHealth` request correctly. // based on `UpdateWorkerHealth` request correctly.
TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") { TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
// Create a testing ClusterControllerData. Most of the internal states do not matter in this test. // Create a testing ClusterControllerData. Most of the internal states do not matter in this test.
state ClusterControllerData data(ClusterControllerFullInterface(), state ClusterControllerData data(ClusterControllerFullInterface(),
@ -3292,8 +3308,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
state NetworkAddress badPeer2(IPAddress(0x03030303), 1); state NetworkAddress badPeer2(IPAddress(0x03030303), 1);
state NetworkAddress badPeer3(IPAddress(0x04040404), 1); state NetworkAddress badPeer3(IPAddress(0x04040404), 1);
// Create a `UpdateWorkerHealthRequest` with two bad peers, and they should appear in the `workerAddress`'s // Create a `UpdateWorkerHealthRequest` with two bad peers, and they should appear in the
// degradedPeers. // `workerAddress`'s degradedPeers.
{ {
UpdateWorkerHealthRequest req; UpdateWorkerHealthRequest req;
req.address = workerAddress; req.address = workerAddress;
@ -3354,8 +3370,8 @@ TEST_CASE("/fdbserver/clustercontroller/updateWorkerHealth") {
previousRefreshTime = health.degradedPeers[badPeer3].lastRefreshTime; previousRefreshTime = health.degradedPeers[badPeer3].lastRefreshTime;
} }
// Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker from // Create a `UpdateWorkerHealthRequest` with empty `degradedPeers`, which should not remove the worker
// `workerHealth`. // from `workerHealth`.
{ {
wait(delay(0.001)); wait(delay(0.001));
UpdateWorkerHealthRequest req; UpdateWorkerHealthRequest req;
@ -3439,8 +3455,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
NetworkAddress badPeer3(IPAddress(0x04040404), 1); NetworkAddress badPeer3(IPAddress(0x04040404), 1);
NetworkAddress badPeer4(IPAddress(0x05050505), 1); NetworkAddress badPeer4(IPAddress(0x05050505), 1);
// Test that a reported degraded link should stay for sometime before being considered as a degraded link by // Test that a reported degraded link should stay for sometime before being considered as a degraded
// cluster controller. // link by cluster controller.
{ {
data.workerHealth[worker].degradedPeers[badPeer1] = { now(), now() }; data.workerHealth[worker].degradedPeers[badPeer1] = { now(), now() };
data.workerHealth[worker].disconnectedPeers[badPeer2] = { now(), now() }; data.workerHealth[worker].disconnectedPeers[badPeer2] = { now(), now() };
@ -3472,7 +3488,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
data.workerHealth.clear(); data.workerHealth.clear();
} }
// Test that if both A complains B and B complains A, only one of the servers will be chosen as degraded server. // Test that if both A complains B and B complains A, only one of the servers will be chosen as degraded
// server.
{ {
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1, data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,
now() }; now() };
@ -3553,8 +3570,8 @@ TEST_CASE("/fdbserver/clustercontroller/getDegradationInfo") {
data.workerHealth.clear(); data.workerHealth.clear();
} }
// Test that if the degradation is reported both ways between A and other 4 servers, no degraded server is // Test that if the degradation is reported both ways between A and other 4 servers, no degraded server
// returned. // is returned.
{ {
ASSERT(SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE < 4); ASSERT(SERVER_KNOBS->CC_DEGRADED_PEER_DEGREE_TO_EXCLUDE < 4);
data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1, data.workerHealth[worker].degradedPeers[badPeer1] = { now() - SERVER_KNOBS->CC_MIN_DEGRADATION_INTERVAL - 1,

View File

@ -18,12 +18,14 @@
* limitations under the License. * limitations under the License.
*/ */
#include "fdbclient/FDBTypes.h"
#include "fdbclient/Metacluster.h" #include "fdbclient/Metacluster.h"
#include "fdbrpc/sim_validation.h" #include "fdbrpc/sim_validation.h"
#include "fdbserver/ApplyMetadataMutation.h" #include "fdbserver/ApplyMetadataMutation.h"
#include "fdbserver/BackupProgress.actor.h" #include "fdbserver/BackupProgress.actor.h"
#include "fdbserver/ClusterRecovery.actor.h" #include "fdbserver/ClusterRecovery.actor.h"
#include "fdbserver/EncryptionOpsUtils.h" #include "fdbserver/EncryptionOpsUtils.h"
#include "fdbserver/Knobs.h"
#include "fdbserver/MasterInterface.h" #include "fdbserver/MasterInterface.h"
#include "fdbserver/WaitFailure.h" #include "fdbserver/WaitFailure.h"
@ -429,18 +431,34 @@ ACTOR Future<Void> rejoinRequestHandler(Reference<ClusterRecoveryData> self) {
} }
} }
namespace {
// Returns the encryption-at-rest mode this process should report for the cluster:
// AES_256_CTR when the ENABLE_ENCRYPTION server knob is set, otherwise a
// default-constructed EncryptionAtRestMode.
EncryptionAtRestMode getEncryptionAtRest() {
    // TODO: Use db-config encryption config to determine cluster encryption status
    if (!SERVER_KNOBS->ENABLE_ENCRYPTION) {
        return EncryptionAtRestMode();
    }
    return EncryptionAtRestMode(EncryptionAtRestMode::Mode::AES_256_CTR);
}
} // namespace
// Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery. // Keeps the coordinated state (cstate) updated as the set of recruited tlogs change through recovery.
ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self, ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems, Reference<AsyncVar<Reference<ILogSystem>>> oldLogSystems,
Future<Void> minRecoveryDuration) { Future<Void> minRecoveryDuration) {
state Future<Void> rejoinRequests = Never(); state Future<Void> rejoinRequests = Never();
state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1; state DBRecoveryCount recoverCount = self->cstate.myDBState.recoveryCount + 1;
state EncryptionAtRestMode encryptionAtRestMode = getEncryptionAtRest();
state DatabaseConfiguration configuration = state DatabaseConfiguration configuration =
self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy self->configuration; // self-configuration can be changed by configurationMonitor so we need a copy
loop { loop {
state DBCoreState newState; state DBCoreState newState;
self->logSystem->toCoreState(newState); self->logSystem->toCoreState(newState);
newState.recoveryCount = recoverCount; newState.recoveryCount = recoverCount;
// Update Coordinators EncryptionAtRest status during the very first recovery of the cluster (empty database)
newState.encryptionAtRestMode = encryptionAtRestMode;
state Future<Void> changed = self->logSystem->onCoreStateChanged(); state Future<Void> changed = self->logSystem->onCoreStateChanged();
ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum && ASSERT(newState.tLogs[0].tLogWriteAntiQuorum == configuration.tLogWriteAntiQuorum &&
@ -454,6 +472,7 @@ ACTOR Future<Void> trackTlogRecovery(Reference<ClusterRecoveryData> self,
.detail("FinalUpdate", finalUpdate) .detail("FinalUpdate", finalUpdate)
.detail("NewState.tlogs", newState.tLogs.size()) .detail("NewState.tlogs", newState.tLogs.size())
.detail("NewState.OldTLogs", newState.oldTLogData.size()) .detail("NewState.OldTLogs", newState.oldTLogData.size())
.detail("NewState.EncryptionAtRestMode", newState.encryptionAtRestMode.toString())
.detail("Expected.tlogs", .detail("Expected.tlogs",
configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>())); configuration.expectedLogSets(self->primaryDcId.size() ? self->primaryDcId[0] : Optional<Key>()));
wait(self->cstate.write(newState, finalUpdate)); wait(self->cstate.write(newState, finalUpdate));
@ -934,7 +953,7 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
.detail("Status", RecoveryStatus::names[status]) .detail("Status", RecoveryStatus::names[status])
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
return Never(); return Never();
} else } else {
TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(), TraceEvent(getRecoveryEventName(ClusterRecoveryEventType::CLUSTER_RECOVERY_STATE_EVENT_NAME).c_str(),
self->dbgid) self->dbgid)
.detail("StatusCode", RecoveryStatus::recruiting_transaction_servers) .detail("StatusCode", RecoveryStatus::recruiting_transaction_servers)
@ -945,6 +964,12 @@ ACTOR Future<std::vector<Standalone<CommitTransactionRef>>> recruitEverything(
.detail("RequiredResolvers", 1) .detail("RequiredResolvers", 1)
.trackLatest(self->clusterRecoveryStateEventHolder->trackingKey); .trackLatest(self->clusterRecoveryStateEventHolder->trackingKey);
// The cluster's EncryptionAtRest status is now readable.
if (self->controllerData->encryptionAtRestMode.canBeSet()) {
self->controllerData->encryptionAtRestMode.send(getEncryptionAtRest());
}
}
// FIXME: we only need log routers for the same locality as the master // FIXME: we only need log routers for the same locality as the master
int maxLogRouters = self->cstate.prevDBState.logRouterTags; int maxLogRouters = self->cstate.prevDBState.logRouterTags;
for (auto& old : self->cstate.prevDBState.oldTLogData) { for (auto& old : self->cstate.prevDBState.oldTLogData) {
@ -1443,6 +1468,12 @@ ACTOR Future<Void> clusterRecoveryCore(Reference<ClusterRecoveryData> self) {
wait(self->cstate.read()); wait(self->cstate.read());
// Unless the cluster database is 'empty', the cluster's EncryptionAtRest status is readable once cstate is
// recovered
if (!self->cstate.myDBState.tLogs.empty() && self->controllerData->encryptionAtRestMode.canBeSet()) {
self->controllerData->encryptionAtRestMode.send(self->cstate.myDBState.encryptionAtRestMode);
}
if (self->cstate.prevDBState.lowestCompatibleProtocolVersion > currentProtocolVersion()) { if (self->cstate.prevDBState.lowestCompatibleProtocolVersion > currentProtocolVersion()) {
TraceEvent(SevWarnAlways, "IncompatibleProtocolVersion", self->dbgid).log(); TraceEvent(SevWarnAlways, "IncompatibleProtocolVersion", self->dbgid).log();
throw internal_error(); throw internal_error();

View File

@ -20,6 +20,7 @@
#include <algorithm> #include <algorithm>
#include <tuple> #include <tuple>
#include <variant>
#include "fdbclient/Atomic.h" #include "fdbclient/Atomic.h"
#include "fdbclient/BackupAgent.actor.h" #include "fdbclient/BackupAgent.actor.h"
@ -68,6 +69,8 @@
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
#include "flow/network.h" #include "flow/network.h"
using WriteMutationRefVar = std::variant<MutationRef, VectorRef<MutationRef>>;
ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) { ACTOR Future<Void> broadcastTxnRequest(TxnStateRequest req, int sendAmount, bool sendReply) {
state ReplyPromise<Void> reply = req.reply; state ReplyPromise<Void> reply = req.reply;
resetReply(req); resetReply(req);
@ -1256,16 +1259,78 @@ ACTOR Future<Void> applyMetadataToCommittedTransactions(CommitBatchContext* self
return Void(); return Void();
} }
// writeMutationEncryptedMutation: handles a mutation that already arrives encrypted
// (taken from *encryptedMutationOpt). The encrypted mutation is decrypted and checked
// against the plaintext `mutation`, then written to self->toCommit and returned.
// The body asserts that encryption is enabled and that the network is simulated.
// NOTE(review): each signature line below appears to carry both the removed
// `writeMutation` declaration and this function's declaration (side-by-side diff
// rendering) — confirm against the real file.
ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self, ACTOR Future<WriteMutationRefVar> writeMutationEncryptedMutation(CommitBatchContext* self,
int64_t tenantId, int64_t tenantId,
const MutationRef* mutation, const MutationRef* mutation,
Optional<MutationRef>* encryptedMutationOpt, Optional<MutationRef>* encryptedMutationOpt,
Arena* arena) { Arena* arena) {
// Copy the pre-encrypted mutation into actor state so it is still available after the wait below.
state MutationRef encryptedMutation = encryptedMutationOpt->get();
state const BlobCipherEncryptHeader* header;
static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID);
ASSERT(self->pProxyCommitData->isEncryptionEnabled);
// This path is only expected to run under simulation.
ASSERT(g_network && g_network->isSimulated());
ASSERT(encryptedMutation.isEncrypted());
Reference<AsyncVar<ServerDBInfo> const> dbInfo = self->pProxyCommitData->db;
header = encryptedMutation.encryptionHeader();
// Fetch the cipher keys named by the mutation's encryption header.
TextAndHeaderCipherKeys cipherKeys = wait(getEncryptCipherKeys(dbInfo, *header, BlobCipherMetrics::TLOG));
MutationRef decryptedMutation = encryptedMutation.decrypt(cipherKeys, *arena, BlobCipherMetrics::TLOG);
// The decrypted mutation must match the plaintext mutation in both params and type.
ASSERT(decryptedMutation.param1 == mutation->param1 && decryptedMutation.param2 == mutation->param2 &&
decryptedMutation.type == mutation->type);
CODE_PROBE(true, "encrypting non-metadata mutations");
// Write the already-encrypted form, not the decrypted copy.
self->toCommit.writeTypedMessage(encryptedMutation);
return encryptedMutation;
}
// Encrypts `mutation` after fetching the latest cipher key for its encryption domain,
// then writes the encrypted mutation to self->toCommit and returns it.
//
// The domain is taken from getEncryptDetailsFromMutationRef() rather than from tenantId.
// The fetched key is stored in self->cipherKeys under that domain before encrypting.
// ClearRange mutations are not accepted here (asserted).
ACTOR Future<WriteMutationRefVar> writeMutationFetchEncryptKey(CommitBatchContext* self,
                                                               int64_t tenantId,
                                                               const MutationRef* mutation,
                                                               Arena* arena) {
    state EncryptCipherDomainId domainId = tenantId;
    state MutationRef encryptedMutation;

    static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID);
    ASSERT(self->pProxyCommitData->isEncryptionEnabled);
    ASSERT_NE(static_cast<MutationRef::Type>(mutation->type), MutationRef::Type::ClearRange);

    // Resolve the encryption domain (name and id) that owns this mutation.
    std::pair<EncryptCipherDomainName, EncryptCipherDomainId> domainDetails =
        getEncryptDetailsFromMutationRef(self->pProxyCommitData, *mutation);
    domainId = domainDetails.second;

    // Fetch the latest key for the domain and record it for this commit batch.
    Reference<BlobCipherKey> latestKey = wait(getLatestEncryptCipherKey(
        self->pProxyCommitData->db, domainId, domainDetails.first, BlobCipherMetrics::TLOG));
    self->cipherKeys[domainId] = latestKey;

    CODE_PROBE(true, "Raw access mutation encryption");
    ASSERT_NE(domainId, INVALID_ENCRYPT_DOMAIN_ID);

    encryptedMutation = mutation->encrypt(self->cipherKeys, domainId, *arena, BlobCipherMetrics::TLOG);
    self->toCommit.writeTypedMessage(encryptedMutation);
    return encryptedMutation;
}
Future<WriteMutationRefVar> writeMutation(CommitBatchContext* self,
int64_t tenantId,
const MutationRef* mutation,
Optional<MutationRef>* encryptedMutationOpt,
Arena* arena) {
static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID); static_assert(TenantInfo::INVALID_TENANT == INVALID_ENCRYPT_DOMAIN_ID);
// The writeMutation routine is responsible for appending mutations to be persisted in the TLog. The
// operation isn't 'blocking', except for a few cases when encryption is supported by the cluster,
// such as:
// 1. Fetch encryption keys to encrypt the mutation.
// 2. Split ClearRange mutation to respect Encryption domain boundaries.
// 3. Ensure sanity of already encrypted mutation - simulation limited check.
//
// The approach optimizes the "fast" path by avoiding the alloc/dealloc overhead of the ACTOR
// framework; that penalty is paid iff any of the above conditions is met, in which case the
// corresponding ACTOR-compliant handler routine gets invoked ("slow path").
if (self->pProxyCommitData->isEncryptionEnabled) { if (self->pProxyCommitData->isEncryptionEnabled) {
state EncryptCipherDomainId domainId = tenantId; EncryptCipherDomainId domainId = tenantId;
state MutationRef encryptedMutation; MutationRef encryptedMutation;
CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::DISABLED, CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::DISABLED,
"using disabled tenant mode"); "using disabled tenant mode");
CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::OPTIONAL_TENANT, CODE_PROBE(self->pProxyCommitData->db->get().client.tenantMode == TenantMode::OPTIONAL_TENANT,
@ -1279,13 +1344,7 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
ASSERT(encryptedMutation.isEncrypted()); ASSERT(encryptedMutation.isEncrypted());
// During simulation check whether the encrypted mutation matches the decrypted mutation // During simulation check whether the encrypted mutation matches the decrypted mutation
if (g_network && g_network->isSimulated()) { if (g_network && g_network->isSimulated()) {
Reference<AsyncVar<ServerDBInfo> const> dbInfo = self->pProxyCommitData->db; return writeMutationEncryptedMutation(self, tenantId, mutation, encryptedMutationOpt, arena);
state const BlobCipherEncryptHeader* header = encryptedMutation.encryptionHeader();
TextAndHeaderCipherKeys cipherKeys =
wait(getEncryptCipherKeys(dbInfo, *header, BlobCipherMetrics::TLOG));
MutationRef decryptedMutation = encryptedMutation.decrypt(cipherKeys, *arena, BlobCipherMetrics::TLOG);
ASSERT(decryptedMutation.param1 == mutation->param1 && decryptedMutation.param2 == mutation->param2 &&
decryptedMutation.type == mutation->type);
} }
} else { } else {
if (domainId == INVALID_ENCRYPT_DOMAIN_ID) { if (domainId == INVALID_ENCRYPT_DOMAIN_ID) {
@ -1294,9 +1353,7 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
domainId = p.second; domainId = p.second;
if (self->cipherKeys.find(domainId) == self->cipherKeys.end()) { if (self->cipherKeys.find(domainId) == self->cipherKeys.end()) {
Reference<BlobCipherKey> cipherKey = wait(getLatestEncryptCipherKey( return writeMutationFetchEncryptKey(self, tenantId, mutation, arena);
self->pProxyCommitData->db, domainId, p.first, BlobCipherMetrics::TLOG));
self->cipherKeys[domainId] = cipherKey;
} }
CODE_PROBE(true, "Raw access mutation encryption"); CODE_PROBE(true, "Raw access mutation encryption");
@ -1308,10 +1365,10 @@ ACTOR Future<MutationRef> writeMutation(CommitBatchContext* self,
ASSERT(encryptedMutation.isEncrypted()); ASSERT(encryptedMutation.isEncrypted());
CODE_PROBE(true, "encrypting non-metadata mutations"); CODE_PROBE(true, "encrypting non-metadata mutations");
self->toCommit.writeTypedMessage(encryptedMutation); self->toCommit.writeTypedMessage(encryptedMutation);
return encryptedMutation; return std::variant<MutationRef, VectorRef<MutationRef>>{ encryptedMutation };
} else { } else {
self->toCommit.writeTypedMessage(*mutation); self->toCommit.writeTypedMessage(*mutation);
return *mutation; return std::variant<MutationRef, VectorRef<MutationRef>>{ *mutation };
} }
} }
@ -1399,8 +1456,10 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
if (encryptedMutation.present()) { if (encryptedMutation.present()) {
ASSERT(encryptedMutation.get().isEncrypted()); ASSERT(encryptedMutation.get().isEncrypted());
} }
MutationRef tempMutation = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena)); WriteMutationRefVar var = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena));
writtenMutation = tempMutation; // FIXME: Remove assert once ClearRange RAW_ACCESS usecase handling is done
ASSERT(std::holds_alternative<MutationRef>(var));
writtenMutation = std::get<MutationRef>(var);
} else if (m.type == MutationRef::ClearRange) { } else if (m.type == MutationRef::ClearRange) {
KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2));
auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange); auto ranges = pProxyCommitData->keyInfo.intersectingRanges(clearRange);
@ -1453,8 +1512,10 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
if (pProxyCommitData->needsCacheTag(clearRange)) { if (pProxyCommitData->needsCacheTag(clearRange)) {
self->toCommit.addTag(cacheTag); self->toCommit.addTag(cacheTag);
} }
MutationRef tempMutation = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena)); WriteMutationRefVar var = wait(writeMutation(self, tenantId, &m, &encryptedMutation, &arena));
writtenMutation = tempMutation; // FIXME: Remove assert once ClearRange RAW_ACCESS usecase handling is done
ASSERT(std::holds_alternative<MutationRef>(var));
writtenMutation = std::get<MutationRef>(var);
} else { } else {
UNREACHABLE(); UNREACHABLE();
} }
@ -1505,8 +1566,8 @@ ACTOR Future<Void> assignMutationsToStorageServers(CommitBatchContext* self) {
MutationRef backupMutation( MutationRef backupMutation(
MutationRef::Type::ClearRange, intersectionRange.begin, intersectionRange.end); MutationRef::Type::ClearRange, intersectionRange.begin, intersectionRange.end);
// TODO (Nim): Currently clear ranges are encrypted using the default encryption key, this must be // TODO (Nim): Currently clear ranges are encrypted using the default encryption key, this must
// changed to account for clear ranges which span tenant boundaries // be changed to account for clear ranges which span tenant boundaries
if (self->pProxyCommitData->isEncryptionEnabled) { if (self->pProxyCommitData->isEncryptionEnabled) {
CODE_PROBE(true, "encrypting clear range backup mutation"); CODE_PROBE(true, "encrypting clear range backup mutation");
if (backupMutation.param1 == m.param1 && backupMutation.param2 == m.param2 && if (backupMutation.param1 == m.param1 && backupMutation.param2 == m.param2 &&
@ -1627,9 +1688,8 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
CODE_PROBE(true, "encrypting idempotency mutation"); CODE_PROBE(true, "encrypting idempotency mutation");
std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p = std::pair<EncryptCipherDomainName, EncryptCipherDomainId> p =
getEncryptDetailsFromMutationRef(self->pProxyCommitData, idempotencyIdSet); getEncryptDetailsFromMutationRef(self->pProxyCommitData, idempotencyIdSet);
Arena arena;
MutationRef encryptedMutation = idempotencyIdSet.encrypt( MutationRef encryptedMutation = idempotencyIdSet.encrypt(
self->cipherKeys, p.second, arena, BlobCipherMetrics::TLOG); self->cipherKeys, p.second, self->arena, BlobCipherMetrics::TLOG);
self->toCommit.writeTypedMessage(encryptedMutation); self->toCommit.writeTypedMessage(encryptedMutation);
} else { } else {
self->toCommit.writeTypedMessage(idempotencyIdSet); self->toCommit.writeTypedMessage(idempotencyIdSet);
@ -1637,11 +1697,13 @@ ACTOR Future<Void> postResolution(CommitBatchContext* self) {
}); });
state int i = 0; state int i = 0;
for (i = 0; i < pProxyCommitData->idempotencyClears.size(); i++) { for (i = 0; i < pProxyCommitData->idempotencyClears.size(); i++) {
MutationRef& m = pProxyCommitData->idempotencyClears[i]; auto& tags = pProxyCommitData->tagsForKey(pProxyCommitData->idempotencyClears[i].param1);
auto& tags = pProxyCommitData->tagsForKey(m.param1);
self->toCommit.addTags(tags); self->toCommit.addTags(tags);
Arena arena; // We already have an arena with an appropriate lifetime handy
wait(success(writeMutation(self, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, &m, nullptr, &arena))); Arena& arena = pProxyCommitData->idempotencyClears.arena();
WriteMutationRefVar var = wait(writeMutation(
self, SYSTEM_KEYSPACE_ENCRYPT_DOMAIN_ID, &pProxyCommitData->idempotencyClears[i], nullptr, &arena));
ASSERT(std::holds_alternative<MutationRef>(var));
} }
pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>(); pProxyCommitData->idempotencyClears = Standalone<VectorRef<MutationRef>>();

View File

@ -129,6 +129,7 @@ ACTOR Future<bool> getKeyServers(
// one needs to be reachable // one needs to be reachable
if (performQuiescentChecks && !shards.present()) { if (performQuiescentChecks && !shards.present()) {
TraceEvent("ConsistencyCheck_CommitProxyUnavailable") TraceEvent("ConsistencyCheck_CommitProxyUnavailable")
.error(shards.getError())
.detail("CommitProxyID", commitProxyInfo->getId(i)); .detail("CommitProxyID", commitProxyInfo->getId(i));
testFailure("Commit proxy unavailable", performQuiescentChecks, true); testFailure("Commit proxy unavailable", performQuiescentChecks, true);
return false; return false;

View File

@ -1066,7 +1066,7 @@ private:
}; };
uint64_t seq; // seq is the index of the virtually infinite disk queue file. Its unit is bytes. uint64_t seq; // seq is the index of the virtually infinite disk queue file. Its unit is bytes.
uint64_t popped; uint64_t popped;
int payloadSize; int32_t payloadSize;
}; };
// The on disk format depends on the size of PageHeader. // The on disk format depends on the size of PageHeader.
static_assert(sizeof(PageHeader) == 36, "PageHeader must be 36 bytes"); static_assert(sizeof(PageHeader) == 36, "PageHeader must be 36 bytes");
@ -1703,4 +1703,4 @@ TEST_CASE("performance/fdbserver/DiskQueue") {
queue->dispose(); queue->dispose();
wait(queue->onClosed()); wait(queue->onClosed());
return Void(); return Void();
} }

View File

@ -102,7 +102,7 @@ void GrvProxyTagThrottler::addRequest(GetReadVersionRequest const& req) {
// SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be // SERVER_KNOBS->ENFORCE_TAG_THROTTLING_ON_PROXIES is enabled, there may be
// unexpected behaviour, because only one tag is used for throttling. // unexpected behaviour, because only one tag is used for throttling.
TraceEvent(SevWarnAlways, "GrvProxyTagThrottler_MultipleTags") TraceEvent(SevWarnAlways, "GrvProxyTagThrottler_MultipleTags")
.suppressFor(1.0) .suppressFor(60.0)
.detail("NumTags", req.tags.size()) .detail("NumTags", req.tags.size())
.detail("UsingTag", printable(tag)); .detail("UsingTag", printable(tag));
} }

View File

@ -397,13 +397,23 @@ struct Counters {
}; };
struct ReadIterator { struct ReadIterator {
CF& cf;
uint64_t index; // incrementing counter to uniquely identify read iterator. uint64_t index; // incrementing counter to uniquely identify read iterator.
bool inUse; bool inUse;
std::shared_ptr<rocksdb::Iterator> iter; std::shared_ptr<rocksdb::Iterator> iter;
double creationTime; double creationTime;
KeyRange keyRange;
std::shared_ptr<rocksdb::Slice> beginSlice, endSlice;
ReadIterator(CF& cf, uint64_t index, DB& db, rocksdb::ReadOptions& options) ReadIterator(CF& cf, uint64_t index, DB& db, rocksdb::ReadOptions& options)
: cf(cf), index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {} : index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
ReadIterator(CF& cf, uint64_t index, DB& db, rocksdb::ReadOptions options, KeyRange keyRange)
: index(index), inUse(true), creationTime(now()), keyRange(keyRange) {
beginSlice = std::shared_ptr<rocksdb::Slice>(new rocksdb::Slice(toSlice(keyRange.begin)));
options.iterate_lower_bound = beginSlice.get();
endSlice = std::shared_ptr<rocksdb::Slice>(new rocksdb::Slice(toSlice(keyRange.end)));
options.iterate_upper_bound = endSlice.get();
iter = std::shared_ptr<rocksdb::Iterator>(db->NewIterator(options, cf));
}
}; };
/* /*
@ -426,42 +436,84 @@ public:
readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0); readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0);
TraceEvent("ReadIteratorPool", id) TraceEvent("ReadIteratorPool", id)
.detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) .detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS)
.detail("KnobRocksDBReadRangeReuseBoundedIterators",
SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS)
.detail("KnobRocksDBReadRangeBoundedIteratorsMaxLimit",
SERVER_KNOBS->ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT)
.detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN); .detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN);
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS &&
SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS) {
TraceEvent(SevWarn, "ReadIteratorKnobsMismatch");
}
} }
// Called on every db commit. // Called on every db commit.
void update() { void update() {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS ||
SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS) {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
iteratorsMap.clear(); iteratorsMap.clear();
} }
} }
// Called on every read operation. // Called on every read operation.
ReadIterator getIterator() { ReadIterator getIterator(KeyRange keyRange) {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
std::lock_guard<std::mutex> lock(mutex); mutex.lock();
for (it = iteratorsMap.begin(); it != iteratorsMap.end(); it++) { for (it = iteratorsMap.begin(); it != iteratorsMap.end(); it++) {
if (!it->second.inUse) { if (!it->second.inUse) {
it->second.inUse = true; it->second.inUse = true;
iteratorsReuseCount++; iteratorsReuseCount++;
return it->second; ReadIterator iter = it->second;
mutex.unlock();
return iter;
} }
} }
index++; index++;
ReadIterator iter(cf, index, db, readRangeOptions); uint64_t readIteratorIndex = index;
iteratorsMap.insert({ index, iter }); mutex.unlock();
ReadIterator iter(cf, readIteratorIndex, db, readRangeOptions);
mutex.lock();
iteratorsMap.insert({ readIteratorIndex, iter });
mutex.unlock();
return iter;
} else if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS) {
// TODO: Based on the datasize in the keyrange, decide whether to store the iterator for reuse.
mutex.lock();
for (it = iteratorsMap.begin(); it != iteratorsMap.end(); it++) {
if (!it->second.inUse && it->second.keyRange.contains(keyRange)) {
it->second.inUse = true;
iteratorsReuseCount++;
ReadIterator iter = it->second;
mutex.unlock();
return iter;
}
}
index++;
uint64_t readIteratorIndex = index;
mutex.unlock();
ReadIterator iter(cf, readIteratorIndex, db, readRangeOptions, keyRange);
if (iteratorsMap.size() < SERVER_KNOBS->ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT) {
// Not storing more than ROCKSDB_READ_RANGE_BOUNDED_ITERATORS_MAX_LIMIT of iterators
// to avoid 'out of memory' issues.
mutex.lock();
iteratorsMap.insert({ readIteratorIndex, iter });
mutex.unlock();
}
return iter; return iter;
} else { } else {
index++; index++;
ReadIterator iter(cf, index, db, readRangeOptions); ReadIterator iter(cf, index, db, readRangeOptions, keyRange);
return iter; return iter;
} }
} }
// Called on every read operation, after the keys are collected. // Called on every read operation, after the keys are collected.
void returnIterator(ReadIterator& iter) { void returnIterator(ReadIterator& iter) {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS ||
SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS) {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
it = iteratorsMap.find(iter.index); it = iteratorsMap.find(iter.index);
// iterator found: put the iterator back to the pool(inUse=false). // iterator found: put the iterator back to the pool(inUse=false).
@ -768,7 +820,7 @@ uint64_t PerfContextMetrics::getRocksdbPerfcontextMetric(int metric) {
} }
ACTOR Future<Void> refreshReadIteratorPool(std::shared_ptr<ReadIteratorPool> readIterPool) { ACTOR Future<Void> refreshReadIteratorPool(std::shared_ptr<ReadIteratorPool> readIterPool) {
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS || SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_BOUNDED_ITERATORS) {
loop { loop {
wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME)); wait(delay(SERVER_KNOBS->ROCKSDB_READ_RANGE_ITERATOR_REFRESH_TIME));
readIterPool->refreshIterators(); readIterPool->refreshIterators();
@ -1559,7 +1611,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
rocksdb::Status s; rocksdb::Status s;
if (a.rowLimit >= 0) { if (a.rowLimit >= 0) {
double iterCreationBeginTime = a.getHistograms ? timer_monotonic() : 0; double iterCreationBeginTime = a.getHistograms ? timer_monotonic() : 0;
ReadIterator readIter = readIterPool->getIterator(); ReadIterator readIter = readIterPool->getIterator(a.keys);
if (a.getHistograms) { if (a.getHistograms) {
metricPromiseStream->send(std::make_pair(ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM.toString(), metricPromiseStream->send(std::make_pair(ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM.toString(),
timer_monotonic() - iterCreationBeginTime)); timer_monotonic() - iterCreationBeginTime));
@ -1588,7 +1640,7 @@ struct RocksDBKeyValueStore : IKeyValueStore {
readIterPool->returnIterator(readIter); readIterPool->returnIterator(readIter);
} else { } else {
double iterCreationBeginTime = a.getHistograms ? timer_monotonic() : 0; double iterCreationBeginTime = a.getHistograms ? timer_monotonic() : 0;
ReadIterator readIter = readIterPool->getIterator(); ReadIterator readIter = readIterPool->getIterator(a.keys);
if (a.getHistograms) { if (a.getHistograms) {
metricPromiseStream->send(std::make_pair(ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM.toString(), metricPromiseStream->send(std::make_pair(ROCKSDB_READRANGE_NEWITERATOR_HISTOGRAM.toString(),
timer_monotonic() - iterCreationBeginTime)); timer_monotonic() - iterCreationBeginTime));

View File

@ -155,7 +155,7 @@ struct ShardedRocksDBState {
std::shared_ptr<rocksdb::Cache> rocksdb_block_cache = nullptr; std::shared_ptr<rocksdb::Cache> rocksdb_block_cache = nullptr;
rocksdb::Slice toSlice(StringRef s) { const rocksdb::Slice toSlice(StringRef s) {
return rocksdb::Slice(reinterpret_cast<const char*>(s.begin()), s.size()); return rocksdb::Slice(reinterpret_cast<const char*>(s.begin()), s.size());
} }
@ -309,8 +309,20 @@ struct ReadIterator {
bool inUse; bool inUse;
std::shared_ptr<rocksdb::Iterator> iter; std::shared_ptr<rocksdb::Iterator> iter;
double creationTime; double creationTime;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, rocksdb::ReadOptions& options) KeyRange keyRange;
std::shared_ptr<rocksdb::Slice> beginSlice, endSlice;
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, const rocksdb::ReadOptions& options)
: index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {} : index(index), inUse(true), creationTime(now()), iter(db->NewIterator(options, cf)) {}
ReadIterator(rocksdb::ColumnFamilyHandle* cf, uint64_t index, rocksdb::DB* db, const KeyRange& range)
: index(index), inUse(true), creationTime(now()), keyRange(range) {
auto options = getReadOptions();
beginSlice = std::shared_ptr<rocksdb::Slice>(new rocksdb::Slice(toSlice(keyRange.begin)));
options.iterate_lower_bound = beginSlice.get();
endSlice = std::shared_ptr<rocksdb::Slice>(new rocksdb::Slice(toSlice(keyRange.end)));
options.iterate_upper_bound = endSlice.get();
iter = std::shared_ptr<rocksdb::Iterator>(db->NewIterator(options, cf));
}
}; };
/* /*
@ -348,7 +360,8 @@ public:
} }
// Called on every read operation. // Called on every read operation.
ReadIterator getIterator() { ReadIterator getIterator(const KeyRange& range) {
// Shared iterators are not bounded.
if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) { if (SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) {
std::lock_guard<std::mutex> lock(mutex); std::lock_guard<std::mutex> lock(mutex);
for (it = iteratorsMap.begin(); it != iteratorsMap.end(); it++) { for (it = iteratorsMap.begin(); it != iteratorsMap.end(); it++) {
@ -364,7 +377,7 @@ public:
return iter; return iter;
} else { } else {
index++; index++;
ReadIterator iter(cf, index, db, readRangeOptions); ReadIterator iter(cf, index, db, range);
return iter; return iter;
} }
} }
@ -511,7 +524,7 @@ struct PhysicalShard {
double deleteTimeSec; double deleteTimeSec;
}; };
int readRangeInDb(PhysicalShard* shard, const KeyRangeRef& range, int rowLimit, int byteLimit, RangeResult* result) { int readRangeInDb(PhysicalShard* shard, const KeyRangeRef range, int rowLimit, int byteLimit, RangeResult* result) {
if (rowLimit == 0 || byteLimit == 0) { if (rowLimit == 0 || byteLimit == 0) {
return 0; return 0;
} }
@ -523,7 +536,7 @@ int readRangeInDb(PhysicalShard* shard, const KeyRangeRef& range, int rowLimit,
// When using a prefix extractor, ensure that keys are returned in order even if they cross // When using a prefix extractor, ensure that keys are returned in order even if they cross
// a prefix boundary. // a prefix boundary.
if (rowLimit >= 0) { if (rowLimit >= 0) {
ReadIterator readIter = shard->readIterPool->getIterator(); ReadIterator readIter = shard->readIterPool->getIterator(range);
auto cursor = readIter.iter; auto cursor = readIter.iter;
cursor->Seek(toSlice(range.begin)); cursor->Seek(toSlice(range.begin));
while (cursor->Valid() && toStringRef(cursor->key()) < range.end) { while (cursor->Valid() && toStringRef(cursor->key()) < range.end) {
@ -540,7 +553,7 @@ int readRangeInDb(PhysicalShard* shard, const KeyRangeRef& range, int rowLimit,
s = cursor->status(); s = cursor->status();
shard->readIterPool->returnIterator(readIter); shard->readIterPool->returnIterator(readIter);
} else { } else {
ReadIterator readIter = shard->readIterPool->getIterator(); ReadIterator readIter = shard->readIterPool->getIterator(range);
auto cursor = readIter.iter; auto cursor = readIter.iter;
cursor->SeekForPrev(toSlice(range.end)); cursor->SeekForPrev(toSlice(range.end));
if (cursor->Valid() && toStringRef(cursor->key()) == range.end) { if (cursor->Valid() && toStringRef(cursor->key()) == range.end) {
@ -2150,10 +2163,16 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore {
: keys(keys), rowLimit(rowLimit), byteLimit(byteLimit), startTime(timer_monotonic()), : keys(keys), rowLimit(rowLimit), byteLimit(byteLimit), startTime(timer_monotonic()),
getHistograms( getHistograms(
(deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) { (deterministicRandom()->random01() < SERVER_KNOBS->ROCKSDB_HISTOGRAMS_SAMPLE_RATE) ? true : false) {
std::set<PhysicalShard*> usedShards;
for (const DataShard* shard : shards) { for (const DataShard* shard : shards) {
if (shard != nullptr) { ASSERT(shard);
shardRanges.emplace_back(shard->physicalShard, keys & shard->range); shardRanges.emplace_back(shard->physicalShard, keys & shard->range);
} usedShards.insert(shard->physicalShard);
}
if (usedShards.size() != shards.size()) {
TraceEvent("ReadRangeMetrics")
.detail("NumPhysicalShards", usedShards.size())
.detail("NumDataShards", shards.size());
} }
} }
double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; } double getTimeEstimate() const override { return SERVER_KNOBS->READ_RANGE_TIME_ESTIMATE; }

View File

@ -514,35 +514,38 @@ Future<Void> logRouterPeekMessages(PromiseType replyPromise,
} }
state double startTime = now(); state double startTime = now();
state Version poppedVer;
Version poppedVer = poppedVersion(self, reqTag);
if (poppedVer > reqBegin || reqBegin < self->startVersion) {
// This should only happen if a packet is sent multiple times and the reply is not needed.
// Since we are using popped differently, do not send a reply.
TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid)
.detail("Begin", reqBegin)
.detail("Popped", poppedVer)
.detail("Start", self->startVersion);
if (std::is_same<PromiseType, Promise<TLogPeekReply>>::value) {
// kills logRouterPeekStream actor, otherwise that actor becomes stuck
throw operation_obsolete();
}
replyPromise.send(Never());
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
state Version endVersion; state Version endVersion;
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may // Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may
// want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob. // want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob.
loop { loop {
poppedVer = poppedVersion(self, reqTag);
if (poppedVer > reqBegin || reqBegin < self->startVersion) {
// This should only happen if a packet is sent multiple times and the reply is not needed.
// Since we are using popped differently, do not send a reply.
TraceEvent(SevWarnAlways, "LogRouterPeekPopped", self->dbgid)
.detail("Begin", reqBegin)
.detail("Popped", poppedVer)
.detail("Start", self->startVersion);
if (std::is_same<PromiseType, Promise<TLogPeekReply>>::value) {
// kills logRouterPeekStream actor, otherwise that actor becomes stuck
throw operation_obsolete();
}
replyPromise.send(Never());
if (reqSequence.present()) {
auto& trackerData = self->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
if (!sequenceData.isSet()) {
sequenceData.send(std::make_pair(reqBegin, reqOnlySpilled));
}
}
return Void();
}
ASSERT(reqBegin >= poppedVersion(self, reqTag) && reqBegin >= self->startVersion);
endVersion = self->version.get() + 1; endVersion = self->version.get() + 1;
peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion); peekMessagesFromMemory(self, reqTag, reqBegin, messages, endVersion);

View File

@ -656,7 +656,7 @@ ACTOR Future<int64_t> getVersionOffset(Database cx,
ACTOR Future<Void> repairDeadDatacenter(Database cx, ACTOR Future<Void> repairDeadDatacenter(Database cx,
Reference<AsyncVar<ServerDBInfo> const> dbInfo, Reference<AsyncVar<ServerDBInfo> const> dbInfo,
std::string context) { std::string context) {
if (g_network->isSimulated() && g_simulator->usableRegions > 1) { if (g_network->isSimulated() && g_simulator->usableRegions > 1 && !g_simulator->quiesced) {
bool primaryDead = g_simulator->datacenterDead(g_simulator->primaryDcId); bool primaryDead = g_simulator->datacenterDead(g_simulator->primaryDcId);
bool remoteDead = g_simulator->datacenterDead(g_simulator->remoteDcId); bool remoteDead = g_simulator->datacenterDead(g_simulator->remoteDcId);

View File

@ -307,7 +307,7 @@ JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
// map from machine networkAddress to datacenter ID // map from machine networkAddress to datacenter ID
std::map<NetworkAddress, std::string> dcIds; std::map<NetworkAddress, std::string> dcIds;
std::map<NetworkAddress, LocalityData> locality; std::map<NetworkAddress, LocalityData> locality;
std::map<std::string, bool> notExcludedMap; std::map<std::string, bool> excludedMap;
std::map<std::string, int32_t> workerContribMap; std::map<std::string, int32_t> workerContribMap;
std::map<std::string, JsonBuilderObject> machineJsonMap; std::map<std::string, JsonBuilderObject> machineJsonMap;
@ -377,7 +377,7 @@ JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
statusObj["network"] = networkObj; statusObj["network"] = networkObj;
if (configuration.present()) { if (configuration.present()) {
notExcludedMap[machineId] = excludedMap[machineId] =
true; // Will be set to false below if this or any later process is not excluded true; // Will be set to false below if this or any later process is not excluded
} }
@ -385,18 +385,21 @@ JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
machineJsonMap[machineId] = statusObj; machineJsonMap[machineId] = statusObj;
} }
// FIXME: this will not catch if the secondary address of the process was excluded
NetworkAddressList tempList; NetworkAddressList tempList;
tempList.address = it->first; tempList.address = it->first;
bool excludedServer = false; bool excludedServer = true;
bool excludedLocality = false; bool excludedLocality = true;
if (configuration.present() && configuration.get().isExcludedServer(tempList)) if (configuration.present() && !configuration.get().isExcludedServer(tempList))
excludedServer = true; excludedServer = false;
if (locality.count(it->first) && configuration.present() && if (locality.count(it->first) && configuration.present() &&
configuration.get().isMachineExcluded(locality[it->first])) !configuration.get().isMachineExcluded(locality[it->first]))
excludedLocality = true; excludedLocality = false;
notExcludedMap[machineId] = excludedServer || excludedLocality; // If any server is not excluded, set the overall exclusion status
// of the machine to false.
if (!excludedServer && !excludedLocality) {
excludedMap[machineId] = false;
}
workerContribMap[machineId]++; workerContribMap[machineId]++;
} catch (Error&) { } catch (Error&) {
++failed; ++failed;
@ -407,7 +410,7 @@ JsonBuilderObject machineStatusFetcher(WorkerEvents mMetrics,
for (auto& mapPair : machineJsonMap) { for (auto& mapPair : machineJsonMap) {
auto& machineId = mapPair.first; auto& machineId = mapPair.first;
auto& jsonItem = machineJsonMap[machineId]; auto& jsonItem = machineJsonMap[machineId];
jsonItem["excluded"] = notExcludedMap[machineId]; jsonItem["excluded"] = excludedMap[machineId];
jsonItem["contributing_workers"] = workerContribMap[machineId]; jsonItem["contributing_workers"] = workerContribMap[machineId];
machineMap[machineId] = jsonItem; machineMap[machineId] = jsonItem;
} }
@ -781,6 +784,9 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
// Map the address of the worker to the error message object // Map the address of the worker to the error message object
tracefileOpenErrorMap[traceFileErrorsItr->first.toString()] = msgObj; tracefileOpenErrorMap[traceFileErrorsItr->first.toString()] = msgObj;
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert("file_open_error details could not be retrieved"); incomplete_reasons->insert("file_open_error details could not be retrieved");
} }
} }
@ -1095,6 +1101,9 @@ ACTOR static Future<JsonBuilderObject> processStatusFetcher(
} }
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
// Something strange occurred, process list is incomplete but what was built so far, if anything, will be // Something strange occurred, process list is incomplete but what was built so far, if anything, will be
// returned. // returned.
incomplete_reasons->insert("Cannot retrieve all process status information."); incomplete_reasons->insert("Cannot retrieve all process status information.");
@ -1410,6 +1419,9 @@ ACTOR static Future<JsonBuilderObject> latencyProbeFetcher(Database cx,
wait(waitForAll(probes)); wait(waitForAll(probes));
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert(format("Unable to retrieve latency probe information (%s).", e.what())); incomplete_reasons->insert(format("Unable to retrieve latency probe information (%s).", e.what()));
} }
@ -1449,6 +1461,9 @@ ACTOR static Future<Void> consistencyCheckStatusFetcher(Database cx,
} }
} }
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert(format("Unable to retrieve consistency check settings (%s).", e.what())); incomplete_reasons->insert(format("Unable to retrieve consistency check settings (%s).", e.what()));
} }
return Void(); return Void();
@ -1540,6 +1555,9 @@ ACTOR static Future<Void> logRangeWarningFetcher(Database cx,
} }
} }
} catch (Error& e) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert(format("Unable to retrieve log ranges (%s).", e.what())); incomplete_reasons->insert(format("Unable to retrieve log ranges (%s).", e.what()));
} }
return Void(); return Void();
@ -1713,7 +1731,10 @@ static JsonBuilderObject configurationFetcher(Optional<DatabaseConfiguration> co
} }
int count = coordinators.clientLeaderServers.size(); int count = coordinators.clientLeaderServers.size();
statusObj["coordinators_count"] = count; statusObj["coordinators_count"] = count;
} catch (Error&) { } catch (Error& e) {
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert("Could not retrieve all configuration status information."); incomplete_reasons->insert("Could not retrieve all configuration status information.");
} }
return statusObj; return statusObj;
@ -2735,6 +2756,9 @@ ACTOR Future<JsonBuilderObject> layerStatusFetcher(Database cx,
} }
} catch (Error& e) { } catch (Error& e) {
TraceEvent(SevWarn, "LayerStatusError").error(e); TraceEvent(SevWarn, "LayerStatusError").error(e);
if (e.code() == error_code_actor_cancelled) {
throw;
}
incomplete_reasons->insert(format("Unable to retrieve layer status (%s).", e.what())); incomplete_reasons->insert(format("Unable to retrieve layer status (%s).", e.what()));
json.create("_error") = format("Unable to retrieve layer status (%s).", e.what()); json.create("_error") = format("Unable to retrieve layer status (%s).", e.what());
json.create("_valid") = false; json.create("_valid") = false;

View File

@ -45,6 +45,8 @@
#include "fdbserver/FDBExecHelper.actor.h" #include "fdbserver/FDBExecHelper.actor.h"
#include "flow/Histogram.h" #include "flow/Histogram.h"
#include "flow/DebugTrace.h" #include "flow/DebugTrace.h"
#include "flow/genericactors.actor.h"
#include "flow/network.h"
#include "flow/actorcompiler.h" // This must be the last #include. #include "flow/actorcompiler.h" // This must be the last #include.
struct TLogQueueEntryRef { struct TLogQueueEntryRef {
@ -216,6 +218,8 @@ static const KeyRange persistTagMessagesKeys = prefixRange("TagMsg/"_sr);
static const KeyRange persistTagMessageRefsKeys = prefixRange("TagMsgRef/"_sr); static const KeyRange persistTagMessageRefsKeys = prefixRange("TagMsgRef/"_sr);
static const KeyRange persistTagPoppedKeys = prefixRange("TagPop/"_sr); static const KeyRange persistTagPoppedKeys = prefixRange("TagPop/"_sr);
static const KeyRef persistEncryptionAtRestModeKey = "encryptionAtRestMode"_sr;
static Key persistTagMessagesKey(UID id, Tag tag, Version version) { static Key persistTagMessagesKey(UID id, Tag tag, Version version) {
BinaryWriter wr(Unversioned()); BinaryWriter wr(Unversioned());
wr.serializeBytes(persistTagMessagesKeys.begin); wr.serializeBytes(persistTagMessagesKeys.begin);
@ -306,6 +310,8 @@ struct TLogData : NonCopyable {
UID dbgid; UID dbgid;
UID workerID; UID workerID;
Optional<EncryptionAtRestMode> encryptionAtRestMode;
IKeyValueStore* persistentData; // Durable data on disk that were spilled. IKeyValueStore* persistentData; // Durable data on disk that were spilled.
IDiskQueue* rawPersistentQueue; // The physical queue the persistentQueue below stores its data. Ideally, log IDiskQueue* rawPersistentQueue; // The physical queue the persistentQueue below stores its data. Ideally, log
// interface should work without directly accessing rawPersistentQueue // interface should work without directly accessing rawPersistentQueue
@ -1796,75 +1802,77 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
} }
state double workStart = now(); state double workStart = now();
state Version poppedVer;
state Version poppedVer = poppedVersion(logData, reqTag);
auto tagData = logData->getTagData(reqTag);
bool tagRecovered = tagData && !tagData->unpoppedRecovered;
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin &&
reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 &&
!reqReturnIfBlocked && tagRecovered) {
state double startTime = now();
// TODO (version vector) check if this should be included in "status details" json
// TODO (version vector) all tags may be too many, instead, standard deviation?
wait(waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT));
double latency = now() - startTime;
if (logData->blockingPeekLatencies.find(reqTag) == logData->blockingPeekLatencies.end()) {
UID ssID = nondeterministicRandom()->randomUniqueID();
std::string s = "BlockingPeekLatencies-" + reqTag.toString();
logData->blockingPeekLatencies.try_emplace(
reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE);
}
LatencySample& sample = logData->blockingPeekLatencies.at(reqTag);
sample.addMeasurement(latency);
poppedVer = poppedVersion(logData, reqTag);
}
DebugLogTraceEvent("TLogPeekMessages2", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("PoppedVer", poppedVer);
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
rep.popped = poppedVer;
rep.end = poppedVer;
rep.onlySpilled = false;
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
trackerData.lastUpdate = now();
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
}
rep.begin = reqBegin;
}
replyPromise.send(rep);
return Void();
}
state Version endVersion; state Version endVersion;
state bool onlySpilled; state bool onlySpilled;
// Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may // Run the peek logic in a loop to account for the case where there is no data to return to the caller, and we may
// want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob. // want to wait a little bit instead of just sending back an empty message. This feature is controlled by a knob.
loop { loop {
poppedVer = poppedVersion(logData, reqTag);
auto tagData = logData->getTagData(reqTag);
bool tagRecovered = tagData && !tagData->unpoppedRecovered;
if (SERVER_KNOBS->ENABLE_VERSION_VECTOR && poppedVer <= reqBegin &&
reqBegin > logData->persistentDataDurableVersion && !reqOnlySpilled && reqTag.locality >= 0 &&
!reqReturnIfBlocked && tagRecovered) {
state double startTime = now();
// TODO (version vector) check if this should be included in "status details" json
// TODO (version vector) all tags may be too many, instead, standard deviation?
wait(waitForMessagesForTag(logData, reqTag, reqBegin, SERVER_KNOBS->BLOCKING_PEEK_TIMEOUT));
double latency = now() - startTime;
if (logData->blockingPeekLatencies.find(reqTag) == logData->blockingPeekLatencies.end()) {
UID ssID = nondeterministicRandom()->randomUniqueID();
std::string s = "BlockingPeekLatencies-" + reqTag.toString();
logData->blockingPeekLatencies.try_emplace(
reqTag, s, ssID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, SERVER_KNOBS->LATENCY_SAMPLE_SIZE);
}
LatencySample& sample = logData->blockingPeekLatencies.at(reqTag);
sample.addMeasurement(latency);
poppedVer = poppedVersion(logData, reqTag);
}
DebugLogTraceEvent("TLogPeekMessages2", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
.detail("ReqBegin", reqBegin)
.detail("PoppedVer", poppedVer);
if (poppedVer > reqBegin) {
TLogPeekReply rep;
rep.maxKnownVersion = logData->version.get();
rep.minKnownCommittedVersion = logData->minKnownCommittedVersion;
rep.popped = poppedVer;
rep.end = poppedVer;
rep.onlySpilled = false;
if (reqSequence.present()) {
auto& trackerData = logData->peekTracker[peekId];
auto& sequenceData = trackerData.sequence_version[sequence + 1];
trackerData.lastUpdate = now();
if (trackerData.sequence_version.size() && sequence + 1 < trackerData.sequence_version.begin()->first) {
replyPromise.sendError(operation_obsolete());
if (!sequenceData.isSet())
sequenceData.sendError(operation_obsolete());
return Void();
}
if (sequenceData.isSet()) {
if (sequenceData.getFuture().get().first != rep.end) {
CODE_PROBE(true, "tlog peek second attempt ended at a different version");
replyPromise.sendError(operation_obsolete());
return Void();
}
} else {
sequenceData.send(std::make_pair(rep.end, rep.onlySpilled));
}
rep.begin = reqBegin;
}
replyPromise.send(rep);
return Void();
}
ASSERT(reqBegin >= poppedVersion(logData, reqTag));
endVersion = logData->version.get() + 1; endVersion = logData->version.get() + 1;
onlySpilled = false; onlySpilled = false;
@ -2391,6 +2399,33 @@ ACTOR Future<Void> initPersistentState(TLogData* self, Reference<LogData> logDat
return Void(); return Void();
} }
// Asks the cluster controller interface published in self->dbInfo for the cluster's encryption-at-rest mode.
// If self->dbInfo changes before a reply arrives, the loop starts over and sends a new request.
// Returns the reported mode only when it is not DISABLED and SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION is set;
// otherwise returns a default-constructed EncryptionAtRestMode. Errors are traced and rethrown to the caller.
ACTOR Future<EncryptionAtRestMode> getEncryptionAtRestMode(TLogData* self) {
	loop {
		// Rebuilt on every pass of the loop, so each attempt goes out with its own request object.
		state GetEncryptionAtRestModeRequest req(self->dbgid);
		try {
			choose {
				when(wait(self->dbInfo->onChange())) {
					// Nothing to do here: leaving the choose block restarts the loop with a new request.
				}
				when(GetEncryptionAtRestModeResponse resp = wait(brokenPromiseToNever(
				         self->dbInfo->get().clusterInterface.getEncryptionAtRestMode.getReply(req)))) {
					TraceEvent("GetEncryptionAtRestMode", self->dbgid).detail("Mode", resp.mode);

					// The response carries the mode as a raw integer; convert it once and reuse it below.
					const auto reportedMode = static_cast<EncryptionAtRestMode::Mode>(resp.mode);

					// TODO: the ENABLE_TLOG_ENCRYPTION knob shall be removed; the db-config check should be
					// sufficient to determine tlog (and cluster) encryption status.
					const bool tlogEncryptionEnabled =
					    reportedMode != EncryptionAtRestMode::Mode::DISABLED && SERVER_KNOBS->ENABLE_TLOG_ENCRYPTION;
					if (!tlogEncryptionEnabled) {
						return EncryptionAtRestMode();
					}
					return EncryptionAtRestMode(reportedMode);
				}
			}
		} catch (Error& e) {
			// Record the failure for diagnostics; handling is left to the caller.
			TraceEvent("GetEncryptionAtRestError", self->dbgid).error(e);
			throw;
		}
	}
}
// send stopped promise instead of LogData* to avoid reference cycles // send stopped promise instead of LogData* to avoid reference cycles
ACTOR Future<Void> rejoinClusterController(TLogData* self, ACTOR Future<Void> rejoinClusterController(TLogData* self,
TLogInterface tli, TLogInterface tli,
@ -2579,6 +2614,32 @@ ACTOR Future<Void> tLogEnablePopReq(TLogEnablePopRequest enablePopReq, TLogData*
return Void(); return Void();
} }
// Reconciles the encryption-at-rest mode held by this TLogData with the mode obtained from
// getEncryptionAtRestMode().
//  - If self has no mode yet, the fetched mode is stored in self->encryptionAtRestMode and written to
//    self->persistentData under persistEncryptionAtRestModeKey, then committed.
//  - If self already has a mode, it must equal the fetched one; a mismatch is traced and then trips ASSERT(false).
ACTOR Future<Void> checkUpdateEncryptionAtRestMode(TLogData* self) {
	EncryptionAtRestMode clusterMode = wait(getEncryptionAtRestMode(self));

	if (!self->encryptionAtRestMode.present()) {
		// No mode recorded yet: adopt the cluster's mode and make it durable.
		self->encryptionAtRestMode = Optional<EncryptionAtRestMode>(clusterMode);

		// Hold the persistent-data commit lock for the duration of the set + commit below.
		wait(self->persistentDataCommitLock.take());
		state FlowLock::Releaser commitLockReleaser(self->persistentDataCommitLock);

		self->persistentData->set(
		    KeyValueRef(persistEncryptionAtRestModeKey, self->encryptionAtRestMode.get().toValue()));
		wait(self->persistentData->commit());

		TraceEvent("PersistEncryptionAtRestMode", self->dbgid)
		    .detail("Mode", self->encryptionAtRestMode.get().toString());
		return Void();
	}

	// A mode is already recorded: it has to match the cluster configuration. If it does not, the TLog process
	// is deliberately brought down, which prevents a fake TLog process from joining the cluster.
	if (self->encryptionAtRestMode.get() != clusterMode) {
		TraceEvent("EncryptionAtRestMismatch", self->dbgid)
		    .detail("Expected", clusterMode.toString())
		    .detail("Present", self->encryptionAtRestMode.get().toString());
		ASSERT(false);
	}
	return Void();
}
ACTOR Future<Void> serveTLogInterface(TLogData* self, ACTOR Future<Void> serveTLogInterface(TLogData* self,
TLogInterface tli, TLogInterface tli,
Reference<LogData> logData, Reference<LogData> logData,
@ -2966,6 +3027,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
state IKeyValueStore* storage = self->persistentData; state IKeyValueStore* storage = self->persistentData;
state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key); state Future<Optional<Value>> fFormat = storage->readValue(persistFormat.key);
state Future<Optional<Value>> fRecoveryLocation = storage->readValue(persistRecoveryLocationKey); state Future<Optional<Value>> fRecoveryLocation = storage->readValue(persistRecoveryLocationKey);
state Future<Optional<Value>> fEncryptionAtRestMode = storage->readValue(persistEncryptionAtRestModeKey);
state Future<RangeResult> fVers = storage->readRange(persistCurrentVersionKeys); state Future<RangeResult> fVers = storage->readRange(persistCurrentVersionKeys);
state Future<RangeResult> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys); state Future<RangeResult> fKnownCommitted = storage->readRange(persistKnownCommittedVersionKeys);
state Future<RangeResult> fLocality = storage->readRange(persistLocalityKeys); state Future<RangeResult> fLocality = storage->readRange(persistLocalityKeys);
@ -2977,7 +3039,7 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
// FIXME: metadata in queue? // FIXME: metadata in queue?
wait(waitForAll(std::vector{ fFormat, fRecoveryLocation })); wait(waitForAll(std::vector{ fFormat, fRecoveryLocation, fEncryptionAtRestMode }));
wait(waitForAll(std::vector{ fVers, wait(waitForAll(std::vector{ fVers,
fKnownCommitted, fKnownCommitted,
fLocality, fLocality,
@ -2987,6 +3049,12 @@ ACTOR Future<Void> restorePersistentState(TLogData* self,
fProtocolVersions, fProtocolVersions,
fTLogSpillTypes })); fTLogSpillTypes }));
if (fEncryptionAtRestMode.get().present()) {
self->encryptionAtRestMode =
Optional<EncryptionAtRestMode>(EncryptionAtRestMode::fromValue(fEncryptionAtRestMode.get()));
TraceEvent("PersistEncryptionAtRestModeRead").detail("Mode", self->encryptionAtRestMode.get().toString());
}
if (fFormat.get().present() && !persistFormatReadableRange.contains(fFormat.get().get())) { if (fFormat.get().present() && !persistFormatReadableRange.contains(fFormat.get().get())) {
// FIXME: remove when we no longer need to test upgrades from 4.X releases // FIXME: remove when we no longer need to test upgrades from 4.X releases
if (g_network->isSimulated()) { if (g_network->isSimulated()) {
@ -3537,11 +3605,13 @@ ACTOR Future<Void> tLog(IKeyValueStore* persistentData,
// Disk errors need a chance to kill this actor. // Disk errors need a chance to kill this actor.
wait(delay(0.000001)); wait(delay(0.000001));
if (recovered.canBeSet()) if (recovered.canBeSet()) {
recovered.send(Void()); recovered.send(Void());
}
self.sharedActors.send(commitQueue(&self)); self.sharedActors.send(commitQueue(&self));
self.sharedActors.send(updateStorageLoop(&self)); self.sharedActors.send(updateStorageLoop(&self));
self.sharedActors.send(checkUpdateEncryptionAtRestMode(&self));
self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId)); self.sharedActors.send(traceRole(Role::SHARED_TRANSACTION_LOG, tlogId));
state Future<Void> activeSharedChange = Void(); state Future<Void> activeSharedChange = Void();

View File

@ -1449,8 +1449,7 @@ void TagPartitionedLogSystem::pop(Version upTo, Tag tag, Version durableKnownCom
} }
if (prev == 0) { if (prev == 0) {
// pop tag from log upto version defined in outstandingPops[].first // pop tag from log upto version defined in outstandingPops[].first
popActors.add( popActors.add(popFromLog(this, log, tag, SERVER_KNOBS->POP_FROM_LOG_DELAY, /*popLogRouter=*/false));
popFromLog(this, log, tag, /*delayBeforePop*/ 1.0, /*popLogRouter=*/false)); //< FIXME: knob
} }
} }
} }

View File

@ -2149,7 +2149,7 @@ int main(int argc, char* argv[]) {
auto dataFolder = opts.dataFolder.size() ? opts.dataFolder : "simfdb"; auto dataFolder = opts.dataFolder.size() ? opts.dataFolder : "simfdb";
std::vector<std::string> directories = platform::listDirectories(dataFolder); std::vector<std::string> directories = platform::listDirectories(dataFolder);
const std::set<std::string> allowedDirectories = { ".", "..", "backups", "unittests" }; const std::set<std::string> allowedDirectories = { ".", "..", "backups", "unittests", "fdbblob" };
for (const auto& dir : directories) { for (const auto& dir : directories) {
if (dir.size() != 32 && allowedDirectories.count(dir) == 0 && dir.find("snap") == std::string::npos) { if (dir.size() != 32 && allowedDirectories.count(dir) == 0 && dir.find("snap") == std::string::npos) {

View File

@ -3400,6 +3400,12 @@ public:
excludedDegradedServers; // The degraded servers to be excluded when assigning workers to roles. excludedDegradedServers; // The degraded servers to be excluded when assigning workers to roles.
std::queue<double> recentHealthTriggeredRecoveryTime; std::queue<double> recentHealthTriggeredRecoveryTime;
// The cluster's encryption-at-rest mode; it is set only once, at the time of cluster creation.
// The promise is fulfilled as part of the cluster recovery process and is used by recovering stateful
// processes that participate in encryption (such as TLogs) to verify that their on-disk encryption status
// matches the cluster's encryption status.
Promise<EncryptionAtRestMode> encryptionAtRestMode;
CounterCollection clusterControllerMetrics; CounterCollection clusterControllerMetrics;
Counter openDatabaseRequests; Counter openDatabaseRequests;

View File

@ -132,7 +132,7 @@ private:
try { try {
wait(self->cstate.setExclusive( wait(self->cstate.setExclusive(
BinaryWriter::toValue(newState, IncludeVersion(ProtocolVersion::withDBCoreState())))); BinaryWriter::toValue(newState, IncludeVersion(ProtocolVersion::withEncryptionAtRest()))));
} catch (Error& e) { } catch (Error& e) {
CODE_PROBE(true, "Master displaced during writeMasterState"); CODE_PROBE(true, "Master displaced during writeMasterState");
throw; throw;

View File

@ -28,6 +28,7 @@
#include "fdbrpc/ReplicationPolicy.h" #include "fdbrpc/ReplicationPolicy.h"
#include "fdbserver/LogSystemConfig.h" #include "fdbserver/LogSystemConfig.h"
#include "fdbserver/MasterInterface.h" #include "fdbserver/MasterInterface.h"
#include "flow/ObjectSerializerTraits.h"
class LogSet; class LogSet;
struct OldLogData; struct OldLogData;
@ -143,7 +144,7 @@ struct DBCoreState {
std::set<int8_t> pseudoLocalities; std::set<int8_t> pseudoLocalities;
ProtocolVersion newestProtocolVersion; ProtocolVersion newestProtocolVersion;
ProtocolVersion lowestCompatibleProtocolVersion; ProtocolVersion lowestCompatibleProtocolVersion;
EncryptionAtRestMode encryptionAtRestMode; EncryptionAtRestMode encryptionAtRestMode; // cluster encryption data at-rest mode
DBCoreState() DBCoreState()
: logRouterTags(0), txsTags(0), recoveryCount(0), logSystemType(LogSystemType::empty), : logRouterTags(0), txsTags(0), recoveryCount(0), logSystemType(LogSystemType::empty),

View File

@ -44,6 +44,8 @@ const StringRef TLOG_MSGS_PTREE_UPDATES_LATENCY_HISTOGRAM = "TLogMsgsPTreeUpdate
const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = "StorageUpdatesDurableLatency"_sr; const StringRef STORAGE_UPDATES_DURABLE_LATENCY_HISTOGRAM = "StorageUpdatesDurableLatency"_sr;
const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = "StorageCommitLatency"_sr; const StringRef STORAGE_COMMIT_LATENCY_HISTOGRAM = "StorageCommitLatency"_sr;
const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = "SSDurableVersionUpdateLatency"_sr; const StringRef SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM = "SSDurableVersionUpdateLatency"_sr;
const StringRef SS_READ_RANGE_BYTES_RETURNED_HISTOGRAM = "SSReadRangeBytesReturned"_sr;
const StringRef SS_READ_RANGE_BYTES_LIMIT_HISTOGRAM = "SSReadRangeBytesLimit"_sr;
struct StorageMetricSample { struct StorageMetricSample {
IndexedSet<Key, int64_t> sample; IndexedSet<Key, int64_t> sample;

View File

@ -175,6 +175,7 @@ struct ClusterControllerFullInterface {
tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new controller tlogRejoin; // sent by tlog (whether or not rebooted) to communicate with a new controller
RequestStream<struct BackupWorkerDoneRequest> notifyBackupWorkerDone; RequestStream<struct BackupWorkerDoneRequest> notifyBackupWorkerDone;
RequestStream<struct ChangeCoordinatorsRequest> changeCoordinators; RequestStream<struct ChangeCoordinatorsRequest> changeCoordinators;
RequestStream<struct GetEncryptionAtRestModeRequest> getEncryptionAtRestMode;
UID id() const { return clientInterface.id(); } UID id() const { return clientInterface.id(); }
bool operator==(ClusterControllerFullInterface const& r) const { return id() == r.id(); } bool operator==(ClusterControllerFullInterface const& r) const { return id() == r.id(); }
@ -189,7 +190,7 @@ struct ClusterControllerFullInterface {
getWorkers.getFuture().isReady() || registerMaster.getFuture().isReady() || getWorkers.getFuture().isReady() || registerMaster.getFuture().isReady() ||
getServerDBInfo.getFuture().isReady() || updateWorkerHealth.getFuture().isReady() || getServerDBInfo.getFuture().isReady() || updateWorkerHealth.getFuture().isReady() ||
tlogRejoin.getFuture().isReady() || notifyBackupWorkerDone.getFuture().isReady() || tlogRejoin.getFuture().isReady() || notifyBackupWorkerDone.getFuture().isReady() ||
changeCoordinators.getFuture().isReady(); changeCoordinators.getFuture().isReady() || getEncryptionAtRestMode.getFuture().isReady();
} }
void initEndpoints() { void initEndpoints() {
@ -206,6 +207,7 @@ struct ClusterControllerFullInterface {
tlogRejoin.getEndpoint(TaskPriority::MasterTLogRejoin); tlogRejoin.getEndpoint(TaskPriority::MasterTLogRejoin);
notifyBackupWorkerDone.getEndpoint(TaskPriority::ClusterController); notifyBackupWorkerDone.getEndpoint(TaskPriority::ClusterController);
changeCoordinators.getEndpoint(TaskPriority::DefaultEndpoint); changeCoordinators.getEndpoint(TaskPriority::DefaultEndpoint);
getEncryptionAtRestMode.getEndpoint(TaskPriority::ClusterController);
} }
template <class Ar> template <class Ar>
@ -226,7 +228,8 @@ struct ClusterControllerFullInterface {
updateWorkerHealth, updateWorkerHealth,
tlogRejoin, tlogRejoin,
notifyBackupWorkerDone, notifyBackupWorkerDone,
changeCoordinators); changeCoordinators,
getEncryptionAtRestMode);
} }
}; };
@ -572,6 +575,33 @@ struct BackupWorkerDoneRequest {
} }
}; };
// Reply payload for GetEncryptionAtRestModeRequest. The mode travels as a raw 32-bit integer.
struct GetEncryptionAtRestModeResponse {
	static constexpr FileIdentifier file_identifier = 2932156;

	// Numeric encryption-at-rest mode; EncryptionAtRestMode::Mode::DISABLED unless a value is supplied.
	uint32_t mode = EncryptionAtRestMode::Mode::DISABLED;

	GetEncryptionAtRestModeResponse() = default;
	GetEncryptionAtRestModeResponse(uint32_t m) : mode(m) {}

	// Serializes the single field `mode`.
	template <class Archive>
	void serialize(Archive& ar) {
		serializer(ar, mode);
	}
};
// Request whose reply is a GetEncryptionAtRestModeResponse.
struct GetEncryptionAtRestModeRequest {
	static constexpr FileIdentifier file_identifier = 2670826;

	// Identifier supplied by the requester (named for the TLog issuing the request).
	UID tlogId;
	// Promise through which the GetEncryptionAtRestModeResponse is delivered.
	ReplyPromise<GetEncryptionAtRestModeResponse> reply;

	GetEncryptionAtRestModeRequest() = default;
	GetEncryptionAtRestModeRequest(UID tId) : tlogId(tId) {}

	// Serializes tlogId followed by reply; the order is part of the message layout.
	template <class Archive>
	void serialize(Archive& ar) {
		serializer(ar, tlogId, reply);
	}
};
struct InitializeTLogRequest { struct InitializeTLogRequest {
constexpr static FileIdentifier file_identifier = 15604392; constexpr static FileIdentifier file_identifier = 15604392;
UID recruitmentID; UID recruitmentID;

View File

@ -595,8 +595,11 @@ struct ChangeFeedInfo : ReferenceCounted<ChangeFeedInfo> {
auto toRemove = moveTriggers.modify(range); auto toRemove = moveTriggers.modify(range);
for (auto triggerRange = toRemove.begin(); triggerRange != toRemove.end(); ++triggerRange) { for (auto triggerRange = toRemove.begin(); triggerRange != toRemove.end(); ++triggerRange) {
auto streamToRemove = triggerRange->value().find(streamUID); auto streamToRemove = triggerRange->value().find(streamUID);
ASSERT(streamToRemove != triggerRange->cvalue().end()); if (streamToRemove == triggerRange->cvalue().end()) {
triggerRange->value().erase(streamToRemove); ASSERT(destroyed);
} else {
triggerRange->value().erase(streamToRemove);
}
} }
// TODO: may be more cleanup possible here // TODO: may be more cleanup possible here
} }
@ -732,6 +735,9 @@ public:
Reference<Histogram> storageUpdatesDurableLatencyHistogram; Reference<Histogram> storageUpdatesDurableLatencyHistogram;
Reference<Histogram> storageCommitLatencyHistogram; Reference<Histogram> storageCommitLatencyHistogram;
Reference<Histogram> ssDurableVersionUpdateLatencyHistogram; Reference<Histogram> ssDurableVersionUpdateLatencyHistogram;
// Histograms of requests sent to KVS.
Reference<Histogram> readRangeBytesReturnedHistogram;
Reference<Histogram> readRangeBytesLimitHistogram;
// watch map operations // watch map operations
Reference<ServerWatchMetadata> getWatchMetadata(KeyRef key) const; Reference<ServerWatchMetadata> getWatchMetadata(KeyRef key) const;
@ -1293,6 +1299,12 @@ public:
ssDurableVersionUpdateLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP, ssDurableVersionUpdateLatencyHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM, SS_DURABLE_VERSION_UPDATE_LATENCY_HISTOGRAM,
Histogram::Unit::microseconds)), Histogram::Unit::microseconds)),
readRangeBytesReturnedHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_READ_RANGE_BYTES_RETURNED_HISTOGRAM,
Histogram::Unit::countLinear)),
readRangeBytesLimitHistogram(Histogram::getHistogram(STORAGESERVER_HISTOGRAM_GROUP,
SS_READ_RANGE_BYTES_LIMIT_HISTOGRAM,
Histogram::Unit::countLinear)),
tag(invalidTag), poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0), tag(invalidTag), poppedAllAfter(std::numeric_limits<Version>::max()), cpuUsage(0.0), diskUsage(0.0),
storage(this, storage), shardChangeCounter(0), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0), storage(this, storage), shardChangeCounter(0), lastTLogVersion(0), lastVersionWithData(0), restoredVersion(0),
prevVersion(0), rebootAfterDurableVersion(std::numeric_limits<Version>::max()), prevVersion(0), rebootAfterDurableVersion(std::numeric_limits<Version>::max()),
@ -3457,6 +3469,8 @@ ACTOR Future<GetKeyValuesReply> readRange(StorageServer* data,
RangeResult atStorageVersion = RangeResult atStorageVersion =
wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes, options)); wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes, options));
data->counters.kvScanBytes += atStorageVersion.logicalSize(); data->counters.kvScanBytes += atStorageVersion.logicalSize();
data->readRangeBytesReturnedHistogram->sample(atStorageVersion.logicalSize());
data->readRangeBytesLimitHistogram->sample(*pLimitBytes);
ASSERT(atStorageVersion.size() <= limit); ASSERT(atStorageVersion.size() <= limit);
if (data->storageVersion() > version) { if (data->storageVersion() > version) {
@ -3552,6 +3566,8 @@ ACTOR Future<GetKeyValuesReply> readRange(StorageServer* data,
RangeResult atStorageVersion = RangeResult atStorageVersion =
wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes, options)); wait(data->storage.readRange(KeyRangeRef(readBegin, readEnd), limit, *pLimitBytes, options));
data->counters.kvScanBytes += atStorageVersion.logicalSize(); data->counters.kvScanBytes += atStorageVersion.logicalSize();
data->readRangeBytesReturnedHistogram->sample(atStorageVersion.logicalSize());
data->readRangeBytesLimitHistogram->sample(*pLimitBytes);
ASSERT(atStorageVersion.size() <= -limit); ASSERT(atStorageVersion.size() <= -limit);
if (data->storageVersion() > version) if (data->storageVersion() > version)

View File

@ -131,6 +131,10 @@ struct ConsistencyCheckWorkload : TestWorkload {
try { try {
wait(timeoutError(quietDatabase(cx, self->dbInfo, "ConsistencyCheckStart", 0, 1e5, 0, 0), wait(timeoutError(quietDatabase(cx, self->dbInfo, "ConsistencyCheckStart", 0, 1e5, 0, 0),
self->quiescentWaitTimeout)); // FIXME: should be zero? self->quiescentWaitTimeout)); // FIXME: should be zero?
if (g_network->isSimulated()) {
g_simulator->quiesced = true;
TraceEvent("ConsistencyCheckQuiesced").detail("Quiesced", g_simulator->quiesced);
}
} catch (Error& e) { } catch (Error& e) {
TraceEvent("ConsistencyCheck_QuietDatabaseError").error(e); TraceEvent("ConsistencyCheck_QuietDatabaseError").error(e);
self->testFailure("Unable to achieve a quiet database"); self->testFailure("Unable to achieve a quiet database");
@ -201,6 +205,10 @@ struct ConsistencyCheckWorkload : TestWorkload {
when(wait(self->suspendConsistencyCheck.onChange())) {} when(wait(self->suspendConsistencyCheck.onChange())) {}
} }
} }
if (self->firstClient && g_network->isSimulated() && self->performQuiescentChecks) {
g_simulator->quiesced = false;
TraceEvent("ConsistencyCheckQuiescedEnd").detail("Quiesced", g_simulator->quiesced);
}
return Void(); return Void();
} }

View File

@ -665,23 +665,6 @@ struct TenantManagementWorkload : TestWorkload {
return Void(); return Void();
} }
// Obtains a read version from `tr`. The two tolerated GRV errors (grv_proxy_memory_limit_exceeded and
// batch_transaction_throttled) are passed to tr->onError() and the read is attempted again; any other
// error is rethrown to the caller.
ACTOR static Future<Version> getReadVersion(Reference<ReadYourWritesTransaction> tr) {
	loop {
		try {
			Version readVersion = wait(tr->getReadVersion());
			return readVersion;
		} catch (Error& e) {
			const bool tolerated = e.code() == error_code_grv_proxy_memory_limit_exceeded ||
			                       e.code() == error_code_batch_transaction_throttled;
			if (!tolerated) {
				throw;
			}
			// Let the transaction process the error, then go around the loop for another attempt.
			wait(tr->onError(e));
		}
	}
}
ACTOR static Future<Void> deleteTenant(TenantManagementWorkload* self) { ACTOR static Future<Void> deleteTenant(TenantManagementWorkload* self) {
state TenantName beginTenant = self->chooseTenantName(true); state TenantName beginTenant = self->chooseTenantName(true);
state OperationType operationType = self->randomOperationType(); state OperationType operationType = self->randomOperationType();
@ -772,7 +755,8 @@ struct TenantManagementWorkload : TestWorkload {
state bool retried = false; state bool retried = false;
loop { loop {
try { try {
state Version beforeVersion = wait(self->getReadVersion(tr)); state Version beforeVersion =
wait(getLatestReadVersion(self, OperationType::MANAGEMENT_DATABASE));
Optional<Void> result = Optional<Void> result =
wait(timeout(deleteTenantImpl(tr, beginTenant, endTenant, tenants, operationType, self), wait(timeout(deleteTenantImpl(tr, beginTenant, endTenant, tenants, operationType, self),
deterministicRandom()->randomInt(1, 30))); deterministicRandom()->randomInt(1, 30)));
@ -780,8 +764,8 @@ struct TenantManagementWorkload : TestWorkload {
if (result.present()) { if (result.present()) {
if (anyExists) { if (anyExists) {
if (self->oldestDeletionVersion == 0 && !tenants.empty()) { if (self->oldestDeletionVersion == 0 && !tenants.empty()) {
tr->reset(); Version afterVersion =
Version afterVersion = wait(self->getReadVersion(tr)); wait(self->getLatestReadVersion(self, OperationType::MANAGEMENT_DATABASE));
self->oldestDeletionVersion = afterVersion; self->oldestDeletionVersion = afterVersion;
} }
self->newestDeletionVersion = beforeVersion; self->newestDeletionVersion = beforeVersion;

View File

@ -34,7 +34,7 @@ class TransactionCostWorkload : public TestWorkload {
return bw.toValue().withPrefix(prefix); return bw.toValue().withPrefix(prefix);
} }
static Value getValue(uint32_t size) { return makeString(size); } static Value getValue(uint32_t size) { return ValueRef(std::string(size, '\x00')); }
static UID getDebugID(uint64_t testNumber) { return UID(testNumber << 32, testNumber << 32); } static UID getDebugID(uint64_t testNumber) { return UID(testNumber << 32, testNumber << 32); }

View File

@ -25,6 +25,7 @@
#include "flow/JsonTraceLogFormatter.h" #include "flow/JsonTraceLogFormatter.h"
#include "flow/flow.h" #include "flow/flow.h"
#include "flow/DeterministicRandom.h" #include "flow/DeterministicRandom.h"
#include <exception>
#include <stdlib.h> #include <stdlib.h>
#include <stdarg.h> #include <stdarg.h>
#include <cctype> #include <cctype>
@ -514,25 +515,29 @@ public:
void close() { void close() {
if (opened) { if (opened) {
MutexHolder hold(mutex); try {
MutexHolder hold(mutex);
// Write remaining contents // Write remaining contents
auto a = new WriterThread::WriteBuffer(std::move(eventBuffer)); auto a = new WriterThread::WriteBuffer(std::move(eventBuffer));
loggedLength += bufferLength; loggedLength += bufferLength;
eventBuffer = std::vector<TraceEventFields>(); eventBuffer = std::vector<TraceEventFields>();
bufferLength = 0; bufferLength = 0;
writer->post(a); writer->post(a);
auto c = new WriterThread::Close(); auto c = new WriterThread::Close();
writer->post(c); writer->post(c);
ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>); ThreadFuture<Void> f(new ThreadSingleAssignmentVar<Void>);
barriers->push(f); barriers->push(f);
writer->post(new WriterThread::Barrier); writer->post(new WriterThread::Barrier);
f.getBlocking(); f.getBlocking();
opened = false; opened = false;
} catch (const std::exception& e) {
fprintf(stderr, "Error closing trace file: %s\n", e.what());
}
} }
} }

View File

@ -178,7 +178,7 @@ RUN yum -y install \
rm -rf /var/cache/yum rm -rf /var/cache/yum
WORKDIR /tmp WORKDIR /tmp
RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \ RUN NO_PROXY="" no_proxy="" curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \ echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \
sha256sum --quiet -c kubectl.txt && \ sha256sum --quiet -c kubectl.txt && \
mv kubectl /usr/local/bin/kubectl && \ mv kubectl /usr/local/bin/kubectl && \

View File

@ -53,7 +53,7 @@ RUN curl -Ls https://github.com/krallin/tini/releases/download/v0.19.0/tini-amd6
mv tini /usr/bin/ && \ mv tini /usr/bin/ && \
rm -rf /tmp/* rm -rf /tmp/*
RUN curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \ RUN NO_PROXY="" no_proxy="" curl -Ls https://s3.us-west-2.amazonaws.com/amazon-eks/1.22.6/2022-03-09/bin/linux/amd64/kubectl -o kubectl && \
echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \ echo "860c3d37a5979491895767e7332404d28dc0d7797c7673c33df30ca80e215a07 kubectl" > kubectl.txt && \
sha256sum --quiet -c kubectl.txt && \ sha256sum --quiet -c kubectl.txt && \
mv kubectl /usr/local/bin/kubectl && \ mv kubectl /usr/local/bin/kubectl && \

View File

@ -239,7 +239,7 @@ if(WITH_PYTHON)
add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml) add_fdb_test(TEST_FILES rare/RedwoodCorrectnessBTree.toml)
add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml) add_fdb_test(TEST_FILES rare/RedwoodDeltaTree.toml)
add_fdb_test(TEST_FILES rare/Throttling.toml) add_fdb_test(TEST_FILES rare/Throttling.toml)
add_fdb_test(TEST_FILES rare/ThroughputQuota.toml) add_fdb_test(TEST_FILES rare/ThroughputQuota.toml IGNORE)
add_fdb_test(TEST_FILES rare/TransactionCost.toml) add_fdb_test(TEST_FILES rare/TransactionCost.toml)
add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml) add_fdb_test(TEST_FILES rare/TransactionTagApiCorrectness.toml)
add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml) add_fdb_test(TEST_FILES rare/TransactionTagSwizzledApiCorrectness.toml)

View File

@ -280,11 +280,13 @@ class UpgradeTest:
os.close(self.ctrl_pipe) os.close(self.ctrl_pipe)
# Kill the tester process if it is still alive # Kill the tester process if it is still alive
def kill_tester_if_alive(self, workload_thread): def kill_tester_if_alive(self, workload_thread, dump_stacks):
if not workload_thread.is_alive(): if not workload_thread.is_alive():
return return
if self.tester_proc is not None: if self.tester_proc is not None:
try: try:
if dump_stacks:
os.system("pstack {}".format(self.tester_proc.pid))
print("Killing the tester process") print("Killing the tester process")
self.tester_proc.kill() self.tester_proc.kill()
workload_thread.join(5) workload_thread.join(5)
@ -310,11 +312,11 @@ class UpgradeTest:
except Exception: except Exception:
print("Upgrade test failed") print("Upgrade test failed")
print(traceback.format_exc()) print(traceback.format_exc())
self.kill_tester_if_alive(workload_thread) self.kill_tester_if_alive(workload_thread, False)
finally: finally:
workload_thread.join(5) workload_thread.join(5)
reader_thread.join(5) reader_thread.join(5)
self.kill_tester_if_alive(workload_thread) self.kill_tester_if_alive(workload_thread, True)
if test_retcode == 0: if test_retcode == 0:
test_retcode = self.tester_retcode test_retcode = self.tester_retcode
return test_retcode return test_retcode