diff --git a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py index e704cacb72..37b1132de8 100644 --- a/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py +++ b/contrib/transaction_profiling_analyzer/transaction_profiling_analyzer.py @@ -194,7 +194,7 @@ class BaseInfo(object): if protocol_version >= PROTOCOL_VERSION_6_3: self.dc_id = bb.get_bytes_with_length() if protocol_version >= PROTOCOL_VERSION_7_1: - if bb.get_bytes(1): + if bb.get_bool(): self.tenant = bb.get_bytes_with_length() class GetVersionInfo(BaseInfo): diff --git a/fdbclient/BlobGranuleFiles.cpp b/fdbclient/BlobGranuleFiles.cpp index 14714b1559..3b1ee32eb9 100644 --- a/fdbclient/BlobGranuleFiles.cpp +++ b/fdbclient/BlobGranuleFiles.cpp @@ -22,6 +22,7 @@ #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/ClientKnobs.h" +#include "fdbclient/CommitTransaction.h" #include "fdbclient/Knobs.h" #include "fdbclient/SystemData.h" // for allKeys unit test - could remove @@ -41,8 +42,6 @@ #define BG_READ_DEBUG false -// FIXME: implement actual proper file format for this - // Implements granule file parsing and materialization with normal c++ functions (non-actors) so that this can be used // outside the FDB network thread. @@ -57,6 +56,78 @@ uint16_t MIN_SUPPORTED_BG_FORMAT_VERSION = 1; const uint8_t SNAPSHOT_FILE_TYPE = 'S'; const uint8_t DELTA_FILE_TYPE = 'D'; +// Deltas in key order + +// For key-ordered delta files, the format for both sets and range clears is that you store boundaries ordered by key. +// Each boundary has a corresponding key, zero or more versioned updates (ValueAndVersionRef), and optionally a clear +// from keyAfter(key) to the next boundary, at a version. +// A streaming merge is more efficient than applying deltas one by one to restore to a later version from the snapshot. 
+// The concept of this versioned mutation boundaries is repurposed directly from a prior version of redwood, back when +// it supported versioned data. +struct ValueAndVersionRef { + Version version; + MutationRef::Type op; // only set/clear + ValueRef value; // only present for set + + ValueAndVersionRef() {} + // create clear + explicit ValueAndVersionRef(Version version) : version(version), op(MutationRef::Type::ClearRange) {} + // create set + explicit ValueAndVersionRef(Version version, ValueRef value) + : version(version), op(MutationRef::Type::SetValue), value(value) {} + ValueAndVersionRef(Arena& arena, const ValueAndVersionRef& copyFrom) + : version(copyFrom.version), op(copyFrom.op), value(arena, copyFrom.value) {} + + bool isSet() const { return op == MutationRef::SetValue; } + bool isClear() const { return op == MutationRef::ClearRange; } + + int totalSize() const { return sizeof(ValueAndVersionRef) + value.size(); } + int expectedSize() const { return value.size(); } + + struct OrderByVersion { + bool operator()(ValueAndVersionRef const& a, ValueAndVersionRef const& b) const { + return a.version < b.version; + } + }; + + template + void serialize(Ar& ar) { + serializer(ar, version, op, value); + } +}; + +struct DeltaBoundaryRef { + // key + KeyRef key; + // updates to exactly this key + VectorRef values; + // clear version from keyAfter(key) up to the next boundary + Optional clearVersion; + + DeltaBoundaryRef() {} + DeltaBoundaryRef(Arena& ar, const DeltaBoundaryRef& copyFrom) + : key(ar, copyFrom.key), values(ar, copyFrom.values), clearVersion(copyFrom.clearVersion) {} + + int totalSize() { return sizeof(DeltaBoundaryRef) + key.expectedSize() + values.expectedSize(); } + int expectedSize() const { return key.expectedSize() + values.expectedSize(); } + + template + void serialize(Ar& ar) { + serializer(ar, key, values, clearVersion); + } +}; + +struct GranuleSortedDeltas { + constexpr static FileIdentifier file_identifier = 8183903; + + VectorRef 
boundaries; + + template + void serialize(Ar& ar) { + serializer(ar, boundaries); + } +}; + struct ChildBlockPointerRef { StringRef key; uint32_t offset; @@ -177,7 +248,7 @@ struct IndexBlockRef { cipherKeysCtx.ivRef.begin(), AES_256_IV_LENGTH, ENCRYPT_HEADER_AUTH_TOKEN_MODE_SINGLE); - Value serializedBuff = ObjectWriter::toValue(block, Unversioned()); + Value serializedBuff = ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); BlobCipherEncryptHeader header; buffer = encryptor.encrypt(serializedBuff.contents().begin(), serializedBuff.contents().size(), &header, arena) ->toStringRef(); @@ -214,19 +285,20 @@ struct IndexBlockRef { } // TODO: Add version? - ObjectReader dataReader(decrypted.begin(), Unversioned()); + ObjectReader dataReader(decrypted.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, idxRef.block, arena); } void init(Optional cipherKeysCtx, Arena& arena) { if (encryptHeaderRef.present()) { + CODE_PROBE(true, "reading encrypted chunked file"); ASSERT(cipherKeysCtx.present()); decrypt(cipherKeysCtx.get(), *this, arena); } else { TraceEvent("IndexBlockSize").detail("Sz", buffer.size()); // TODO: Add version? - ObjectReader dataReader(buffer.begin(), Unversioned()); + ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, block, arena); } } @@ -242,7 +314,8 @@ struct IndexBlockRef { encrypt(cipherKeysCtx.get(), arena); } else { encryptHeaderRef.reset(); - buffer = StringRef(arena, ObjectWriter::toValue(block, Unversioned()).contents()); + buffer = StringRef( + arena, ObjectWriter::toValue(block, IncludeVersion(ProtocolVersion::withBlobGranuleFile())).contents()); } TraceEvent(SevDebug, "IndexBlockSize").detail("Sz", buffer.size()).detail("Encrypted", cipherKeysCtx.present()); @@ -259,8 +332,8 @@ struct IndexBlockRef { // Encryption: A 'chunk' gets encrypted before getting persisted if enabled. 
Encryption header is persisted along with // the chunk data to assist decryption on reads. // -// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algoritm) infomration -// is persisted as part of 'chunk metadata' to assist decompression on reads. +// Compression: A 'chunk' gets compressed before getting persisted if enabled. Compression filter (algorithm) +// information is persisted as part of 'chunk metadata' to assist decompression on reads. struct IndexBlobGranuleFileChunkRef { constexpr static FileIdentifier file_identifier = 2814019; @@ -378,19 +451,18 @@ struct IndexBlobGranuleFileChunkRef { : CompressionUtils::toString(CompressionFilter::NONE)); } - // TODO: Add version? - return ObjectWriter::toValue(chunkRef, Unversioned()); + return ObjectWriter::toValue(chunkRef, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); } static IndexBlobGranuleFileChunkRef fromBytes(Optional cipherKeysCtx, StringRef buffer, Arena& arena) { IndexBlobGranuleFileChunkRef chunkRef; - // TODO: Add version? - ObjectReader dataReader(buffer.begin(), Unversioned()); + ObjectReader dataReader(buffer.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, chunkRef, arena); if (chunkRef.encryptHeaderRef.present()) { + CODE_PROBE(true, "reading encrypted file chunk"); ASSERT(cipherKeysCtx.present()); chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decrypt(cipherKeysCtx.get(), chunkRef, arena); } else { @@ -398,6 +470,7 @@ struct IndexBlobGranuleFileChunkRef { } if (chunkRef.compressionFilter.present()) { + CODE_PROBE(true, "reading compressed file chunk"); chunkRef.chunkBytes = IndexBlobGranuleFileChunkRef::decompress(chunkRef, arena); } else if (!chunkRef.chunkBytes.present()) { // 'Encryption' & 'Compression' aren't enabled. 
@@ -441,9 +514,9 @@ struct IndexedBlobGranuleFile { // Non-serialized member fields StringRef fileBytes; - void init(const Optional cipherKeysCtx) { + void init(uint8_t fType, const Optional cipherKeysCtx) { formatVersion = LATEST_BG_FORMAT_VERSION; - fileType = SNAPSHOT_FILE_TYPE; + fileType = fType; chunkStartOffset = -1; } @@ -459,8 +532,7 @@ struct IndexedBlobGranuleFile { // parse index block at head of file Arena arena; IndexedBlobGranuleFile file; - // TODO: version? - ObjectReader dataReader(fileBytes.begin(), Unversioned()); + ObjectReader dataReader(fileBytes.begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, file, arena); file.init(fileBytes, arena, cipherKeysCtx); @@ -521,8 +593,7 @@ struct IndexedBlobGranuleFile { IndexBlobGranuleFileChunkRef::fromBytes(cipherKeysCtx, childData, childArena); ChildType child; - // TODO: version? - ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), Unversioned()); + ObjectReader dataReader(chunkRef.chunkBytes.get().begin(), IncludeVersion()); dataReader.deserialize(FileIdentifierFor::value, child, childArena); // TODO implement some sort of decrypted+decompressed+deserialized cache, if this object gets reused? @@ -542,29 +613,59 @@ struct IndexedBlobGranuleFile { Value serializeIndexBlock(Standalone& file, Optional cipherKeysCtx) { file.indexBlockRef.finalize(cipherKeysCtx, file.arena()); - // TODO: version? - Value serialized = ObjectWriter::toValue(file, Unversioned()); + Value serialized = ObjectWriter::toValue(file, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); file.chunkStartOffset = serialized.contents().size(); if (BG_ENCRYPT_COMPRESS_DEBUG) { TraceEvent(SevDebug, "SerializeIndexBlock").detail("StartOffset", file.chunkStartOffset); } - return ObjectWriter::toValue(file, Unversioned()); + return ObjectWriter::toValue(file, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); } -// TODO: this should probably be in actor file with yields? 
+Value serializeFileFromChunks(Standalone& file, + Optional cipherKeysCtx, + std::vector& chunks, + int previousChunkBytes) { + Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx); + int32_t indexSize = indexBlockBytes.size(); + chunks[0] = indexBlockBytes; + + // TODO: write this directly to stream to avoid extra copy? + Arena ret; + + size_t size = indexSize + previousChunkBytes; + uint8_t* buffer = new (ret) uint8_t[size]; + uint8_t* bufferStart = buffer; + + int idx = 0; + for (auto& it : chunks) { + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "SerializeFile") + .detail("ChunkIdx", idx++) + .detail("Size", it.size()) + .detail("Offset", buffer - bufferStart); + } + buffer = it.copyTo(buffer); + } + ASSERT(size == buffer - bufferStart); + + return Standalone(StringRef(bufferStart, size), ret); +} + +// TODO: this should probably be in actor file with yields? - move writing logic to separate actor file in server? // TODO: optimize memory copying // TODO: sanity check no oversized files Value serializeChunkedSnapshot(Standalone snapshot, - int chunkCount, + int targetChunkBytes, Optional compressFilter, Optional cipherKeysCtx) { + CODE_PROBE(compressFilter.present(), "serializing compressed snapshot file"); + CODE_PROBE(cipherKeysCtx.present(), "serializing encrypted snapshot file"); Standalone file; - file.init(cipherKeysCtx); + file.init(SNAPSHOT_FILE_TYPE, cipherKeysCtx); - size_t targetChunkBytes = snapshot.expectedSize() / chunkCount; size_t currentChunkBytesEstimate = 0; size_t previousChunkBytes = 0; @@ -572,7 +673,6 @@ Value serializeChunkedSnapshot(Standalone snapshot, chunks.push_back(Value()); // dummy value for index block Standalone currentChunk; - // fmt::print("Chunk index:\n"); for (int i = 0; i < snapshot.size(); i++) { // TODO REMOVE sanity check if (i > 0) { @@ -583,8 +683,8 @@ Value serializeChunkedSnapshot(Standalone snapshot, currentChunkBytesEstimate += snapshot[i].expectedSize(); if (currentChunkBytesEstimate >= 
targetChunkBytes || i == snapshot.size() - 1) { - // TODO: protocol version - Value serialized = ObjectWriter::toValue(currentChunk, Unversioned()); + Value serialized = + ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); Value chunkBytes = IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); chunks.push_back(chunkBytes); @@ -613,37 +713,12 @@ Value serializeChunkedSnapshot(Standalone snapshot, file.arena(), keyAfter(snapshot.back().key), previousChunkBytes); } - Value indexBlockBytes = serializeIndexBlock(file, cipherKeysCtx); - int32_t indexSize = indexBlockBytes.size(); - chunks[0] = indexBlockBytes; - - // TODO: write this directly to stream to avoid extra copy? - Arena ret; - - size_t size = indexSize + previousChunkBytes; - uint8_t* buffer = new (ret) uint8_t[size]; - - previousChunkBytes = 0; - int idx = 0; - for (auto& it : chunks) { - if (BG_ENCRYPT_COMPRESS_DEBUG) { - TraceEvent(SevDebug, "SerializeSnapshot") - .detail("ChunkIdx", idx++) - .detail("Size", it.size()) - .detail("Offset", previousChunkBytes); - } - - memcpy(buffer + previousChunkBytes, it.begin(), it.size()); - previousChunkBytes += it.size(); - } - ASSERT(size == previousChunkBytes); - - return Standalone(StringRef(buffer, size), ret); + return serializeFileFromChunks(file, cipherKeysCtx, chunks, previousChunkBytes); } // TODO: use redwood prefix trick to optimize cpu comparison static Arena loadSnapshotFile(const StringRef& snapshotData, - KeyRangeRef keyRange, + const KeyRangeRef& keyRange, std::map& dataMap, Optional cipherKeysCtx) { Arena rootArena; @@ -691,7 +766,376 @@ static Arena loadSnapshotFile(const StringRef& snapshotData, return rootArena; } -static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map& dataMap) { +typedef std::map> SortedDeltasT; + +// FIXME: optimize all of this with common prefix comparison stuff +SortedDeltasT::iterator insertMutationBoundary(SortedDeltasT& 
deltasByKey, const KeyRef& boundary) { + // Find the first split point in buffer that is >= key + auto it = deltasByKey.lower_bound(boundary); + + // Since the map contains fileRange already, we had to have found something + ASSERT(it != deltasByKey.end()); + if (it->first == boundary) { + return it; + } + + // new boundary, using find as insert hint + it = deltasByKey.insert(it, { boundary, Standalone() }); + + // look back at previous entry to see if this boundary is already cleared to at a prior version + ASSERT(it != deltasByKey.begin()); + auto itPrev = it; + --itPrev; + + if (itPrev->second.clearVersion.present()) { + it->second.clearVersion = itPrev->second.clearVersion; + it->second.values.push_back(it->second.arena(), ValueAndVersionRef(it->second.clearVersion.get())); + } + + return it; +} + +void updateMutationBoundary(Standalone& boundary, const ValueAndVersionRef& update) { + if (update.isSet()) { + if (boundary.values.empty() || boundary.values.back().version < update.version) { + // duplicate same set even if it's the same as the last one, so beginVersion reads still get updates + boundary.values.push_back(boundary.arena(), update); + } else { + CODE_PROBE(true, "multiple boundary updates at same version (set)"); + // preserve inter-mutation order by replacing this one + boundary.values.back() = update; + } + } else { + if (boundary.values.empty() || + (boundary.values.back().isSet() && boundary.values.back().version < update.version)) { + // don't duplicate single-key clears in order if previous was also a clear, since it's a no-op when starting + // with beginVersion + boundary.values.push_back(boundary.arena(), update); + } else if (!boundary.values.empty() && boundary.values.back().version == update.version) { + CODE_PROBE(true, "multiple boundary updates at same version (clear)"); + if (boundary.values.back().isSet()) { + // if the last 2 updates were clear @ v1 and set @ v2, and we now have a clear at v2, just pop off the + // set and leave the 
previous clear. Otherwise, just set the last set to a clear + if (boundary.values.size() >= 2 && boundary.values[boundary.values.size() - 2].isClear()) { + CODE_PROBE(true, "clear then set/clear at same version optimization"); + boundary.values.pop_back(); + } else { + boundary.values.back() = update; + } + } // else we have 2 consecutive clears at this version, no-op + } + } +} + +// TODO: investigate more cpu-efficient sorting methods. Potential options: +// 1) Replace std::map with ART mutation buffer +// 2) sort updates and clear endpoints by (key, version), and keep track of active clears. +void sortDeltasByKey(const Standalone& deltasByVersion, + const KeyRangeRef& fileRange, + SortedDeltasT& deltasByKey) { + if (deltasByVersion.empty()) { + return; + } + if (deltasByKey.empty()) { + deltasByKey.insert({ fileRange.begin, Standalone() }); + deltasByKey.insert({ fileRange.end, Standalone() }); + } + for (auto& it : deltasByVersion) { + for (auto& m : it.mutations) { + // TODO REMOVE validation + ASSERT(fileRange.contains(m.param1)); + if (m.type == MutationRef::ClearRange) { + ASSERT(m.param2 <= fileRange.end); + // handle single key clear more efficiently + if (equalsKeyAfter(m.param1, m.param2)) { + SortedDeltasT::iterator key = insertMutationBoundary(deltasByKey, m.param1); + updateMutationBoundary(key->second, ValueAndVersionRef(it.version)); + } else { + // Update each boundary in the cleared range + SortedDeltasT::iterator begin = insertMutationBoundary(deltasByKey, m.param1); + SortedDeltasT::iterator end = insertMutationBoundary(deltasByKey, m.param2); + while (begin != end) { + // Set the rangeClearedVersion if not set + if (!begin->second.clearVersion.present()) { + begin->second.clearVersion = it.version; + } + + // Add a clear to values if it's empty or the last item is not a clear + if (begin->second.values.empty() || begin->second.values.back().isSet()) { + updateMutationBoundary(begin->second, ValueAndVersionRef(it.version)); + } + ++begin; + } + 
} + } else { + Standalone& bound = insertMutationBoundary(deltasByKey, m.param1)->second; + updateMutationBoundary(bound, ValueAndVersionRef(it.version, m.param2)); + } + } + } + + // TODO: could do a scan through map and coalesce clears (if any boundaries with exactly 1 mutation (clear) and same + // clearVersion as previous guy) +} + +// FIXME: Could maybe reduce duplicated code between this and chunkedSnapshot for chunking +Value serializeChunkedDeltaFile(Standalone deltas, + const KeyRangeRef& fileRange, + int chunkSize, + Optional compressFilter, + Optional cipherKeysCtx) { + CODE_PROBE(compressFilter.present(), "serializing compressed delta file"); + CODE_PROBE(cipherKeysCtx.present(), "serializing encrypted delta file"); + Standalone file; + + file.init(DELTA_FILE_TYPE, cipherKeysCtx); + + // build in-memory version of boundaries - TODO separate functions + SortedDeltasT boundaries; + sortDeltasByKey(deltas, fileRange, boundaries); + + std::vector chunks; + chunks.push_back(Value()); // dummy value for index block + + Standalone currentChunk; + size_t currentChunkBytesEstimate = 0; + size_t previousChunkBytes = 0; + + // TODO REMOVE - for validation + KeyRef lastKey; + int i = 0; + for (auto& it : boundaries) { + // TODO REMOVE sanity check + if (i > 0) { + ASSERT(lastKey < it.first); + } + lastKey = it.first; + it.second.key = it.first; + + currentChunk.boundaries.push_back_deep(currentChunk.arena(), it.second); + currentChunkBytesEstimate += it.second.totalSize(); + + if (currentChunkBytesEstimate >= chunkSize || i == boundaries.size() - 1) { + Value serialized = + ObjectWriter::toValue(currentChunk, IncludeVersion(ProtocolVersion::withBlobGranuleFile())); + Value chunkBytes = + IndexBlobGranuleFileChunkRef::toBytes(cipherKeysCtx, compressFilter, serialized, file.arena()); + chunks.push_back(chunkBytes); + + // TODO remove validation + if (!file.indexBlockRef.block.children.empty()) { + ASSERT(file.indexBlockRef.block.children.back().key < 
currentChunk.boundaries.begin()->key); + } + file.indexBlockRef.block.children.emplace_back_deep( + file.arena(), currentChunk.boundaries.begin()->key, previousChunkBytes); + + if (BG_ENCRYPT_COMPRESS_DEBUG) { + TraceEvent(SevDebug, "ChunkSize") + .detail("ChunkBytes", chunkBytes.size()) + .detail("PrvChunkBytes", previousChunkBytes); + } + + previousChunkBytes += chunkBytes.size(); + currentChunkBytesEstimate = 0; + currentChunk = Standalone(); + } + i++; + } + ASSERT(currentChunk.boundaries.empty()); + if (!deltas.empty()) { + file.indexBlockRef.block.children.emplace_back_deep(file.arena(), fileRange.end, previousChunkBytes); + } + + return serializeFileFromChunks(file, cipherKeysCtx, chunks, previousChunkBytes); +} + +// Effectively the single DeltaBoundaryRef reduced to one update, but also with the key and clear after information. +// Sometimes at a given version, the boundary may only be necessary to represent a clear version after this key, or just +// an update/clear to this key, or both. 
+struct ParsedDeltaBoundaryRef { + KeyRef key; + MutationRef::Type op; // SetValue, ClearRange, or NoOp + ValueRef value; // null unless op == SetValue + bool clearAfter; + + // op constructor + ParsedDeltaBoundaryRef() {} + explicit ParsedDeltaBoundaryRef(KeyRef key, bool clearAfter, const ValueAndVersionRef& valueAndVersion) + : key(key), op(valueAndVersion.op), value(valueAndVersion.value), clearAfter(clearAfter) {} + // noop constructor + explicit ParsedDeltaBoundaryRef(KeyRef key, bool clearAfter) + : key(key), op(MutationRef::Type::NoOp), clearAfter(clearAfter) {} + ParsedDeltaBoundaryRef(Arena& arena, const ParsedDeltaBoundaryRef& copyFrom) + : key(arena, copyFrom.key), op(copyFrom.op), clearAfter(copyFrom.clearAfter) { + if (copyFrom.isSet()) { + value = StringRef(arena, copyFrom.value); + } + } + + bool isSet() const { return op == MutationRef::SetValue; } + bool isClear() const { return op == MutationRef::ClearRange; } + bool redundant(bool prevClearAfter) const { return op == MutationRef::Type::NoOp && clearAfter == prevClearAfter; } +}; + +// TODO could move ParsedDeltaBoundaryRef struct type up to granule common and make this a member of DeltaBoundaryRef? 
+ParsedDeltaBoundaryRef deltaAtVersion(const DeltaBoundaryRef& delta, Version beginVersion, Version readVersion) { + bool clearAfter = delta.clearVersion.present() && readVersion >= delta.clearVersion.get() && + beginVersion <= delta.clearVersion.get(); + if (delta.values.empty()) { + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } + auto valueAtVersion = std::lower_bound(delta.values.begin(), + delta.values.end(), + ValueAndVersionRef(readVersion), + ValueAndVersionRef::OrderByVersion()); + if (valueAtVersion == delta.values.begin() && readVersion < valueAtVersion->version) { + // deltas are all higher than read version + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } + // lower_bound() found version >= readVersion, so if we're at the end or it's not equal, go back one + if (valueAtVersion == delta.values.end() || valueAtVersion->version > readVersion) { + valueAtVersion--; + } + ASSERT(readVersion >= valueAtVersion->version); + // now, handle beginVersion (if update < beginVersion, it's a noop) + if (valueAtVersion->version < beginVersion) { + return ParsedDeltaBoundaryRef(delta.key, clearAfter); + } else { + return ParsedDeltaBoundaryRef(delta.key, clearAfter, *valueAtVersion); + } +} + +void applyDeltasSorted(const Standalone>& sortedDeltas, + bool startClear, + std::map& dataMap) { + if (sortedDeltas.empty() && !startClear) { + return; + } + + // sorted merge of 2 iterators + bool prevClear = startClear; + auto deltaIt = sortedDeltas.begin(); + auto snapshotIt = dataMap.begin(); + + while (deltaIt != sortedDeltas.end() && snapshotIt != dataMap.end()) { + if (deltaIt->key < snapshotIt->first) { + // Delta is lower than snapshot. Insert new row, if the delta is a set. 
Ignore point clear and noop + if (deltaIt->isSet()) { + snapshotIt = dataMap.insert(snapshotIt, { deltaIt->key, deltaIt->value }); + snapshotIt++; + } + prevClear = deltaIt->clearAfter; + deltaIt++; + } else if (snapshotIt->first < deltaIt->key) { + // Snapshot is lower than delta. Erase the current entry if the previous delta was a clearAfter + if (prevClear) { + snapshotIt = dataMap.erase(snapshotIt); + } else { + snapshotIt++; + } + } else { + // Delta and snapshot are for the same key. The delta is newer, so if it is a set, update the value, else if + // it's a clear, delete the value (ignore noop) + if (deltaIt->isSet()) { + snapshotIt->second = deltaIt->value; + } else if (deltaIt->isClear()) { + snapshotIt = dataMap.erase(snapshotIt); + } + if (!deltaIt->isClear()) { + snapshotIt++; + } + prevClear = deltaIt->clearAfter; + deltaIt++; + } + } + // Either we are out of deltas or out of snapshots. + // if snapshot remaining and prevClear last delta set, clear the rest of the map + if (prevClear && snapshotIt != dataMap.end()) { + CODE_PROBE(true, "last delta range cleared end of snapshot"); + dataMap.erase(snapshotIt, dataMap.end()); + } + // Apply remaining sets from delta, with no remaining snapshot + while (deltaIt != sortedDeltas.end()) { + if (deltaIt->isSet()) { + CODE_PROBE(true, "deltas past end of snapshot"); + snapshotIt = dataMap.insert(snapshotIt, { deltaIt->key, deltaIt->value }); + } + deltaIt++; + } +} + +// The arena owns the BoundaryDeltaRef struct data but the StringRef pointers point to data in deltaData, to avoid extra +// copying +Arena loadChunkedDeltaFile(const StringRef& deltaData, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + std::map& dataMap, + Optional cipherKeysCtx) { + Standalone> deltas; + Standalone file = IndexedBlobGranuleFile::fromFileBytes(deltaData, cipherKeysCtx); + + ASSERT(file.fileType == DELTA_FILE_TYPE); + ASSERT(file.chunkStartOffset > 0); + + // empty delta file + if 
(file.indexBlockRef.block.children.empty()) { + return deltas.arena(); + } + + ASSERT(file.indexBlockRef.block.children.size() >= 2); + + // TODO: refactor this out of delta tree + // int commonPrefixLen = commonPrefixLength(index.dataBlockOffsets.front().first, + // index.dataBlockOffsets.back().first); + + // find range of blocks needed to read + ChildBlockPointerRef* currentBlock = file.findStartBlock(keyRange.begin); + + // TODO cpu optimize (key check per block, prefixes, optimize start of first block) + bool startClear = false; + bool prevClearAfter = false; + while (currentBlock != (file.indexBlockRef.block.children.end() - 1) && keyRange.end > currentBlock->key) { + Standalone deltaBlock = + file.getChild(currentBlock, cipherKeysCtx, file.chunkStartOffset); + ASSERT(!deltaBlock.boundaries.empty()); + ASSERT(currentBlock->key == deltaBlock.boundaries.front().key); + + // TODO refactor this into function to share with memory deltas + bool blockMemoryUsed = false; + + for (auto& entry : deltaBlock.boundaries) { + ParsedDeltaBoundaryRef boundary = deltaAtVersion(entry, beginVersion, readVersion); + if (entry.key < keyRange.begin) { + startClear = boundary.clearAfter; + prevClearAfter = boundary.clearAfter; + } else if (entry.key < keyRange.end) { + if (!boundary.redundant(prevClearAfter)) { + deltas.push_back(deltas.arena(), boundary); + blockMemoryUsed = true; + prevClearAfter = boundary.clearAfter; + } + } else { + break; + } + } + if (blockMemoryUsed) { + deltas.arena().dependsOn(deltaBlock.arena()); + } + currentBlock++; + } + + // TODO REMOVE eventually? 
order sanity check for parsed deltas + for (int i = 0; i < deltas.size() - 1; i++) { + ASSERT(deltas[i].key < deltas[i + 1].key); + } + + applyDeltasSorted(deltas, startClear, dataMap); + + return deltas.arena(); +} + +static void applyDelta(const KeyRangeRef& keyRange, const MutationRef& m, std::map& dataMap) { if (m.type == MutationRef::ClearRange) { if (m.param2 <= keyRange.begin || m.param1 >= keyRange.end) { return; @@ -728,12 +1172,12 @@ static void applyDelta(KeyRangeRef keyRange, MutationRef m, std::map& dataMap) { +static void applyDeltasByVersion(const GranuleDeltas& deltas, + const KeyRangeRef& keyRange, + Version beginVersion, + Version readVersion, + Version& lastFileEndVersion, + std::map& dataMap) { if (deltas.empty()) { return; } @@ -768,34 +1212,6 @@ static void applyDeltas(const GranuleDeltas& deltas, lastFileEndVersion = deltas.back().version; } -static Arena loadDeltaFile(StringRef deltaData, - KeyRangeRef keyRange, - Version beginVersion, - Version readVersion, - Version& lastFileEndVersion, - std::map& dataMap) { - Arena parseArena; - GranuleDeltas deltas; - ObjectReader reader(deltaData.begin(), Unversioned()); - reader.deserialize(FileIdentifierFor::value, deltas, parseArena); - - if (BG_READ_DEBUG) { - fmt::print("Parsed {} deltas from file\n", deltas.size()); - } - - // TODO REMOVE sanity check - for (int i = 0; i < deltas.size() - 1; i++) { - if (deltas[i].version > deltas[i + 1].version) { - fmt::print( - "BG VERSION ORDER VIOLATION IN DELTA FILE: '{0}', '{1}'\n", deltas[i].version, deltas[i + 1].version); - } - ASSERT(deltas[i].version <= deltas[i + 1].version); - } - - applyDeltas(deltas, keyRange, beginVersion, readVersion, lastFileEndVersion, dataMap); - return parseArena; -} - RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, KeyRangeRef keyRange, Version beginVersion, @@ -829,14 +1245,15 @@ RangeResult materializeBlobGranule(const BlobGranuleChunkRef& chunk, fmt::print("Applying {} delta files\n", 
chunk.deltaFiles.size()); } for (int deltaIdx = 0; deltaIdx < chunk.deltaFiles.size(); deltaIdx++) { - Arena deltaArena = loadDeltaFile( - deltaFileData[deltaIdx], requestRange, beginVersion, readVersion, lastFileEndVersion, dataMap); + Arena deltaArena = loadChunkedDeltaFile( + deltaFileData[deltaIdx], requestRange, beginVersion, readVersion, dataMap, chunk.cipherKeysCtx); arena.dependsOn(deltaArena); } if (BG_READ_DEBUG) { fmt::print("Applying {} memory deltas\n", chunk.newDeltas.size()); } - applyDeltas(chunk.newDeltas, requestRange, beginVersion, readVersion, lastFileEndVersion, dataMap); + // TODO: also sort these and do merge + applyDeltasByVersion(chunk.newDeltas, requestRange, beginVersion, readVersion, lastFileEndVersion, dataMap); RangeResult ret; for (auto& it : dataMap) { @@ -1150,28 +1567,93 @@ TEST_CASE("/blobgranule/files/applyDelta") { return Void(); } -// picks a number between 2^minExp and 2^maxExp, but uniformly distributed over exponential buckets 2^n an 2^n+1 -int randomExp(int minExp, int maxExp) { - if (minExp == maxExp) { // N=2, case - return 1 << minExp; +void checkDeltaAtVersion(const ParsedDeltaBoundaryRef& expected, + const DeltaBoundaryRef& boundary, + Version beginVersion, + Version readVersion) { + ParsedDeltaBoundaryRef actual = deltaAtVersion(boundary, beginVersion, readVersion); + ASSERT(expected.clearAfter == actual.clearAfter); + ASSERT(expected.op == actual.op); + if (expected.isSet()) { + ASSERT(expected.value == actual.value); + } else { + ASSERT(actual.value.empty()); } - int val = 1 << deterministicRandom()->randomInt(minExp, maxExp); - ASSERT(val > 0); - return deterministicRandom()->randomInt(val, val * 2); } -void checkEmpty(const Value& serialized, Key begin, Key end, Optional cipherKeysCtx) { +TEST_CASE("/blobgranule/files/deltaAtVersion") { + Arena ar; + std::string keyStr = "k"; + std::string aStr = "a"; + + KeyRef key(ar, keyStr); + ValueAndVersionRef vv_a_3(3, ValueRef(ar, aStr)); + ValueAndVersionRef 
vv_clear_5(5); + + ParsedDeltaBoundaryRef resultEmpty(key, false); + ParsedDeltaBoundaryRef resultEmptyWithClear(key, true); + ParsedDeltaBoundaryRef resultSetA(key, false, vv_a_3); + ParsedDeltaBoundaryRef resultClearA(key, true, vv_clear_5); + + // test empty boundary ref + DeltaBoundaryRef boundaryEmpty; + boundaryEmpty.key = key; + checkDeltaAtVersion(resultEmpty, boundaryEmpty, 0, 2); + + // test empty boundary with clear + DeltaBoundaryRef boundaryEmptyWithClear; + boundaryEmptyWithClear.key = key; + boundaryEmptyWithClear.clearVersion = 5; + + // higher read version includes clear + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 0, 5); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 0, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 2, 5); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 2, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 5, 10); + checkDeltaAtVersion(resultEmptyWithClear, boundaryEmptyWithClear, 5, 5); + + // lower read version does not include clear + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 0, 4); + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 3, 4); + + // higher read version but also higher beginVersion does not include clear + checkDeltaAtVersion(resultEmpty, boundaryEmptyWithClear, 6, 10); + + // check values + DeltaBoundaryRef fullBoundary; + fullBoundary.key = key; + fullBoundary.values.push_back(ar, vv_a_3); + fullBoundary.values.push_back(ar, vv_clear_5); + fullBoundary.clearVersion = 5; + + checkDeltaAtVersion(resultEmpty, fullBoundary, 0, 2); + checkDeltaAtVersion(resultEmpty, fullBoundary, 6, 10); + checkDeltaAtVersion(resultEmpty, fullBoundary, 4, 4); + + checkDeltaAtVersion(resultSetA, fullBoundary, 0, 3); + checkDeltaAtVersion(resultSetA, fullBoundary, 3, 4); + + checkDeltaAtVersion(resultClearA, fullBoundary, 0, 5); + checkDeltaAtVersion(resultClearA, fullBoundary, 0, 10); + 
checkDeltaAtVersion(resultClearA, fullBoundary, 3, 5); + checkDeltaAtVersion(resultClearA, fullBoundary, 4, 5); + + return Void(); +} + +void checkSnapshotEmpty(const Value& serialized, Key begin, Key end, Optional cipherKeysCtx) { std::map result; Arena ar = loadSnapshotFile(serialized, KeyRangeRef(begin, end), result, cipherKeysCtx); ASSERT(result.empty()); } // endIdx is exclusive -void checkRead(const Standalone& snapshot, - const Value& serialized, - int beginIdx, - int endIdx, - Optional cipherKeysCtx) { +void checkSnapshotRead(const Standalone& snapshot, + const Value& serialized, + int beginIdx, + int endIdx, + Optional cipherKeysCtx) { ASSERT(beginIdx < endIdx); ASSERT(endIdx <= snapshot.size()); std::map result; @@ -1201,50 +1683,236 @@ void checkRead(const Standalone& snapshot, } } -TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { - // snapshot files are likely to have a non-trivial shared prefix since they're for a small contiguous key range - std::string sharedPrefix = deterministicRandom()->randomUniqueID().toString(); - int uidSize = sharedPrefix.size(); - int sharedPrefixLen = deterministicRandom()->randomInt(0, uidSize); - int targetKeyLength = deterministicRandom()->randomInt(4, uidSize); - sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_"; +namespace { - int targetValueLen = randomExp(0, 12); - int targetChunks = randomExp(0, 9); - int targetDataBytes = randomExp(0, 25); +size_t uidSize = 32; - std::unordered_set usedKeys; - Standalone data; - int totalDataBytes = 0; - const int maxKeyGenAttempts = 1000; - int nAttempts = 0; - while (totalDataBytes < targetDataBytes) { - int keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2); - keySize = std::min(keySize, uidSize); - std::string key = sharedPrefix + deterministicRandom()->randomUniqueID().toString().substr(0, keySize); - if (usedKeys.insert(key).second) { - int valueSize = deterministicRandom()->randomInt(targetValueLen / 2, targetValueLen 
* 3 / 2); - std::string value = deterministicRandom()->randomUniqueID().toString(); - if (value.size() > valueSize) { - value = value.substr(0, valueSize); - } - if (value.size() < valueSize) { - value += std::string(valueSize - value.size(), 'x'); - } +struct KeyValueGen { + Arena ar; + std::string sharedPrefix; + int targetKeyLength; + int targetValueLength; + std::set usedKeys; + std::vector usedKeysList; + double clearFrequency; + double clearUnsetFrequency; + double updateExistingKeyFrequency; + int minVersionIncrease; + int maxVersionIncrease; + int targetMutationsPerDelta; + KeyRange allRange; - data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key), ValueRef(value))); - totalDataBytes += key.size() + value.size(); - nAttempts = 0; - } else if (nAttempts > maxKeyGenAttempts) { - // KeySpace exhausted, avoid infinite loop - break; + Version version = 0; + + // encryption/compression settings + // TODO: possibly different cipher keys or meta context per file? + Optional cipherKeys; + Optional compressFilter; + + KeyValueGen() { + sharedPrefix = deterministicRandom()->randomUniqueID().toString(); + ASSERT(sharedPrefix.size() == uidSize); + int sharedPrefixLen = deterministicRandom()->randomInt(0, uidSize); + targetKeyLength = deterministicRandom()->randomInt(4, uidSize); + sharedPrefix = sharedPrefix.substr(0, sharedPrefixLen) + "_"; + targetValueLength = deterministicRandom()->randomExp(0, 12); + allRange = KeyRangeRef(StringRef(sharedPrefix), LiteralStringRef("\xff")); + + if (deterministicRandom()->coinflip()) { + clearFrequency = 0.0; + clearUnsetFrequency = 0.0; } else { - // Keep exploring the KeySpace - nAttempts++; + clearFrequency = deterministicRandom()->random01() / 2; + // clearing an unset value has no effect on the results, we mostly just want to make sure the format doesn't + // barf + clearUnsetFrequency = deterministicRandom()->random01() / 10; + } + if (deterministicRandom()->random01() < 0.2) { + // no updates, only new writes + 
updateExistingKeyFrequency = 0.0; + } else { + updateExistingKeyFrequency = deterministicRandom()->random01(); + } + if (deterministicRandom()->coinflip()) { + // sequential versions + minVersionIncrease = 1; + maxVersionIncrease = 2; + } else { + minVersionIncrease = deterministicRandom()->randomExp(0, 25); + maxVersionIncrease = minVersionIncrease + deterministicRandom()->randomExp(0, 25); + } + if (deterministicRandom()->coinflip()) { + targetMutationsPerDelta = 1; + } else { + targetMutationsPerDelta = deterministicRandom()->randomExp(1, 5); + } + + if (deterministicRandom()->coinflip()) { + cipherKeys = getCipherKeysCtx(ar); + } + if (deterministicRandom()->coinflip()) { +#ifdef ZLIB_LIB_SUPPORTED + compressFilter = CompressionFilter::GZIP; +#else + compressFilter = CompressionFilter::NONE; +#endif } } + Optional newKey() { + for (int nAttempt = 0; nAttempt < 1000; nAttempt++) { + size_t keySize = deterministicRandom()->randomInt(targetKeyLength / 2, targetKeyLength * 3 / 2); + keySize = std::min(keySize, uidSize); + std::string key = sharedPrefix + deterministicRandom()->randomUniqueID().toString().substr(0, keySize); + if (usedKeys.insert(key).second) { + StringRef k(ar, key); + usedKeysList.push_back(k); + return k; + } + } + return {}; + } + + StringRef value() { + int valueSize = deterministicRandom()->randomInt(targetValueLength / 2, targetValueLength * 3 / 2); + std::string value = deterministicRandom()->randomUniqueID().toString(); + if (value.size() > valueSize) { + value = value.substr(0, valueSize); + } + if (value.size() < valueSize) { + // repeated string so it's compressible + value += std::string(valueSize - value.size(), 'x'); + } + return StringRef(ar, value); + } + + KeyRef randomUsedKey() const { return usedKeysList[deterministicRandom()->randomInt(0, usedKeysList.size())]; } + + KeyRange randomKeyRange() const { + ASSERT(!usedKeysList.empty()); + Key begin = randomUsedKey(); + if (deterministicRandom()->coinflip()) { + begin = 
keyAfter(begin); + } + if (usedKeysList.size() == 1) { + return KeyRange(KeyRangeRef(begin, keyAfter(begin))); + } else { + Key end = begin; + while (end == begin) { + end = randomUsedKey(); + } + if (deterministicRandom()->coinflip()) { + end = keyAfter(end); + } + if (begin < end) { + return KeyRangeRef(begin, end); + } else { + return KeyRangeRef(end, begin); + } + } + } + + StringRef keyForUpdate(double probUseExisting) { + if (!usedKeysList.empty() && deterministicRandom()->random01() < probUseExisting) { + return randomUsedKey(); + } else { + auto k = newKey(); + if (k.present()) { + return k.get(); + } else { + // use existing key instead + ASSERT(!usedKeysList.empty()); + return randomUsedKey(); + } + } + } + + Version nextVersion() { + Version jump = deterministicRandom()->randomInt(minVersionIncrease, maxVersionIncrease); + version += jump; + return version; + } + + MutationRef newMutation() { + if (deterministicRandom()->random01() < clearFrequency) { + // The algorithm for generating clears of varying sizes is, to generate clear sizes based on an exponential + // distribution, such that the expected value of the clear size is 2. 
+ int clearWidth = 1; + while (clearWidth < usedKeys.size() && deterministicRandom()->coinflip()) { + clearWidth *= 2; + } + bool clearPastEnd = deterministicRandom()->coinflip(); + if (clearPastEnd) { + clearWidth--; + } + StringRef begin = keyForUpdate(1.0 - clearUnsetFrequency); + std::string beginStr = begin.toString(); + auto it = usedKeys.find(beginStr); + ASSERT(it != usedKeys.end()); + while (it != usedKeys.end() && clearWidth > 0) { + it++; + clearWidth--; + } + if (it == usedKeys.end()) { + it--; + clearPastEnd = true; + } + std::string endKey = *it; + if (clearPastEnd) { + Key end = keyAfter(StringRef(ar, endKey)); + ar.dependsOn(end.arena()); + return MutationRef(MutationRef::ClearRange, begin, end); + } else { + // clear up to end + return MutationRef(MutationRef::ClearRange, begin, StringRef(ar, endKey)); + } + + } else { + return MutationRef(MutationRef::SetValue, keyForUpdate(updateExistingKeyFrequency), value()); + } + } + + MutationsAndVersionRef newDelta() { + Version v = nextVersion(); + int mutationCount = deterministicRandom()->randomInt(1, targetMutationsPerDelta * 2); + MutationsAndVersionRef ret(v, v); + for (int i = 0; i < mutationCount; i++) { + ret.mutations.push_back(ar, newMutation()); + } + return ret; + } +}; + +} // namespace + +Standalone genSnapshot(KeyValueGen& kvGen, int targetDataBytes) { + Standalone data; + int totalDataBytes = 0; + while (totalDataBytes < targetDataBytes) { + Optional key = kvGen.newKey(); + if (!key.present()) { + CODE_PROBE(true, "snapshot unit test keyspace full"); + break; + } + StringRef value = kvGen.value(); + + data.push_back_deep(data.arena(), KeyValueRef(KeyRef(key.get()), ValueRef(value))); + totalDataBytes += key.get().size() + value.size(); + } + std::sort(data.begin(), data.end(), KeyValueRef::OrderByKey()); + return data; +} + +TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { + // snapshot files are likely to have a non-trivial shared prefix since they're for a small contiguous key 
range + KeyValueGen kvGen; + + int targetChunks = deterministicRandom()->randomExp(0, 9); + int targetDataBytes = deterministicRandom()->randomExp(0, 25); + int targetChunkSize = targetDataBytes / targetChunks; + + Standalone data = genSnapshot(kvGen, targetDataBytes); int maxExp = 0; while (1 << maxExp < data.size()) { @@ -1257,24 +1925,9 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { ASSERT(data[i].key < data[i + 1].key); } - fmt::print( - "Constructing snapshot with {0} rows, {1} bytes, and {2} chunks\n", data.size(), totalDataBytes, targetChunks); + fmt::print("Constructing snapshot with {0} rows, {1} chunks\n", data.size(), targetChunks); - Optional cipherKeysCtx = Optional(); - Arena arena; - if (deterministicRandom()->coinflip()) { - cipherKeysCtx = getCipherKeysCtx(arena); - } - - Optional compressFilter; - if (deterministicRandom()->coinflip()) { -#ifdef ZLIB_LIB_SUPPORTED - compressFilter = CompressionFilter::GZIP; -#else - compressFilter = CompressionFilter::NONE; -#endif - } - Value serialized = serializeChunkedSnapshot(data, targetChunks, compressFilter, cipherKeysCtx); + Value serialized = serializeChunkedSnapshot(data, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys); fmt::print("Snapshot serialized! 
{0} bytes\n", serialized.size()); @@ -1285,31 +1938,288 @@ TEST_CASE("/blobgranule/files/snapshotFormatUnitTest") { fmt::print("Initial read starting\n"); - checkRead(data, serialized, 0, data.size(), cipherKeysCtx); + checkSnapshotRead(data, serialized, 0, data.size(), kvGen.cipherKeys); fmt::print("Initial read complete\n"); if (data.size() > 1) { for (int i = 0; i < std::min(100, data.size() * 2); i++) { - int width = randomExp(0, maxExp); + int width = deterministicRandom()->randomExp(0, maxExp); ASSERT(width <= data.size()); int start = deterministicRandom()->randomInt(0, data.size() - width); - checkRead(data, serialized, start, start + width, cipherKeysCtx); + checkSnapshotRead(data, serialized, start, start + width, kvGen.cipherKeys); } fmt::print("Doing empty checks\n"); int randomIdx = deterministicRandom()->randomInt(0, data.size() - 1); - checkEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, cipherKeysCtx); + checkSnapshotEmpty(serialized, keyAfter(data[randomIdx].key), data[randomIdx + 1].key, kvGen.cipherKeys); } else { fmt::print("Doing empty checks\n"); } - checkEmpty(serialized, normalKeys.begin, data.front().key, cipherKeysCtx); - checkEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), cipherKeysCtx); - checkEmpty(serialized, keyAfter(data.back().key), normalKeys.end, cipherKeysCtx); - checkEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, cipherKeysCtx); + checkSnapshotEmpty(serialized, normalKeys.begin, data.front().key, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, normalKeys.begin, LiteralStringRef("\x00"), kvGen.cipherKeys); + checkSnapshotEmpty(serialized, keyAfter(data.back().key), normalKeys.end, kvGen.cipherKeys); + checkSnapshotEmpty(serialized, LiteralStringRef("\xfe"), normalKeys.end, kvGen.cipherKeys); fmt::print("Snapshot format test done!\n"); return Void(); } + +void checkDeltaRead(const KeyValueGen& kvGen, + const KeyRangeRef& range, + Version beginVersion, + Version readVersion, 
+ const Standalone& data, + StringRef* serialized) { + // expected answer + std::map expectedData; + Version lastFileEndVersion = 0; + + applyDeltasByVersion(data, range, beginVersion, readVersion, lastFileEndVersion, expectedData); + + // actual answer + std::string filename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); + Standalone chunk; + // TODO need to add cipher keys meta + chunk.deltaFiles.emplace_back_deep(chunk.arena(), filename, 0, serialized->size(), serialized->size()); + chunk.cipherKeysCtx = kvGen.cipherKeys; + chunk.keyRange = kvGen.allRange; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = invalidVersion; + + RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, {}, serialized); + + ASSERT(expectedData.size() == actualData.size()); + int i = 0; + for (auto& it : expectedData) { + ASSERT(it.first == actualData[i].key); + ASSERT(it.second == actualData[i].value); + i++; + } +} + +static std::tuple randomizeKeyAndVersions(const KeyValueGen& kvGen, + const Standalone data) { + // either randomize just keyrange, just version range, or both + double rand = deterministicRandom()->randomInt(0, 3); + bool randomizeKeyRange = rand == 0 || rand == 2; + bool randomizeVersionRange = rand == 1 || rand == 2; + KeyRange readRange = kvGen.allRange; + Version beginVersion = 0; + Version readVersion = data.back().version; + + if (randomizeKeyRange) { + readRange = kvGen.randomKeyRange(); + } + + if (randomizeVersionRange) { + if (deterministicRandom()->coinflip()) { + beginVersion = 0; + } else { + beginVersion = data[deterministicRandom()->randomInt(0, data.size())].version; + beginVersion += deterministicRandom()->randomInt(0, 3) - 1; // randomize between -1, 0, and +1 + } + readVersion = data[deterministicRandom()->randomInt(0, data.size())].version; + readVersion += deterministicRandom()->randomInt(0, 3) - 1; // randomize between 
-1, 0, and +1 + if (readVersion < beginVersion) { + std::swap(beginVersion, readVersion); + } + } + + // TODO randomize begin and read version to sometimes +/- 1 and readRange begin and end to keyAfter sometimes + return { readRange, beginVersion, readVersion }; +} + +Standalone genDeltas(KeyValueGen& kvGen, int targetBytes) { + Standalone data; + int totalDataBytes = 0; + while (totalDataBytes < targetBytes) { + data.push_back(data.arena(), kvGen.newDelta()); + totalDataBytes += data.back().expectedSize(); + } + return data; +} + +TEST_CASE("/blobgranule/files/deltaFormatUnitTest") { + KeyValueGen kvGen; + + int targetChunks = deterministicRandom()->randomExp(0, 8); + int targetDataBytes = deterministicRandom()->randomExp(0, 21); + + int targetChunkSize = targetDataBytes / targetChunks; + + Standalone data = genDeltas(kvGen, targetDataBytes); + + fmt::print("Deltas ({0})\n", data.size()); + Value serialized = + serializeChunkedDeltaFile(data, kvGen.allRange, targetChunkSize, kvGen.compressFilter, kvGen.cipherKeys); + + // check whole file + checkDeltaRead(kvGen, kvGen.allRange, 0, data.back().version, data, &serialized); + + for (int i = 0; i < std::min((size_t)100, kvGen.usedKeysList.size() * data.size()); i++) { + auto params = randomizeKeyAndVersions(kvGen, data); + checkDeltaRead(kvGen, std::get<0>(params), std::get<1>(params), std::get<2>(params), data, &serialized); + } + + return Void(); +} + +void checkGranuleRead(const KeyValueGen& kvGen, + const KeyRangeRef& range, + Version beginVersion, + Version readVersion, + const Standalone& snapshotData, + const Standalone& deltaData, + const Value& serializedSnapshot, + const std::vector>& serializedDeltas, + const Standalone& inMemoryDeltas) { + // expected answer + std::map expectedData; + if (beginVersion == 0) { + for (auto& it : snapshotData) { + if (range.contains(it.key)) { + expectedData.insert({ it.key, it.value }); + } + } + } + Version lastFileEndVersion = 0; + applyDeltasByVersion(deltaData, range, 
beginVersion, readVersion, lastFileEndVersion, expectedData); + + // actual answer + Standalone chunk; + if (beginVersion == 0) { + std::string snapshotFilename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), 0, ".snapshot"); + chunk.snapshotFile = BlobFilePointerRef( + chunk.arena(), snapshotFilename, 0, serializedSnapshot.size(), serializedSnapshot.size()); + } + int deltaIdx = 0; + while (deltaIdx < serializedDeltas.size() && serializedDeltas[deltaIdx].first < beginVersion) { + deltaIdx++; + } + std::vector deltaPtrsVector; + while (deltaIdx < serializedDeltas.size()) { + std::string deltaFilename = randomBGFilename( + deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUniqueID(), readVersion, ".delta"); + size_t fsize = serializedDeltas[deltaIdx].second.size(); + chunk.deltaFiles.emplace_back_deep(chunk.arena(), deltaFilename, 0, fsize, fsize); + deltaPtrsVector.push_back(serializedDeltas[deltaIdx].second); + + if (serializedDeltas[deltaIdx].first >= readVersion) { + break; + } + deltaIdx++; + } + StringRef deltaPtrs[deltaPtrsVector.size()]; + for (int i = 0; i < deltaPtrsVector.size(); i++) { + deltaPtrs[i] = deltaPtrsVector[i]; + } + + // add in memory deltas + chunk.arena().dependsOn(inMemoryDeltas.arena()); + for (auto& it : inMemoryDeltas) { + if (beginVersion <= it.version && it.version <= readVersion) { + chunk.newDeltas.push_back(chunk.arena(), it); + } + } + + // TODO need to add cipher keys meta + chunk.cipherKeysCtx = kvGen.cipherKeys; + chunk.keyRange = kvGen.allRange; + chunk.includedVersion = readVersion; + chunk.snapshotVersion = (beginVersion == 0) ? 
0 : invalidVersion; + + Optional snapshotPtr; + if (beginVersion == 0) { + snapshotPtr = serializedSnapshot; + } + RangeResult actualData = materializeBlobGranule(chunk, range, beginVersion, readVersion, snapshotPtr, deltaPtrs); + + ASSERT(expectedData.size() == actualData.size()); + int i = 0; + for (auto& it : expectedData) { + ASSERT(it.first == actualData[i].key); + ASSERT(it.second == actualData[i].value); + i++; + } +} + +TEST_CASE("/blobgranule/files/granuleReadUnitTest") { + KeyValueGen kvGen; + + int targetSnapshotChunks = deterministicRandom()->randomExp(0, 9); + int targetDeltaChunks = deterministicRandom()->randomExp(0, 8); + int targetDataBytes = deterministicRandom()->randomExp(12, 25); + int targetSnapshotBytes = (int)(deterministicRandom()->randomInt(0, targetDataBytes)); + int targetDeltaBytes = targetDataBytes - targetSnapshotBytes; + + int targetSnapshotChunkSize = targetSnapshotBytes / targetSnapshotChunks; + int targetDeltaChunkSize = targetDeltaBytes / targetDeltaChunks; + + Standalone snapshotData = genSnapshot(kvGen, targetSnapshotBytes); + Standalone deltaData = genDeltas(kvGen, targetDeltaBytes); + fmt::print("{0} snapshot rows and {1} deltas\n", snapshotData.size(), deltaData.size()); + + Value serializedSnapshot = + serializeChunkedSnapshot(snapshotData, targetSnapshotChunkSize, kvGen.compressFilter, kvGen.cipherKeys); + + // split deltas up across multiple files + int deltaFiles = std::min(deltaData.size(), deterministicRandom()->randomInt(1, 21)); + int deltasPerFile = deltaData.size() / deltaFiles + 1; + std::vector> serializedDeltaFiles; + Standalone inMemoryDeltas; + serializedDeltaFiles.reserve(deltaFiles); + for (int i = 0; i < deltaFiles; i++) { + Standalone fileData; + int j; + for (j = i * deltasPerFile; j < (i + 1) * deltasPerFile && j < deltaData.size(); j++) { + fileData.push_back_deep(fileData.arena(), deltaData[j]); + } + if (!fileData.empty()) { + if (j == deltaData.size() && deterministicRandom()->coinflip()) { + // if 
it's the last set of deltas, sometimes make them the memory deltas instead + inMemoryDeltas = fileData; + } else { + Value serializedDelta = serializeChunkedDeltaFile( + fileData, kvGen.allRange, targetDeltaChunkSize, kvGen.compressFilter, kvGen.cipherKeys); + serializedDeltaFiles.emplace_back(fileData.back().version, serializedDelta); + } + } + } + + fmt::print("Full test\n"); + checkGranuleRead(kvGen, + kvGen.allRange, + 0, + deltaData.back().version, + snapshotData, + deltaData, + serializedSnapshot, + serializedDeltaFiles, + inMemoryDeltas); + + for (int i = 0; i < std::min(100, 5 + snapshotData.size() * deltaData.size()); i++) { + auto params = randomizeKeyAndVersions(kvGen, deltaData); + fmt::print("Partial test {0}: [{1} - {2}) @ {3} - {4}\n", + i, + std::get<0>(params).begin.printable(), + std::get<0>(params).end.printable(), + std::get<1>(params), + std::get<2>(params)); + checkGranuleRead(kvGen, + std::get<0>(params), + std::get<1>(params), + std::get<2>(params), + snapshotData, + deltaData, + serializedSnapshot, + serializedDeltaFiles, + inMemoryDeltas); + } + + return Void(); +} \ No newline at end of file diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 041c4495b0..02a570ed86 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -3234,13 +3234,26 @@ TenantInfo TransactionState::getTenantInfo() { } else if (!t.present()) { return TenantInfo(); } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) { - throw tenants_disabled(); + // If we are running provisional proxies, we allow a tenant request to go through since we don't know the tenant + // mode. Such a transaction would not be allowed to commit without enabling provisional commits because either + // the commit proxies will be provisional or the read version will be too old. 
+ if (!cx->clientInfo->get().grvProxies.empty() && !cx->clientInfo->get().grvProxies[0].provisional) { + throw tenants_disabled(); + } else { + ASSERT(!useProvisionalProxies); + } } ASSERT(tenantId != TenantInfo::INVALID_TENANT); return TenantInfo(t.get(), tenantId); } +// Returns the tenant used in this transaction. If the tenant is unset and raw access isn't specified, then the default +// tenant from DatabaseContext is applied to this transaction (note: the default tenant is typically unset, but in +// simulation could be something different). +// +// This function should not be called in the transaction constructor or in the setOption function to allow a user the +// opportunity to set raw access. Optional const& TransactionState::tenant() { if (tenantSet) { return tenant_; @@ -3253,6 +3266,9 @@ Optional const& TransactionState::tenant() { } } +// Returns true if the tenant has been set, but does not cause default tenant resolution. This is useful in setOption +// (where we do not want to call tenant()) if we want to enforce that an option not be set on a Tenant transaction (e.g. +// for raw access). 
bool TransactionState::hasTenant() const { return tenantSet && tenant_.present(); } @@ -6570,6 +6586,11 @@ void Transaction::setOption(FDBTransactionOptions::Option option, OptionalhasTenant()) { + Error e = invalid_option(); + TraceEvent(SevWarn, "TenantTransactionUseProvisionalProxies").error(e).detail("Tenant", trState->tenant()); + throw e; + } trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES; trState->useProvisionalProxies = UseProvisionalProxies::True; break; @@ -9388,11 +9409,20 @@ Future DatabaseContext::getChangeFeedStream(Reference resu Reference::addRef(this), results, rangeID, begin, end, range, replyBufferSize, canReadPopped); } -ACTOR Future> singleLocationOverlappingChangeFeeds( - Database cx, - Reference location, - KeyRangeRef range, - Version minVersion) { +Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const { + Version v = invalidVersion; + for (auto& it : feedMetadataVersions) { + if (it.second > v && it.first.intersects(range)) { + v = it.second; + } + } + return v; +} + +ACTOR Future singleLocationOverlappingChangeFeeds(Database cx, + Reference location, + KeyRangeRef range, + Version minVersion) { state OverlappingChangeFeedsRequest req; req.range = range; req.minVersion = minVersion; @@ -9404,16 +9434,16 @@ ACTOR Future> singleLocationOverlappingC TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, cx->enableLocalityLoadBalance ? 
&cx->queueModel : nullptr)); - return rep.rangeIds; + return rep; } bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) { - return i.rangeId < j.rangeId; + return i.feedId < j.feedId; } -ACTOR Future> getOverlappingChangeFeedsActor(Reference db, - KeyRangeRef range, - Version minVersion) { +ACTOR Future getOverlappingChangeFeedsActor(Reference db, + KeyRangeRef range, + Version minVersion) { state Database cx(db); state Span span("NAPI:GetOverlappingChangeFeeds"_loc); @@ -9439,19 +9469,33 @@ ACTOR Future> getOverlappingChangeFeedsA throw all_alternatives_failed(); } - state std::vector>> allOverlappingRequests; + state std::vector> allOverlappingRequests; for (auto& it : locations) { allOverlappingRequests.push_back( singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion)); } wait(waitForAll(allOverlappingRequests)); - std::vector result; - for (auto& it : allOverlappingRequests) { - result.insert(result.end(), it.get().begin(), it.get().end()); + OverlappingChangeFeedsInfo result; + std::unordered_map latestFeedMetadata; + for (int i = 0; i < locations.size(); i++) { + result.arena.dependsOn(allOverlappingRequests[i].get().arena); + result.arena.dependsOn(locations[i].range.arena()); + result.feedMetadataVersions.push_back( + { locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion }); + for (auto& it : allOverlappingRequests[i].get().feeds) { + auto res = latestFeedMetadata.insert({ it.feedId, it }); + if (!res.second) { + CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version"); + if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) { + res.first->second = it; + } + } + } + } + for (auto& it : latestFeedMetadata) { + result.feeds.push_back(result.arena, it.second); } - std::sort(result.begin(), result.end(), compareChangeFeedResult); - result.resize(std::unique(result.begin(), result.end()) - result.begin()); return result; } 
catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { @@ -9464,8 +9508,7 @@ ACTOR Future> getOverlappingChangeFeedsA } } -Future> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, - Version minVersion) { +Future DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) { return getOverlappingChangeFeedsActor(Reference::addRef(this), range, minVersion); } @@ -9589,7 +9632,7 @@ ACTOR Future purgeBlobGranulesActor(Reference db, state bool loadedTenantPrefix = false; // FIXME: implement force - if (!force) { + if (force) { throw unsupported_operation(); } diff --git a/fdbclient/ServerKnobs.cpp b/fdbclient/ServerKnobs.cpp index 8ca2894e4b..58478c698e 100644 --- a/fdbclient/ServerKnobs.cpp +++ b/fdbclient/ServerKnobs.cpp @@ -701,8 +701,9 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi init( FETCH_BLOCK_BYTES, 2e6 ); init( FETCH_KEYS_PARALLELISM_BYTES, 4e6 ); if( randomize && BUGGIFY ) FETCH_KEYS_PARALLELISM_BYTES = 3e6; init( FETCH_KEYS_PARALLELISM, 2 ); + init( FETCH_KEYS_PARALLELISM_FULL, 10 ); init( FETCH_KEYS_LOWER_PRIORITY, 0 ); - init( FETCH_CHANGEFEED_PARALLELISM, 2 ); + init( FETCH_CHANGEFEED_PARALLELISM, 4 ); init( SERVE_FETCH_CHECKPOINT_PARALLELISM, 4 ); init( BUGGIFY_BLOCK_BYTES, 10000 ); init( STORAGE_RECOVERY_VERSION_LAG_LIMIT, 2 * MAX_READ_TRANSACTION_LIFE_VERSIONS ); @@ -907,11 +908,13 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi // BlobGranuleVerify* simulation tests use "blobRangeKeys", BlobGranuleCorrectness* use "tenant", default in real clusters is "tenant" init( BG_RANGE_SOURCE, "tenant" ); // BlobGranuleVerify* simulation tests use "knobs", BlobGranuleCorrectness* use "tenant", default in real clusters is "knobs" + bool buggifyMediumGranules = simulationMediumShards || (randomize && BUGGIFY); init( BG_METADATA_SOURCE, "knobs" ); - init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 
); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (simulationMediumShards || (randomize && BUGGIFY) ) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; - init( BG_SNAPSHOT_FILE_TARGET_CHUNKS, 100 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNKS = 1 << deterministicRandom()->randomInt(0, 8); + init( BG_SNAPSHOT_FILE_TARGET_BYTES, 10000000 ); if( buggifySmallShards ) BG_SNAPSHOT_FILE_TARGET_BYTES = 100000; else if (buggifyMediumGranules) BG_SNAPSHOT_FILE_TARGET_BYTES = 1000000; + init( BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES = BG_SNAPSHOT_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 8)); init( BG_DELTA_BYTES_BEFORE_COMPACT, BG_SNAPSHOT_FILE_TARGET_BYTES/2 ); init( BG_DELTA_FILE_TARGET_BYTES, BG_DELTA_BYTES_BEFORE_COMPACT/10 ); + init( BG_DELTA_FILE_TARGET_CHUNK_BYTES, 64*1024 ); if ( randomize && BUGGIFY ) BG_DELTA_FILE_TARGET_CHUNK_BYTES = BG_DELTA_FILE_TARGET_BYTES / (1 << deterministicRandom()->randomInt(0, 7)); init( BG_MAX_SPLIT_FANOUT, 10 ); if( randomize && BUGGIFY ) BG_MAX_SPLIT_FANOUT = deterministicRandom()->randomInt(5, 15); init( BG_MAX_MERGE_FANIN, 10 ); if( randomize && BUGGIFY ) BG_MAX_MERGE_FANIN = deterministicRandom()->randomInt(2, 15); init( BG_HOT_SNAPSHOT_VERSIONS, 5000000 ); diff --git a/fdbclient/include/fdbclient/BlobGranuleCommon.h b/fdbclient/include/fdbclient/BlobGranuleCommon.h index 9c65aa19d3..bf0d156b02 100644 --- a/fdbclient/include/fdbclient/BlobGranuleCommon.h +++ b/fdbclient/include/fdbclient/BlobGranuleCommon.h @@ -46,6 +46,7 @@ struct GranuleSnapshot : VectorRef { } }; +// Deltas in version order struct GranuleDeltas : VectorRef { constexpr static FileIdentifier file_identifier = 8563013; diff --git a/fdbclient/include/fdbclient/BlobGranuleFiles.h b/fdbclient/include/fdbclient/BlobGranuleFiles.h index 7ee7a62bd4..80877bbf7c 100644 --- a/fdbclient/include/fdbclient/BlobGranuleFiles.h +++ 
b/fdbclient/include/fdbclient/BlobGranuleFiles.h @@ -27,11 +27,15 @@ #include "flow/CompressionUtils.h" Value serializeChunkedSnapshot(Standalone snapshot, - int chunks, + int chunkSize, Optional compressFilter, - Optional cipherKeysCtx = Optional()); + Optional cipherKeysCtx = {}); -// FIXME: support sorted and chunked delta files +Value serializeChunkedDeltaFile(Standalone deltas, + const KeyRangeRef& fileRange, + int chunkSize, + Optional compressFilter, + Optional cipherKeysCtx = {}); ErrorOr loadAndMaterializeBlobGranules(const Standalone>& files, const KeyRangeRef& keyRange, diff --git a/fdbclient/include/fdbclient/DatabaseContext.h b/fdbclient/include/fdbclient/DatabaseContext.h index e13d1d4986..e4875b9d7c 100644 --- a/fdbclient/include/fdbclient/DatabaseContext.h +++ b/fdbclient/include/fdbclient/DatabaseContext.h @@ -207,6 +207,16 @@ struct KeyRangeLocationInfo { : tenantEntry(tenantEntry), range(range), locations(locations) {} }; +struct OverlappingChangeFeedsInfo { + Arena arena; + VectorRef feeds; + // would prefer to use key range map but it complicates copy/move constructors + std::vector> feedMetadataVersions; + + // for a feed that wasn't present, returns the metadata version it would have been fetched at. 
+ Version getFeedMetadataVersion(const KeyRangeRef& feedRange) const; +}; + class DatabaseContext : public ReferenceCounted, public FastAllocated, NonCopyable { public: static DatabaseContext* allocateOnForeignThread() { @@ -361,7 +371,7 @@ public: int replyBufferSize = -1, bool canReadPopped = true); - Future> getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); + Future getOverlappingChangeFeeds(KeyRangeRef ranges, Version minVersion); Future popChangeFeedMutations(Key rangeID, Version version); Future purgeBlobGranules(KeyRange keyRange, diff --git a/fdbclient/include/fdbclient/ServerKnobs.h b/fdbclient/include/fdbclient/ServerKnobs.h index 4579088afe..0c84e28928 100644 --- a/fdbclient/include/fdbclient/ServerKnobs.h +++ b/fdbclient/include/fdbclient/ServerKnobs.h @@ -659,6 +659,7 @@ public: int FETCH_BLOCK_BYTES; int FETCH_KEYS_PARALLELISM_BYTES; int FETCH_KEYS_PARALLELISM; + int FETCH_KEYS_PARALLELISM_FULL; int FETCH_KEYS_LOWER_PRIORITY; int FETCH_CHANGEFEED_PARALLELISM; int SERVE_FETCH_CHECKPOINT_PARALLELISM; @@ -887,8 +888,9 @@ public: std::string BG_METADATA_SOURCE; int BG_SNAPSHOT_FILE_TARGET_BYTES; - int BG_SNAPSHOT_FILE_TARGET_CHUNKS; + int BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES; int BG_DELTA_FILE_TARGET_BYTES; + int BG_DELTA_FILE_TARGET_CHUNK_BYTES; int BG_DELTA_BYTES_BEFORE_COMPACT; int BG_MAX_SPLIT_FANOUT; int BG_MAX_MERGE_FANIN; diff --git a/fdbclient/include/fdbclient/StorageServerInterface.h b/fdbclient/include/fdbclient/StorageServerInterface.h index c873703082..8ec8e1009a 100644 --- a/fdbclient/include/fdbclient/StorageServerInterface.h +++ b/fdbclient/include/fdbclient/StorageServerInterface.h @@ -970,39 +970,51 @@ struct FetchCheckpointKeyValuesRequest { }; struct OverlappingChangeFeedEntry { - Key rangeId; - KeyRange range; + KeyRef feedId; + KeyRangeRef range; Version emptyVersion; Version stopVersion; + Version feedMetadataVersion; bool operator==(const OverlappingChangeFeedEntry& r) const { - return rangeId == r.rangeId && range 
== r.range && emptyVersion == r.emptyVersion && - stopVersion == r.stopVersion; + return feedId == r.feedId && range == r.range && emptyVersion == r.emptyVersion && + stopVersion == r.stopVersion && feedMetadataVersion == r.feedMetadataVersion; } OverlappingChangeFeedEntry() {} - OverlappingChangeFeedEntry(Key const& rangeId, KeyRange const& range, Version emptyVersion, Version stopVersion) - : rangeId(rangeId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion) {} + OverlappingChangeFeedEntry(KeyRef const& feedId, + KeyRangeRef const& range, + Version emptyVersion, + Version stopVersion, + Version feedMetadataVersion) + : feedId(feedId), range(range), emptyVersion(emptyVersion), stopVersion(stopVersion), + feedMetadataVersion(feedMetadataVersion) {} + + OverlappingChangeFeedEntry(Arena& arena, const OverlappingChangeFeedEntry& rhs) + : feedId(arena, rhs.feedId), range(arena, rhs.range), emptyVersion(rhs.emptyVersion), + stopVersion(rhs.stopVersion), feedMetadataVersion(rhs.feedMetadataVersion) {} template void serialize(Ar& ar) { - serializer(ar, rangeId, range, emptyVersion, stopVersion); + serializer(ar, feedId, range, emptyVersion, stopVersion, feedMetadataVersion); } }; struct OverlappingChangeFeedsReply { constexpr static FileIdentifier file_identifier = 11815134; - std::vector rangeIds; + VectorRef feeds; bool cached; Arena arena; + Version feedMetadataVersion; - OverlappingChangeFeedsReply() : cached(false) {} - explicit OverlappingChangeFeedsReply(std::vector const& rangeIds) - : rangeIds(rangeIds), cached(false) {} + OverlappingChangeFeedsReply() : cached(false), feedMetadataVersion(invalidVersion) {} + explicit OverlappingChangeFeedsReply(VectorRef const& feeds, + Version feedMetadataVersion) + : feeds(feeds), cached(false), feedMetadataVersion(feedMetadataVersion) {} template void serialize(Ar& ar) { - serializer(ar, rangeIds, arena); + serializer(ar, feeds, arena, feedMetadataVersion); } }; diff --git a/fdbrpc/include/fdbrpc/simulator.h 
b/fdbrpc/include/fdbrpc/simulator.h index 67155490cd..5c304bd261 100644 --- a/fdbrpc/include/fdbrpc/simulator.h +++ b/fdbrpc/include/fdbrpc/simulator.h @@ -23,6 +23,7 @@ #include "flow/ProtocolVersion.h" #include #include +#include #pragma once #include "flow/flow.h" @@ -469,6 +470,8 @@ public: bool setDiffProtocol; // true if a process with a different protocol version has been started bool allowStorageMigrationTypeChange = false; + double injectTargetedSSRestartTime = std::numeric_limits::max(); + double injectSSDelayTime = std::numeric_limits::max(); flowGlobalType global(int id) const final { return getCurrentProcess()->global(id); }; void setGlobal(size_t id, flowGlobalType v) final { getCurrentProcess()->setGlobal(id, v); }; diff --git a/fdbserver/BlobGranuleValidation.actor.cpp b/fdbserver/BlobGranuleValidation.actor.cpp index af395bf71d..3168951a46 100644 --- a/fdbserver/BlobGranuleValidation.actor.cpp +++ b/fdbserver/BlobGranuleValidation.actor.cpp @@ -143,30 +143,34 @@ bool compareFDBAndBlob(RangeResult fdb, } } - printf("Chunks:\n"); - for (auto& chunk : blob.second) { - printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str()); - - printf(" SnapshotFile:\n %s\n", - chunk.snapshotFile.present() ? 
chunk.snapshotFile.get().toString().c_str() : ""); - printf(" DeltaFiles:\n"); - for (auto& df : chunk.deltaFiles) { - printf(" %s\n", df.toString().c_str()); - } - printf(" Deltas: (%d)", chunk.newDeltas.size()); - if (chunk.newDeltas.size() > 0) { - fmt::print(" with version [{0} - {1}]", - chunk.newDeltas[0].version, - chunk.newDeltas[chunk.newDeltas.size() - 1].version); - } - fmt::print(" IncludedVersion: {}\n", chunk.includedVersion); - } - printf("\n"); + printGranuleChunks(blob.second); } } return correct; } +void printGranuleChunks(const Standalone>& chunks) { + printf("Chunks:\n"); + for (auto& chunk : chunks) { + printf("[%s - %s)\n", chunk.keyRange.begin.printable().c_str(), chunk.keyRange.end.printable().c_str()); + + printf(" SnapshotFile:\n %s\n", + chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : ""); + printf(" DeltaFiles:\n"); + for (auto& df : chunk.deltaFiles) { + printf(" %s\n", df.toString().c_str()); + } + printf(" Deltas: (%d)", chunk.newDeltas.size()); + if (chunk.newDeltas.size() > 0) { + fmt::print(" with version [{0} - {1}]", + chunk.newDeltas[0].version, + chunk.newDeltas[chunk.newDeltas.size() - 1].version); + } + fmt::print(" IncludedVersion: {}\n", chunk.includedVersion); + } + printf("\n"); +} + ACTOR Future clearAndAwaitMerge(Database cx, KeyRange range) { // clear key range and check whether it is merged or not, repeatedly state Transaction tr(cx); diff --git a/fdbserver/BlobManager.actor.cpp b/fdbserver/BlobManager.actor.cpp index 8fa712fa86..446ec2e390 100644 --- a/fdbserver/BlobManager.actor.cpp +++ b/fdbserver/BlobManager.actor.cpp @@ -52,6 +52,7 @@ */ #define BM_DEBUG false +#define BM_PURGE_DEBUG false void handleClientBlobRange(KeyRangeMap* knownBlobRanges, Arena& ar, @@ -1649,7 +1650,9 @@ ACTOR Future persistMergeGranulesDone(Reference bmData, state Key lockKey = blobGranuleLockKeyFor(parentRange); state Future> oldLockFuture = tr->get(lockKey); - wait(updateChangeFeed(tr, + // This has to be + 
// TODO: fix this better! (privatize change feed key clear) + wait(updateChangeFeed(&tr->getTransaction(), granuleIDToCFKey(parentGranuleIDs[parentIdx]), ChangeFeedStatus::CHANGE_FEED_DESTROY, parentRange)); @@ -3168,8 +3171,8 @@ ACTOR Future fullyDeleteGranule(Reference self, Key historyKey, Version purgeVersion, KeyRange granuleRange) { - if (BM_DEBUG) { - fmt::print("Fully deleting granule {0}: init\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Fully deleting granule {1}: init\n", self->epoch, granuleId.toString()); } // if granule is still splitting and files are needed for new sub-granules to re-snapshot, we can only partially @@ -3195,8 +3198,11 @@ ACTOR Future fullyDeleteGranule(Reference self, filesToDelete.emplace_back(fname); } - if (BM_DEBUG) { - fmt::print("Fully deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Fully deleting granule {1}: deleting {2} files\n", + self->epoch, + granuleId.toString(), + filesToDelete.size()); for (auto filename : filesToDelete) { fmt::print(" - {}\n", filename.c_str()); } @@ -3209,8 +3215,9 @@ ACTOR Future fullyDeleteGranule(Reference self, wait(waitForAll(deletions)); // delete metadata in FDB (history entry and file keys) - if (BM_DEBUG) { - fmt::print("Fully deleting granule {0}: deleting history and file keys\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print( + "BM {0} Fully deleting granule {1}: deleting history and file keys\n", self->epoch, granuleId.toString()); } state Transaction tr(self->db); @@ -3229,8 +3236,8 @@ ACTOR Future fullyDeleteGranule(Reference self, } } - if (BM_DEBUG) { - fmt::print("Fully deleting granule {0}: success\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Fully deleting granule {1}: success\n", self->epoch, granuleId.toString()); } TraceEvent("GranuleFullPurge", self->id) @@ -3242,6 +3249,8 @@ ACTOR Future fullyDeleteGranule(Reference self, 
++self->stats.granulesFullyPurged; self->stats.filesPurged += filesToDelete.size(); + CODE_PROBE(true, "full granule purged"); + return Void(); } @@ -3257,8 +3266,8 @@ ACTOR Future partiallyDeleteGranule(Reference self, UID granuleId, Version purgeVersion, KeyRange granuleRange) { - if (BM_DEBUG) { - fmt::print("Partially deleting granule {0}: init\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Partially deleting granule {1}: init\n", self->epoch, granuleId.toString()); } state Reference bstore = wait(getBStoreForGranule(self, granuleRange)); @@ -3307,8 +3316,11 @@ ACTOR Future partiallyDeleteGranule(Reference self, filesToDelete.emplace_back(fname); } - if (BM_DEBUG) { - fmt::print("Partially deleting granule {0}: deleting {1} files\n", granuleId.toString(), filesToDelete.size()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Partially deleting granule {1}: deleting {2} files\n", + self->epoch, + granuleId.toString(), + filesToDelete.size()); for (auto filename : filesToDelete) { fmt::print(" - {0}\n", filename); } @@ -3325,8 +3337,8 @@ ACTOR Future partiallyDeleteGranule(Reference self, wait(waitForAll(deletions)); // delete metadata in FDB (deleted file keys) - if (BM_DEBUG) { - fmt::print("Partially deleting granule {0}: deleting file keys\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Partially deleting granule {1}: deleting file keys\n", self->epoch, granuleId.toString()); } state Transaction tr(self->db); @@ -3345,8 +3357,8 @@ ACTOR Future partiallyDeleteGranule(Reference self, } } - if (BM_DEBUG) { - fmt::print("Partially deleting granule {0}: success\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Partially deleting granule {1}: success\n", self->epoch, granuleId.toString()); } TraceEvent("GranulePartialPurge", self->id) .detail("Epoch", self->epoch) @@ -3357,6 +3369,8 @@ ACTOR Future partiallyDeleteGranule(Reference self, ++self->stats.granulesPartiallyPurged; self->stats.filesPurged 
+= filesToDelete.size(); + CODE_PROBE(true, " partial granule purged"); + return Void(); } @@ -3369,8 +3383,9 @@ ACTOR Future partiallyDeleteGranule(Reference self, * processing this purge intent. */ ACTOR Future purgeRange(Reference self, KeyRangeRef range, Version purgeVersion, bool force) { - if (BM_DEBUG) { - fmt::print("purgeRange starting for range [{0} - {1}) @ purgeVersion={2}, force={3}\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} purgeRange starting for range [{1} - {2}) @ purgeVersion={3}, force={4}\n", + self->epoch, range.begin.printable(), range.end.printable(), purgeVersion, @@ -3392,8 +3407,7 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range // track which granules we have already added to traversal // note: (startKey, startVersion) uniquely identifies a granule - state std::unordered_set, boost::hash>> - visited; + state std::unordered_set, boost::hash>> visited; // find all active granules (that comprise the range) and add to the queue state KeyRangeMap::Ranges activeRanges = self->workerAssignments.intersectingRanges(range); @@ -3404,8 +3418,9 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range state KeyRangeMap::iterator activeRange; for (activeRange = activeRanges.begin(); activeRange != activeRanges.end(); ++activeRange) { - if (BM_DEBUG) { - fmt::print("Checking if active range [{0} - {1}), owned by BW {2}, should be purged\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Checking if active range [{1} - {2}), owned by BW {3}, should be purged\n", + self->epoch, activeRange.begin().printable(), activeRange.end().printable(), activeRange.value().toString()); @@ -3413,6 +3428,10 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range // assumption: purge boundaries must respect granule boundaries if (activeRange.begin() < range.begin || activeRange.end() > range.end) { + TraceEvent(SevWarn, "GranulePurgeRangesUnaligned", self->id) + .detail("Epoch", self->epoch) + .detail("PurgeRange", range) + 
.detail("GranuleRange", activeRange.range()); continue; } @@ -3422,20 +3441,29 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range loop { try { - if (BM_DEBUG) { - fmt::print("Fetching latest history entry for range [{0} - {1})\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Fetching latest history entry for range [{1} - {2})\n", + self->epoch, activeRange.begin().printable(), activeRange.end().printable()); } + // FIXME: doing this serially will likely be too slow for large purges Optional history = wait(getLatestGranuleHistory(&tr, activeRange.range())); // TODO: can we tell from the krm that this range is not valid, so that we don't need to do a // get if (history.present()) { - if (BM_DEBUG) { - printf("Adding range to history queue\n"); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Adding range to history queue: [{1} - {2}) @ {3} ({4})\n", + self->epoch, + activeRange.begin().printable(), + activeRange.end().printable(), + history.get().version, + (void*)(activeRange.range().begin.begin())); } - visited.insert({ activeRange.range().begin.begin(), history.get().version }); + visited.insert({ activeRange.range().begin.toString(), history.get().version }); historyEntryQueue.push({ activeRange.range(), history.get().version, MAX_VERSION }); + } else if (BM_PURGE_DEBUG) { + fmt::print("BM {0} No history for range, ignoring\n", self->epoch); } break; } catch (Error& e) { @@ -3444,8 +3472,12 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range } } - if (BM_DEBUG) { - printf("Beginning BFS traversal of history\n"); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Beginning BFS traversal of {1} history items for range [{2} - {3}) \n", + self->epoch, + historyEntryQueue.size(), + range.begin.printable(), + range.end.printable()); } while (!historyEntryQueue.empty()) { // process the node at the front of the queue and remove it @@ -3455,8 +3487,9 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range std::tie(currRange, startVersion, endVersion) = 
historyEntryQueue.front(); historyEntryQueue.pop(); - if (BM_DEBUG) { - fmt::print("Processing history node [{0} - {1}) with versions [{2}, {3})\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Processing history node [{1} - {2}) with versions [{3}, {4})\n", + self->epoch, currRange.begin.printable(), currRange.end.printable(), startVersion, @@ -3481,11 +3514,15 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range } if (!foundHistory) { + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} No history for this node, skipping\n", self->epoch); + } continue; } - if (BM_DEBUG) { - fmt::print("Found history entry for this node. It's granuleID is {0}\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Found history entry for this node. It's granuleID is {1}\n", + self->epoch, currHistoryNode.granuleID.toString()); } @@ -3496,33 +3533,45 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range // and so this granule should be partially deleted // - otherwise, this granule is active, so don't schedule it for deletion if (force || endVersion <= purgeVersion) { - if (BM_DEBUG) { - fmt::print("Granule {0} will be FULLY deleted\n", currHistoryNode.granuleID.toString()); + if (BM_PURGE_DEBUG) { + fmt::print( + "BM {0} Granule {1} will be FULLY deleted\n", self->epoch, currHistoryNode.granuleID.toString()); } toFullyDelete.push_back({ currHistoryNode.granuleID, historyKey, currRange }); } else if (startVersion < purgeVersion) { - if (BM_DEBUG) { - fmt::print("Granule {0} will be partially deleted\n", currHistoryNode.granuleID.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Granule {1} will be partially deleted\n", + self->epoch, + currHistoryNode.granuleID.toString()); } toPartiallyDelete.push_back({ currHistoryNode.granuleID, currRange }); } // add all of the node's parents to the queue + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Checking {1} parents\n", self->epoch, currHistoryNode.parentVersions.size()); + } for (int i = 0; i < 
currHistoryNode.parentVersions.size(); i++) { // for (auto& parent : currHistoryNode.parentVersions.size()) { // if we already added this node to queue, skip it; otherwise, mark it as visited KeyRangeRef parentRange(currHistoryNode.parentBoundaries[i], currHistoryNode.parentBoundaries[i + 1]); Version parentVersion = currHistoryNode.parentVersions[i]; - if (visited.count({ parentRange.begin.begin(), parentVersion })) { - if (BM_DEBUG) { - fmt::print("Already added {0} to queue, so skipping it\n", currHistoryNode.granuleID.toString()); + std::string beginStr = parentRange.begin.toString(); + if (!visited.insert({ beginStr, parentVersion }).second) { + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Already added [{1} - {2}) @ {3} - {4} to queue, so skipping it\n", + self->epoch, + parentRange.begin.printable(), + parentRange.end.printable(), + parentVersion, + startVersion); } continue; } - visited.insert({ parentRange.begin.begin(), parentVersion }); - if (BM_DEBUG) { - fmt::print("Adding parent [{0} - {1}) with versions [{2} - {3}) to queue\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Adding parent [{1} - {2}) @ {3} - {4} to queue\n", + self->epoch, parentRange.begin.printable(), parentRange.end.printable(), parentVersion, @@ -3550,10 +3599,19 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range // we won't run into any issues with trying to "re-delete" a blob file since deleting // a file that doesn't exist is considered successful + TraceEvent("PurgeGranulesTraversalComplete", self->id) + .detail("Epoch", self->epoch) + .detail("Range", range) + .detail("PurgeVersion", purgeVersion) + .detail("Force", force) + .detail("VisitedCount", visited.size()) + .detail("DeletingFullyCount", toFullyDelete.size()) + .detail("DeletingPartiallyCount", toPartiallyDelete.size()); + state std::vector> partialDeletions; state int i; - if (BM_DEBUG) { - fmt::print("{0} granules to fully delete\n", toFullyDelete.size()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0}: {1} 
granules to fully delete\n", self->epoch, toFullyDelete.size()); } for (i = toFullyDelete.size() - 1; i >= 0; --i) { state UID granuleId; @@ -3561,22 +3619,22 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range KeyRange keyRange; std::tie(granuleId, historyKey, keyRange) = toFullyDelete[i]; // FIXME: consider batching into a single txn (need to take care of txn size limit) - if (BM_DEBUG) { - fmt::print("About to fully delete granule {0}\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0}: About to fully delete granule {1}\n", self->epoch, granuleId.toString()); } wait(fullyDeleteGranule(self, granuleId, historyKey, purgeVersion, range)); } - if (BM_DEBUG) { - fmt::print("{0} granules to partially delete\n", toPartiallyDelete.size()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0}: {1} granules to partially delete\n", self->epoch, toPartiallyDelete.size()); } for (i = toPartiallyDelete.size() - 1; i >= 0; --i) { UID granuleId; KeyRange range; std::tie(granuleId, range) = toPartiallyDelete[i]; - if (BM_DEBUG) { - fmt::print("About to partially delete granule {0}\n", granuleId.toString()); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0}: About to partially delete granule {1}\n", self->epoch, granuleId.toString()); } partialDeletions.emplace_back(partiallyDeleteGranule(self, granuleId, purgeVersion, range)); } @@ -3588,8 +3646,9 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range // another purgeIntent that got written for this table while we were processing this one. // If that is the case, we should not clear the key. Otherwise, we can just clear the key. 
- if (BM_DEBUG) { - fmt::print("Successfully purged range [{0} - {1}) at purgeVersion={2}\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0}: Successfully purged range [{1} - {2}) at purgeVersion={3}\n", + self->epoch, range.begin.printable(), range.end.printable(), purgeVersion); @@ -3601,6 +3660,8 @@ ACTOR Future purgeRange(Reference self, KeyRangeRef range .detail("PurgeVersion", purgeVersion) .detail("Force", force); + CODE_PROBE(true, "range purge complete"); + ++self->stats.purgesProcessed; return Void(); } @@ -3651,6 +3712,7 @@ ACTOR Future monitorPurgeKeys(Reference self) { // TODO: replace 10000 with a knob state RangeResult purgeIntents = wait(tr->getRange(blobGranulePurgeKeys, BUGGIFY ? 1 : 10000)); if (purgeIntents.size()) { + CODE_PROBE(true, "BM found purges to process"); int rangeIdx = 0; for (; rangeIdx < purgeIntents.size(); ++rangeIdx) { Version purgeVersion; @@ -3672,8 +3734,9 @@ ACTOR Future monitorPurgeKeys(Reference self) { } purgeMap.insert(range, std::make_pair(purgeVersion, force)); - if (BM_DEBUG) { - fmt::print("about to purge range [{0} - {1}) @ {2}, force={3}\n", + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} about to purge range [{1} - {2}) @ {3}, force={4}\n", + self->epoch, range.begin.printable(), range.end.printable(), purgeVersion, @@ -3725,9 +3788,11 @@ ACTOR Future monitorPurgeKeys(Reference self) { } } - if (BM_DEBUG) { - printf("Done clearing current set of purge intents.\n"); + if (BM_PURGE_DEBUG) { + fmt::print("BM {0} Done clearing current set of purge intents.\n", self->epoch); } + + CODE_PROBE(true, "BM finished processing purge intents"); } } diff --git a/fdbserver/BlobWorker.actor.cpp b/fdbserver/BlobWorker.actor.cpp index b6c50693b0..568c12ebef 100644 --- a/fdbserver/BlobWorker.actor.cpp +++ b/fdbserver/BlobWorker.actor.cpp @@ -602,7 +602,20 @@ ACTOR Future writeDeltaFile(Reference bwData, state std::string fileName = randomBGFilename(bwData->id, granuleID, currentDeltaVersion, ".delta"); - state Value serialized = 
ObjectWriter::toValue(deltasToWrite, Unversioned()); + state Optional cipherKeysCtx; + state Optional cipherKeysMeta; + state Arena arena; + // TODO support encryption, figure out proper state stuff + /*if (isBlobFileEncryptionSupported()) { + BlobGranuleCipherKeysCtx ciphKeysCtx = wait(getLatestGranuleCipherKeys(bwData, keyRange, &arena)); + cipherKeysCtx = ciphKeysCtx; + cipherKeysMeta = BlobGranuleCipherKeysCtx::toCipherKeysMeta(cipherKeysCtx.get()); + }*/ + + Optional compressFilter = getBlobFileCompressFilter(); + + state Value serialized = serializeChunkedDeltaFile( + deltasToWrite, keyRange, SERVER_KNOBS->BG_DELTA_FILE_TARGET_CHUNK_BYTES, compressFilter, cipherKeysCtx); state size_t serializedSize = serialized.size(); // Free up deltasToWrite here to reduce memory @@ -640,7 +653,7 @@ ACTOR Future writeDeltaFile(Reference bwData, Key dfKey = blobGranuleFileKeyFor(granuleID, currentDeltaVersion, 'D'); // TODO change once we support file multiplexing - Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize); + Value dfValue = blobGranuleFileValueFor(fname, 0, serializedSize, serializedSize, cipherKeysMeta); tr->set(dfKey, dfValue); if (oldGranuleComplete.present()) { @@ -668,7 +681,7 @@ ACTOR Future writeDeltaFile(Reference bwData, wait(delay(deterministicRandom()->random01())); } // FIXME: change when we implement multiplexing - return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize); + return BlobFileIndex(currentDeltaVersion, fname, 0, serializedSize, serializedSize, cipherKeysMeta); } catch (Error& e) { wait(tr->onError(e)); } @@ -753,8 +766,8 @@ ACTOR Future writeSnapshot(Reference bwData, } Optional compressFilter = getBlobFileCompressFilter(); - state Value serialized = - serializeChunkedSnapshot(snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNKS, compressFilter, cipherKeysCtx); + state Value serialized = serializeChunkedSnapshot( + snapshot, SERVER_KNOBS->BG_SNAPSHOT_FILE_TARGET_CHUNK_BYTES, 
compressFilter, cipherKeysCtx); state size_t serializedSize = serialized.size(); // free snapshot to reduce memory @@ -970,6 +983,7 @@ ACTOR Future compactFromBlob(Reference bwData, snapshotF.cipherKeysMeta); // TODO: optimization - batch 'encryption-key' lookup given the GranuleFile set is known + // FIXME: get cipher keys for delta as well! if (chunk.snapshotFile.get().cipherKeysMetaRef.present()) { ASSERT(isBlobFileEncryptionSupported()); BlobGranuleCipherKeysCtx cipherKeysCtx = @@ -3187,6 +3201,8 @@ ACTOR Future doBlobGranuleFileRequest(Reference bwData, Bl getGranuleCipherKeys(bwData, chunk.snapshotFile.get().cipherKeysMetaRef.get(), &rep.arena); } + // FIXME: get cipher keys for delta files too! + // new deltas (if version is larger than version of last delta file) // FIXME: do trivial key bounds here if key range is not fully contained in request key // range diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 191a791338..f33fc53c96 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -484,8 +484,11 @@ public: } // TODO: unit test needed - ACTOR static Future resumeFromDataMoves(Reference self) { + ACTOR static Future resumeFromDataMoves(Reference self, Future readyToStart) { state KeyRangeMap>::iterator it = self->initData->dataMoveMap.ranges().begin(); + + wait(readyToStart); + for (; it != self->initData->dataMoveMap.ranges().end(); ++it) { const DataMoveMetaData& meta = it.value()->meta; if (it.value()->isCancelled() || (it.value()->valid && !CLIENT_KNOBS->SHARD_ENCODE_LOCATION_METADATA)) { @@ -528,8 +531,8 @@ public: // usage if it turns out to be a problem. 
Future resumeRelocations() { ASSERT(shardsAffectedByTeamFailure); // has to be allocated - return runAfter(resumeFromShards(Reference::addRef(this), g_network->isSimulated()), - resumeFromDataMoves(Reference::addRef(this))); + Future shardsReady = resumeFromShards(Reference::addRef(this), g_network->isSimulated()); + return resumeFromDataMoves(Reference::addRef(this), shardsReady); } }; diff --git a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp index d0dd490202..84a6e53a78 100644 --- a/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp +++ b/fdbserver/KeyValueStoreShardedRocksDB.actor.cpp @@ -110,9 +110,9 @@ class RocksDBErrorListener : public rocksdb::EventListener { public: RocksDBErrorListener(){}; void OnBackgroundError(rocksdb::BackgroundErrorReason reason, rocksdb::Status* bg_error) override { - TraceEvent(SevError, "RocksDBBGError") + TraceEvent(SevError, "ShardedRocksDBBGError") .detail("Reason", getErrorReason(reason)) - .detail("RocksDBSeverity", bg_error->severity()) + .detail("ShardedRocksDBSeverity", bg_error->severity()) .detail("Status", bg_error->ToString()); std::unique_lock lock(mutex); if (!errorPromise.isValid()) @@ -186,8 +186,8 @@ std::vector> decodeShardMapping(const RangeResu void logRocksDBError(const rocksdb::Status& status, const std::string& method) { auto level = status.IsTimedOut() ? 
SevWarn : SevError; - TraceEvent e(level, "RocksDBError"); - e.detail("Error", status.ToString()).detail("Method", method).detail("RocksDBSeverity", status.severity()); + TraceEvent e(level, "ShardedRocksDBError"); + e.detail("Error", status.ToString()).detail("Method", method).detail("ShardedRocksDBSeverity", status.severity()); if (status.IsIOError()) { e.detail("SubCode", status.subcode()); } @@ -219,7 +219,7 @@ const char* ShardOpToString(ShardOp op) { } } void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") { - TraceEvent e(severity, "KVSShardEvent"); + TraceEvent e(severity, "ShardedRocksKVSShardEvent"); e.detail("Name", name).detail("Action", ShardOpToString(op)); if (!message.empty()) { e.detail("Message", message); @@ -230,7 +230,7 @@ void logShardEvent(StringRef name, ShardOp op, Severity severity = SevInfo, const std::string& message = "") { - TraceEvent e(severity, "KVSShardEvent"); + TraceEvent e(severity, "ShardedRocksKVSShardEvent"); e.detail("Name", name).detail("Action", ShardOpToString(op)).detail("Begin", range.begin).detail("End", range.end); if (message != "") { e.detail("Message", message); @@ -343,7 +343,7 @@ public: ASSERT(cf); readRangeOptions.background_purge_on_iterator_cleanup = true; readRangeOptions.auto_prefix_mode = (SERVER_KNOBS->ROCKSDB_PREFIX_LEN > 0); - TraceEvent(SevDebug, "ReadIteratorPool") + TraceEvent(SevVerbose, "ShardedRocksReadIteratorPool") .detail("Path", path) .detail("KnobRocksDBReadRangeReuseIterators", SERVER_KNOBS->ROCKSDB_READ_RANGE_REUSE_ITERATORS) .detail("KnobRocksDBPrefixLen", SERVER_KNOBS->ROCKSDB_PREFIX_LEN); @@ -425,7 +425,7 @@ private: ACTOR Future flowLockLogger(const FlowLock* readLock, const FlowLock* fetchLock) { loop { wait(delay(SERVER_KNOBS->ROCKSDB_METRICS_DELAY)); - TraceEvent e("RocksDBFlowLock"); + TraceEvent e("ShardedRocksDBFlowLock"); e.detail("ReadAvailable", readLock->available()); e.detail("ReadActivePermits", 
readLock->activePermits()); e.detail("ReadWaiters", readLock->waiters()); @@ -588,13 +588,13 @@ public: if (rState->closing) { break; } - TraceEvent(SevInfo, "KVSPhysialShardMetrics") + TraceEvent(SevInfo, "ShardedRocksKVSPhysialShardMetrics") .detail("NumActiveShards", shardManager->numActiveShards()) .detail("TotalPhysicalShards", shardManager->numPhysicalShards()); } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { - TraceEvent(SevError, "ShardMetricsLoggerError").errorUnsuppressed(e); + TraceEvent(SevError, "ShardedRocksShardMetricsLoggerError").errorUnsuppressed(e); } } return Void(); @@ -602,7 +602,7 @@ public: rocksdb::Status init() { // Open instance. - TraceEvent(SevVerbose, "ShardManagerInitBegin", this->logId).detail("DataPath", path); + TraceEvent(SevInfo, "ShardedRocksShardManagerInitBegin", this->logId).detail("DataPath", path); std::vector columnFamilies; rocksdb::Options options = getOptions(); rocksdb::Status status = rocksdb::DB::ListColumnFamilies(options, path, &columnFamilies); @@ -632,6 +632,8 @@ public: } if (foundMetadata) { + TraceEvent(SevInfo, "ShardedRocksInitLoadPhysicalShards", this->logId) + .detail("PhysicalShardCount", handles.size()); for (auto handle : handles) { if (handle->GetName() == "kvs-metadata") { metadataShard = std::make_shared(db, "kvs-metadata", handle); @@ -639,7 +641,8 @@ public: physicalShards[handle->GetName()] = std::make_shared(db, handle->GetName(), handle); } columnFamilyMap[handle->GetID()] = handle; - TraceEvent(SevInfo, "ShardedRocskDB").detail("FoundShard", handle->GetName()).detail("Action", "Init"); + TraceEvent(SevVerbose, "ShardedRocksInitPhysicalShard", this->logId) + .detail("PhysicalShard", handle->GetName()); } RangeResult metadata; readRangeInDb(metadataShard.get(), prefixRange(shardMappingPrefix), UINT16_MAX, UINT16_MAX, &metadata); @@ -647,7 +650,7 @@ public: std::vector> mapping = decodeShardMapping(metadata, shardMappingPrefix); for (const auto& [range, name] : mapping) { - 
TraceEvent(SevDebug, "ShardedRocksLoadPhysicalShard", this->logId) + TraceEvent(SevVerbose, "ShardedRocksLoadRange", this->logId) .detail("Range", range) .detail("PhysicalShard", name); auto it = physicalShards.find(name); @@ -662,10 +665,10 @@ public: activePhysicalShardIds.emplace(name); } // TODO: remove unused column families. - } else { // DB is opened with default shard. ASSERT(handles.size() == 1); + // Add SpecialKeys range. This range should not be modified. std::shared_ptr defaultShard = std::make_shared(db, "default", handles[0]); columnFamilyMap[defaultShard->cf->GetID()] = defaultShard->cf; @@ -688,7 +691,7 @@ public: return status; } metadataShard->readIterPool->update(); - TraceEvent(SevInfo, "InitializeMetaDataShard", this->logId) + TraceEvent(SevInfo, "ShardedRocksInitializeMetaDataShard", this->logId) .detail("MetadataShardCF", metadataShard->cf->GetID()); } physicalShards["kvs-metadata"] = metadataShard; @@ -696,7 +699,7 @@ public: writeBatch = std::make_unique(); dirtyShards = std::make_unique>(); - TraceEvent(SevDebug, "ShardManagerInitEnd", this->logId).detail("DataPath", path); + TraceEvent(SevInfo, "ShardedRocksShardManagerInitEnd", this->logId).detail("DataPath", path); return status; } @@ -712,7 +715,7 @@ public: for (auto it = rangeIterator.begin(); it != rangeIterator.end(); ++it) { if (it.value() == nullptr) { - TraceEvent(SevDebug, "ShardedRocksDB") + TraceEvent(SevVerbose, "ShardedRocksDB") .detail("Info", "ShardNotFound") .detail("BeginKey", range.begin) .detail("EndKey", range.end); @@ -724,9 +727,10 @@ public: } PhysicalShard* addRange(KeyRange range, std::string id) { - TraceEvent(SevVerbose, "ShardedRocksAddRangeBegin", this->logId) + TraceEvent(SevInfo, "ShardedRocksAddRangeBegin", this->logId) .detail("Range", range) .detail("PhysicalShardID", id); + // Newly added range should not overlap with any existing range. 
auto ranges = dataShardMap.intersectingRanges(range); @@ -750,7 +754,7 @@ public: validate(); - TraceEvent(SevVerbose, "ShardedRocksAddRangeEnd", this->logId) + TraceEvent(SevInfo, "ShardedRocksAddRangeEnd", this->logId) .detail("Range", range) .detail("PhysicalShardID", id); @@ -758,7 +762,7 @@ public: } std::vector removeRange(KeyRange range) { - TraceEvent(SevVerbose, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range); + TraceEvent(SevInfo, "ShardedRocksRemoveRangeBegin", this->logId).detail("Range", range); std::vector shardIds; @@ -796,6 +800,7 @@ public: } continue; } + // Range modification could result in more than one segments. Remove the original segment key here. existingShard->dataShards.erase(shardRange.begin.toString()); if (shardRange.begin < range.begin) { @@ -826,7 +831,7 @@ public: validate(); - TraceEvent(SevVerbose, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range); + TraceEvent(SevInfo, "ShardedRocksRemoveRangeEnd", this->logId).detail("Range", range); return shardIds; } @@ -849,7 +854,7 @@ public: TraceEvent(SevError, "ShardedRocksDB").detail("Error", "write to non-exist shard").detail("WriteKey", key); return; } - TraceEvent(SevVerbose, "ShardManagerPut", this->logId) + TraceEvent(SevVerbose, "ShardedRocksShardManagerPut", this->logId) .detail("WriteKey", key) .detail("Value", value) .detail("MapRange", it.range()) @@ -859,7 +864,9 @@ public: ASSERT(dirtyShards != nullptr); writeBatch->Put(it.value()->physicalShard->cf, toSlice(key), toSlice(value)); dirtyShards->insert(it.value()->physicalShard); - TraceEvent(SevVerbose, "ShardManagerPutEnd", this->logId).detail("WriteKey", key).detail("Value", value); + TraceEvent(SevVerbose, "ShardedRocksShardManagerPutEnd", this->logId) + .detail("WriteKey", key) + .detail("Value", value); } void clear(KeyRef key) { @@ -884,7 +891,7 @@ public: } void persistRangeMapping(KeyRangeRef range, bool isAdd) { - TraceEvent(SevDebug, "ShardedRocksDB") + TraceEvent(SevDebug, 
"ShardedRocksDB", this->logId) .detail("Info", "RangeToPersist") .detail("BeginKey", range.begin) .detail("EndKey", range.end); @@ -902,7 +909,7 @@ public: writeBatch->Put(metadataShard->cf, getShardMappingKey(it.range().begin, shardMappingPrefix), it.value()->physicalShard->id); - TraceEvent(SevDebug, "ShardedRocksDB") + TraceEvent(SevDebug, "ShardedRocksDB", this->logId) .detail("Action", "PersistRangeMapping") .detail("BeginKey", it.range().begin) .detail("EndKey", it.range().end) @@ -911,7 +918,7 @@ public: } else { // Empty range. writeBatch->Put(metadataShard->cf, getShardMappingKey(it.range().begin, shardMappingPrefix), ""); - TraceEvent(SevDebug, "ShardedRocksDB") + TraceEvent(SevDebug, "ShardedRocksDB", this->logId) .detail("Action", "PersistRangeMapping") .detail("BeginKey", it.range().begin) .detail("EndKey", it.range().end) @@ -921,7 +928,7 @@ public: } } else { writeBatch->Put(metadataShard->cf, getShardMappingKey(range.begin, shardMappingPrefix), ""); - TraceEvent(SevDebug, "ShardedRocksDB") + TraceEvent(SevDebug, "ShardedRocksDB", this->logId) .detail("Action", "PersistRangeMapping") .detail("RemoveRange", "True") .detail("BeginKey", range.begin) @@ -972,7 +979,7 @@ public: if (!s.ok()) { logRocksDBError(s, "DestroyDB"); } - TraceEvent("RocksDB").detail("Info", "DBDestroyed"); + TraceEvent("ShardedRocksDB", this->logId).detail("Info", "DBDestroyed"); } rocksdb::DB* getDb() const { return db; } @@ -997,9 +1004,9 @@ public: } void validate() { - TraceEvent(SevVerbose, "ValidateShardManager", this->logId); + TraceEvent(SevVerbose, "ShardedRocksValidateShardManager", this->logId); for (auto s = dataShardMap.ranges().begin(); s != dataShardMap.ranges().end(); ++s) { - TraceEvent e(SevVerbose, "ValidateDataShardMap", this->logId); + TraceEvent e(SevVerbose, "ShardedRocksValidateDataShardMap", this->logId); e.detail("Range", s->range()); const DataShard* shard = s->value(); e.detail("ShardAddress", reinterpret_cast(shard)); @@ -1008,6 +1015,13 @@ public: } 
else { e.detail("Shard", "Empty"); } + if (shard != nullptr) { + ASSERT(shard->range == static_cast(s->range())); + ASSERT(shard->physicalShard != nullptr); + auto it = shard->physicalShard->dataShards.find(shard->range.begin.toString()); + ASSERT(it != shard->physicalShard->dataShards.end()); + ASSERT(it->second.get() == shard); + } } } @@ -1338,7 +1352,7 @@ std::shared_ptr RocksDBMetrics::getStatsObjForRocksDB() { } void RocksDBMetrics::logStats(rocksdb::DB* db) { - TraceEvent e("RocksDBMetrics"); + TraceEvent e("ShardedRocksDBMetrics"); uint64_t stat; for (auto& [name, ticker, cumulation] : tickerStats) { stat = stats->getTickerCount(ticker); @@ -1361,7 +1375,7 @@ void RocksDBMetrics::logStats(rocksdb::DB* db) { } void RocksDBMetrics::logMemUsagePerShard(std::string shardName, rocksdb::DB* db) { - TraceEvent e("RocksDBShardMemMetrics"); + TraceEvent e("ShardedRocksDBShardMemMetrics"); uint64_t stat; ASSERT(db != nullptr); ASSERT(db->GetIntProperty(rocksdb::DB::Properties::kBlockCacheUsage, &stat)); @@ -1387,7 +1401,7 @@ void RocksDBMetrics::setPerfContext(int index) { } void RocksDBMetrics::logPerfContext(bool ignoreZeroMetric) { - TraceEvent e("RocksDBPerfContextMetrics"); + TraceEvent e("ShardedRocksDBPerfContextMetrics"); e.setMaxEventLength(20000); for (auto& [name, metric, vals] : perfContextMetrics) { uint64_t s = 0; @@ -1650,7 +1664,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { return; } - TraceEvent(SevInfo, "RocksDB").detail("Method", "Open"); + TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Open"); a.done.send(Void()); } @@ -1841,7 +1855,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { } else { a.shardManager->closeAllShards(); } - TraceEvent(SevInfo, "RocksDB").detail("Method", "Close"); + TraceEvent(SevInfo, "ShardedRocksDB").detail("Method", "Close"); a.done.send(Void()); } }; @@ -1908,7 +1922,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { traceBatch.get().addEvent("GetValueDebug", a.debugID.get().first(), 
"Reader.Before"); } if (readBeginTime - a.startTime > readValueTimeout) { - TraceEvent(SevWarn, "RocksDBError") + TraceEvent(SevWarn, "ShardedRocksDBError") .detail("Error", "Read value request timedout") .detail("Method", "ReadValueAction") .detail("Timeout value", readValueTimeout); @@ -1995,7 +2009,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { "Reader.Before"); //.detail("TaskID", g_network->getCurrentTask()); } if (readBeginTime - a.startTime > readValuePrefixTimeout) { - TraceEvent(SevWarn, "RocksDBError") + TraceEvent(SevWarn, "ShardedRocksDBError") .detail("Error", "Read value prefix request timedout") .detail("Method", "ReadValuePrefixAction") .detail("Timeout value", readValuePrefixTimeout); @@ -2080,7 +2094,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { rocksDBMetrics->getReadRangeQueueWaitHistogram(threadIndex)->sampleSeconds(readBeginTime - a.startTime); } if (readBeginTime - a.startTime > readRangeTimeout) { - TraceEvent(SevWarn, "KVSReadTimeout") + TraceEvent(SevWarn, "ShardedRocksKVSReadTimeout") .detail("Error", "Read range request timedout") .detail("Method", "ReadRangeAction") .detail("Timeout value", readRangeTimeout); @@ -2127,10 +2141,6 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { } } - Histogram::getHistogram( - ROCKSDBSTORAGE_HISTOGRAM_GROUP, "ShardedRocksDBNumShardsInRangeRead"_sr, Histogram::Unit::countLinear) - ->sample(numShards); - result.more = (result.size() == a.rowLimit) || (result.size() == -a.rowLimit) || (accumulatedBytes >= a.byteLimit); if (result.more) { @@ -2184,7 +2194,8 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { readThreads = createGenericThreadPool(); } writeThread->addThread(new Writer(id, 0, shardManager.getColumnFamilyMap(), rocksDBMetrics), "fdb-rocksdb-wr"); - TraceEvent("RocksDBReadThreads").detail("KnobRocksDBReadParallelism", SERVER_KNOBS->ROCKSDB_READ_PARALLELISM); + TraceEvent("ShardedRocksDBReadThreads", id) + .detail("KnobRocksDBReadParallelism", 
SERVER_KNOBS->ROCKSDB_READ_PARALLELISM); for (unsigned i = 0; i < SERVER_KNOBS->ROCKSDB_READ_PARALLELISM; ++i) { readThreads->addThread(new Reader(id, i, rocksDBMetrics), "fdb-rocksdb-re"); } @@ -2302,7 +2313,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { auto* shard = shardManager.getDataShard(key); if (shard == nullptr || !shard->physicalShard->initialized()) { // TODO: read non-exist system key range should not cause an error. - TraceEvent(SevWarnAlways, "ShardedRocksDB") + TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id) .detail("Detail", "Read non-exist key range") .detail("ReadKey", key); return Optional(); @@ -2330,7 +2341,7 @@ struct ShardedRocksDBKeyValueStore : IKeyValueStore { auto* shard = shardManager.getDataShard(key); if (shard == nullptr || !shard->physicalShard->initialized()) { // TODO: read non-exist system key range should not cause an error. - TraceEvent(SevWarnAlways, "ShardedRocksDB") + TraceEvent(SevWarnAlways, "ShardedRocksDB", this->id) .detail("Detail", "Read non-exist key range") .detail("ReadKey", key); return Optional(); @@ -2452,7 +2463,7 @@ IKeyValueStore* keyValueStoreShardedRocksDB(std::string const& path, #ifdef SSD_ROCKSDB_EXPERIMENTAL return new ShardedRocksDBKeyValueStore(path, logID); #else - TraceEvent(SevError, "RocksDBEngineInitFailure").detail("Reason", "Built without RocksDB"); + TraceEvent(SevError, "ShardedRocksDBEngineInitFailure").detail("Reason", "Built without RocksDB"); ASSERT(false); return nullptr; #endif // SSD_ROCKSDB_EXPERIMENTAL diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index a3f0af0f63..986a406b4f 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -280,6 +280,13 @@ class TestConfig { if (attrib == "blobGranulesEnabled") { blobGranulesEnabled = strcmp(value.c_str(), "true") == 0; } + if (attrib == "injectSSTargetedRestart") { + injectTargetedSSRestart = strcmp(value.c_str(), "true") == 0; + } + + if 
(attrib == "injectSSDelay") { + injectSSDelay = strcmp(value.c_str(), "true") == 0; + } } ifs.close(); @@ -327,6 +334,8 @@ public: bool allowDefaultTenant = true; bool allowDisablingTenants = true; + bool injectTargetedSSRestart = false; + bool injectSSDelay = false; ConfigDBType getConfigDBType() const { return configDBType; } @@ -384,7 +393,9 @@ public: .add("blobGranulesEnabled", &blobGranulesEnabled) .add("allowDefaultTenant", &allowDefaultTenant) .add("allowDisablingTenants", &allowDisablingTenants) - .add("randomlyRenameZoneId", &randomlyRenameZoneId); + .add("randomlyRenameZoneId", &randomlyRenameZoneId) + .add("injectTargetedSSRestart", &injectTargetedSSRestart) + .add("injectSSDelay", &injectSSDelay); try { auto file = toml::parse(testFile); if (file.contains("configuration") && toml::find(file, "configuration").is_table()) { @@ -1384,7 +1395,7 @@ void SimulationConfig::setDatacenters(const TestConfig& testConfig) { void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { // Using [0, 4) to disable the RocksDB storage engine. // TODO: Figure out what is broken with the RocksDB engine in simulation. 
- int storage_engine_type = deterministicRandom()->randomInt(0, 4); + int storage_engine_type = deterministicRandom()->randomInt(0, 6); if (testConfig.storageEngineType.present()) { storage_engine_type = testConfig.storageEngineType.get(); } else { @@ -1392,7 +1403,7 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { while (std::find(testConfig.storageEngineExcludeTypes.begin(), testConfig.storageEngineExcludeTypes.end(), storage_engine_type) != testConfig.storageEngineExcludeTypes.end()) { - storage_engine_type = deterministicRandom()->randomInt(0, 5); + storage_engine_type = deterministicRandom()->randomInt(0, 6); } } @@ -1435,6 +1446,8 @@ void SimulationConfig::setStorageEngine(const TestConfig& testConfig) { TraceEvent(SevWarnAlways, "RocksDBNonDeterminism") .detail("Explanation", "The Sharded RocksDB storage engine is threaded and non-deterministic"); noUnseed = true; + auto& g_knobs = IKnobCollection::getMutableGlobalKnobCollection(); + g_knobs.setKnob("shard_encode_location_metadata", KnobValueRef::create(bool{ true })); break; } default: @@ -2364,6 +2377,13 @@ ACTOR void setupAndRun(std::string dataFolder, testConfig.readFromConfig(testFile); g_simulator.hasDiffProtocolProcess = testConfig.startIncompatibleProcess; g_simulator.setDiffProtocol = false; + if (testConfig.injectTargetedSSRestart && deterministicRandom()->random01() < 0.25) { + g_simulator.injectTargetedSSRestartTime = 60.0 + 340.0 * deterministicRandom()->random01(); + } + + if (testConfig.injectSSDelay && deterministicRandom()->random01() < 0.25) { + g_simulator.injectSSDelayTime = 60.0 + 240.0 * deterministicRandom()->random01(); + } // Build simulator allow list allowList.addTrustedSubnet("0.0.0.0/2"sv); @@ -2377,6 +2397,7 @@ ACTOR void setupAndRun(std::string dataFolder, // https://github.com/apple/foundationdb/issues/5155 if (std::string_view(testFile).find("restarting") != std::string_view::npos) { testConfig.storageEngineExcludeTypes.push_back(4); + 
testConfig.storageEngineExcludeTypes.push_back(5); // Disable the default tenant in restarting tests for now // TODO: persist the chosen default tenant in the restartInfo.ini file for the second test @@ -2389,6 +2410,7 @@ ACTOR void setupAndRun(std::string dataFolder, // Re-enable the backup and restore related simulation tests when the tests are passing again. if (std::string_view(testFile).find("Backup") != std::string_view::npos) { testConfig.storageEngineExcludeTypes.push_back(4); + testConfig.storageEngineExcludeTypes.push_back(5); } // Disable the default tenant in backup and DR tests for now. This is because backup does not currently duplicate @@ -2402,6 +2424,7 @@ ACTOR void setupAndRun(std::string dataFolder, // in the build. if (!rocksDBEnabled) { testConfig.storageEngineExcludeTypes.push_back(4); + testConfig.storageEngineExcludeTypes.push_back(5); } state ProtocolVersion protocolVersion = currentProtocolVersion; diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index a890b3f028..60d527b483 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -2751,6 +2751,9 @@ ACTOR Future lockedStatusFetcher(Referenceinsert(format("Unable to determine if database is locked (%s).", e.what())); break; } diff --git a/fdbserver/include/fdbserver/BlobGranuleValidation.actor.h b/fdbserver/include/fdbserver/BlobGranuleValidation.actor.h index 838cc18ec4..db4cdc2891 100644 --- a/fdbserver/include/fdbserver/BlobGranuleValidation.actor.h +++ b/fdbserver/include/fdbserver/BlobGranuleValidation.actor.h @@ -51,6 +51,8 @@ bool compareFDBAndBlob(RangeResult fdb, Version v, bool debug); +void printGranuleChunks(const Standalone>& chunks); + ACTOR Future clearAndAwaitMerge(Database cx, KeyRange range); #include "flow/unactorcompiler.h" diff --git a/fdbserver/include/fdbserver/workloads/BulkSetup.actor.h b/fdbserver/include/fdbserver/workloads/BulkSetup.actor.h index 9b4dfca577..854d27c091 100644 --- 
a/fdbserver/include/fdbserver/workloads/BulkSetup.actor.h +++ b/fdbserver/include/fdbserver/workloads/BulkSetup.actor.h @@ -294,8 +294,8 @@ Future bulkSetup(Database cx, // Here we wait for data in flight to go to 0 (this will not work on a database with other users) if (postSetupWarming != 0) { try { - wait(delay(5.0) >> - waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start + wait(delay(5.0)); + wait(waitForLowInFlight(cx, workload)); // Wait for the data distribution in a small test to start } catch (Error& e) { if (e.code() == error_code_actor_cancelled) throw; diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index d2752110b4..98ba905c2e 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -536,6 +536,9 @@ struct ChangeFeedInfo : ReferenceCounted { Version storageVersion = invalidVersion; // The version between the storage version and the durable version are // being written to disk as part of the current commit in updateStorage. Version durableVersion = invalidVersion; // All versions before the durable version are durable on disk + // FIXME: this needs to get persisted to disk to still fix same races across restart! + Version metadataVersion = invalidVersion; // Last update to the change feed metadata. 
Used for reasoning about + // fetched metadata vs local metadata Version emptyVersion = 0; // The change feed does not have any mutations before emptyVersion KeyRange range; Key id; @@ -551,8 +554,6 @@ struct ChangeFeedInfo : ReferenceCounted { bool removing = false; bool destroyed = false; - bool possiblyDestroyed = false; - bool refreshInProgress = false; KeyRangeMap>> moveTriggers; @@ -587,12 +588,21 @@ struct ChangeFeedInfo : ReferenceCounted { } void destroy(Version destroyVersion) { + updateMetadataVersion(destroyVersion); removing = true; destroyed = true; - refreshInProgress = false; moved(range); newMutations.trigger(); } + + bool updateMetadataVersion(Version version) { + // don't update metadata version if removing, so that metadata version remains the moved away version + if (!removing && version > metadataVersion) { + metadataVersion = version; + return true; + } + return false; + } }; class ServerWatchMetadata : public ReferenceCounted { @@ -895,7 +905,7 @@ public: KeyRangeMap>> keyChangeFeed; std::map> uidChangeFeed; Deque, Version>> changeFeedVersions; - std::map> changeFeedRemovals; + std::map> changeFeedDestroys; std::set currentChangeFeeds; std::set fetchingChangeFeeds; std::unordered_map> changeFeedClientVersions; @@ -971,6 +981,9 @@ public: FlowLock durableVersionLock; FlowLock fetchKeysParallelismLock; + // Extra lock that prevents too much post-initial-fetch work from building up, such as mutation applying and change + // feed tail fetching + FlowLock fetchKeysParallelismFullLock; FlowLock fetchChangeFeedParallelismLock; int64_t fetchKeysBytesBudget; AsyncVar fetchKeysBudgetUsed; @@ -1046,7 +1059,8 @@ public: Counter sampledBytesCleared; // The number of key-value pairs fetched by fetchKeys() Counter kvFetched; - Counter mutations, setMutations, clearRangeMutations, atomicMutations; + Counter mutations, setMutations, clearRangeMutations, atomicMutations, changeFeedMutations, + changeFeedMutationsDurable; Counter updateBatches, updateVersions; 
Counter loops; Counter fetchWaitingMS, fetchWaitingCount, fetchExecutingMS, fetchExecutingCount; @@ -1071,6 +1085,8 @@ public: Counter kvScans; // The count of commit operation to the storage engine. Counter kvCommits; + // The count of change feed reads that hit disk + Counter changeFeedDiskReads; LatencySample readLatencySample; LatencyBands readLatencyBands; @@ -1095,15 +1111,17 @@ public: feedBytesFetched("FeedBytesFetched", cc), sampledBytesCleared("SampledBytesCleared", cc), kvFetched("KVFetched", cc), mutations("Mutations", cc), setMutations("SetMutations", cc), clearRangeMutations("ClearRangeMutations", cc), atomicMutations("AtomicMutations", cc), - updateBatches("UpdateBatches", cc), updateVersions("UpdateVersions", cc), loops("Loops", cc), - fetchWaitingMS("FetchWaitingMS", cc), fetchWaitingCount("FetchWaitingCount", cc), - fetchExecutingMS("FetchExecutingMS", cc), fetchExecutingCount("FetchExecutingCount", cc), - readsRejected("ReadsRejected", cc), wrongShardServer("WrongShardServer", cc), - fetchedVersions("FetchedVersions", cc), fetchesFromLogs("FetchesFromLogs", cc), - quickGetValueHit("QuickGetValueHit", cc), quickGetValueMiss("QuickGetValueMiss", cc), - quickGetKeyValuesHit("QuickGetKeyValuesHit", cc), quickGetKeyValuesMiss("QuickGetKeyValuesMiss", cc), - kvScanBytes("KVScanBytes", cc), kvGetBytes("KVGetBytes", cc), eagerReadsKeys("EagerReadsKeys", cc), - kvGets("KVGets", cc), kvScans("KVScans", cc), kvCommits("KVCommits", cc), + changeFeedMutations("ChangeFeedMutations", cc), + changeFeedMutationsDurable("ChangeFeedMutationsDurable", cc), updateBatches("UpdateBatches", cc), + updateVersions("UpdateVersions", cc), loops("Loops", cc), fetchWaitingMS("FetchWaitingMS", cc), + fetchWaitingCount("FetchWaitingCount", cc), fetchExecutingMS("FetchExecutingMS", cc), + fetchExecutingCount("FetchExecutingCount", cc), readsRejected("ReadsRejected", cc), + wrongShardServer("WrongShardServer", cc), fetchedVersions("FetchedVersions", cc), + 
fetchesFromLogs("FetchesFromLogs", cc), quickGetValueHit("QuickGetValueHit", cc), + quickGetValueMiss("QuickGetValueMiss", cc), quickGetKeyValuesHit("QuickGetKeyValuesHit", cc), + quickGetKeyValuesMiss("QuickGetKeyValuesMiss", cc), kvScanBytes("KVScanBytes", cc), + kvGetBytes("KVGetBytes", cc), eagerReadsKeys("EagerReadsKeys", cc), kvGets("KVGets", cc), + kvScans("KVScans", cc), kvCommits("KVCommits", cc), changeFeedDiskReads("ChangeFeedDiskReads", cc), readLatencySample("ReadLatencyMetrics", self->thisServerID, SERVER_KNOBS->LATENCY_METRICS_LOGGING_INTERVAL, @@ -1133,6 +1151,11 @@ public: specialCounter( cc, "FetchKeysFetchActive", [self]() { return self->fetchKeysParallelismLock.activePermits(); }); specialCounter(cc, "FetchKeysWaiting", [self]() { return self->fetchKeysParallelismLock.waiters(); }); + specialCounter(cc, "FetchKeysFullFetchActive", [self]() { + return self->fetchKeysParallelismFullLock.activePermits(); + }); + specialCounter( + cc, "FetchKeysFullFetchWaiting", [self]() { return self->fetchKeysParallelismFullLock.waiters(); }); specialCounter(cc, "FetchChangeFeedFetchActive", [self]() { return self->fetchChangeFeedParallelismLock.activePermits(); }); @@ -1196,6 +1219,7 @@ public: byteSampleClears(false, LiteralStringRef("\xff\xff\xff")), durableInProgress(Void()), watchBytes(0), numWatches(0), noRecentUpdates(false), lastUpdate(now()), updateEagerReads(nullptr), fetchKeysParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM), + fetchKeysParallelismFullLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM_FULL), fetchChangeFeedParallelismLock(SERVER_KNOBS->FETCH_KEYS_PARALLELISM), fetchKeysBytesBudget(SERVER_KNOBS->STORAGE_FETCH_BYTES), fetchKeysBudgetUsed(false), serveFetchCheckpointParallelismLock(SERVER_KNOBS->SERVE_FETCH_CHECKPOINT_PARALLELISM), @@ -1386,6 +1410,28 @@ public: req.reply.sendError(e); } } + + void maybeInjectTargetedRestart(Version v) { + // inject an SS restart at most once per test + if (g_network->isSimulated() && 
!g_simulator.speedUpSimulation && + now() > g_simulator.injectTargetedSSRestartTime && + rebootAfterDurableVersion == std::numeric_limits::max()) { + CODE_PROBE(true, "Injecting SS targeted restart"); + TraceEvent("SimSSInjectTargetedRestart", thisServerID).detail("Version", v); + rebootAfterDurableVersion = v; + g_simulator.injectTargetedSSRestartTime = std::numeric_limits::max(); + } + } + + bool maybeInjectDelay() { + if (g_network->isSimulated() && !g_simulator.speedUpSimulation && now() > g_simulator.injectSSDelayTime) { + CODE_PROBE(true, "Injecting SS targeted delay"); + TraceEvent("SimSSInjectDelay", thisServerID); + g_simulator.injectSSDelayTime = std::numeric_limits::max(); + return true; + } + return false; + } }; const StringRef StorageServer::CurrentRunningFetchKeys::emptyString = LiteralStringRef(""); @@ -2198,46 +2244,54 @@ ACTOR Future overlappingChangeFeedsQ(StorageServer* data, OverlappingChang return Void(); } - Version metadataVersion = invalidVersion; + Version metadataWaitVersion = invalidVersion; auto ranges = data->keyChangeFeed.intersectingRanges(req.range); - std::map> rangeIds; + std::map> rangeIds; for (auto r : ranges) { for (auto& it : r.value()) { if (!it->removing) { // Can't tell other SS about a change feed create or stopVersion that may get rolled back, and we only // need to tell it about the metadata if req.minVersion > metadataVersion, since it will get the // information from its own private mutations if it hasn't processed up that version yet - metadataVersion = std::max(metadataVersion, it->metadataCreateVersion); + metadataWaitVersion = std::max(metadataWaitVersion, it->metadataCreateVersion); + // don't wait for all it->metadataVersion updates, if metadata was fetched from elsewhere it's already + // durable, and some updates are unecessary to wait for Version stopVersion; if (it->stopVersion != MAX_VERSION && req.minVersion > it->stopVersion) { stopVersion = it->stopVersion; - metadataVersion = std::max(metadataVersion, 
stopVersion); + metadataWaitVersion = std::max(metadataWaitVersion, stopVersion); } else { stopVersion = MAX_VERSION; } - rangeIds[it->id] = std::tuple(it->range, it->emptyVersion, stopVersion); + rangeIds[it->id] = std::tuple(it->range, it->emptyVersion, stopVersion, it->metadataVersion); } } } state OverlappingChangeFeedsReply reply; + reply.feedMetadataVersion = data->version.get(); for (auto& it : rangeIds) { - reply.rangeIds.push_back(OverlappingChangeFeedEntry( - it.first, std::get<0>(it.second), std::get<1>(it.second), std::get<2>(it.second))); + reply.feeds.push_back_deep(reply.arena, + OverlappingChangeFeedEntry(it.first, + std::get<0>(it.second), + std::get<1>(it.second), + std::get<2>(it.second), + std::get<3>(it.second))); TraceEvent(SevDebug, "OverlappingChangeFeedEntry", data->thisServerID) .detail("MinVersion", req.minVersion) .detail("FeedID", it.first) .detail("Range", std::get<0>(it.second)) .detail("EmptyVersion", std::get<1>(it.second)) - .detail("StopVersion", std::get<2>(it.second)); + .detail("StopVersion", std::get<2>(it.second)) + .detail("FeedMetadataVersion", std::get<3>(it.second)); } // Make sure all of the metadata we are sending won't get rolled back - if (metadataVersion != invalidVersion && metadataVersion > data->knownCommittedVersion.get()) { + if (metadataWaitVersion != invalidVersion && metadataWaitVersion > data->knownCommittedVersion.get()) { CODE_PROBE(true, "overlapping change feeds waiting for metadata version to be committed"); - wait(data->desiredOldestVersion.whenAtLeast(metadataVersion)); + wait(data->desiredOldestVersion.whenAtLeast(metadataWaitVersion)); } req.reply.send(reply); return Void(); @@ -2366,12 +2420,10 @@ static std::deque>::const_iterator searchChan break; } lastEnd = currentEnd + 1; + jump = std::min((int)(currentEnd - mutations.begin()), jump); currentEnd -= jump; jump <<= 1; } - if (currentEnd < mutations.begin()) { - currentEnd = mutations.begin(); - } auto ret = std::lower_bound(currentEnd, lastEnd, 
searchKey, MutationsAndVersionRef::OrderByVersion()); // TODO REMOVE: for validation if (ret != mutations.end()) { @@ -2498,6 +2550,7 @@ ACTOR Future> getChangeFeedMutations(Stor remainingDurableBytes)); data->counters.kvScanBytes += res.logicalSize(); + ++data->counters.changeFeedDiskReads; if (!inverted && !req.range.empty()) { data->checkChangeCounter(changeCounter, req.range); @@ -2569,21 +2622,37 @@ ACTOR Future> getChangeFeedMutations(Stor } } else if (memoryVerifyIdx < memoryReply.mutations.size() && version == memoryReply.mutations[memoryVerifyIdx].version) { - fmt::print("ERROR: SS {0} CF {1} SQ {2} has mutation at {3} in memory but all filtered out on disk!\n", - data->thisServerID.toString().substr(0, 4), - req.rangeID.printable().substr(0, 6), - streamUID.toString().substr(0, 8), - version); + if (version > feedInfo->storageVersion && version > feedInfo->fetchVersion) { + // Another validation case - feed was popped, data was fetched, fetched data was persisted but pop + // wasn't yet, then SS restarted. Now SS has the data without the popped version. This looks wrong + // here but is fine. 
+ memoryVerifyIdx++; + } else { + fmt::print( + "ERROR: SS {0} CF {1} SQ {2} has mutation at {3} in memory but all filtered out on disk!\n", + data->thisServerID.toString().substr(0, 4), + req.rangeID.printable().substr(0, 6), + streamUID.toString().substr(0, 8), + version); - fmt::print(" Memory: ({})\n", memoryReply.mutations[memoryVerifyIdx].mutations.size()); - for (auto& it : memoryReply.mutations[memoryVerifyIdx].mutations) { - if (it.type == MutationRef::SetValue) { - fmt::print(" {}=\n", it.param1.printable().c_str()); - } else { - fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str()); + fmt::print(" Memory: ({})\n", memoryReply.mutations[memoryVerifyIdx].mutations.size()); + for (auto& it : memoryReply.mutations[memoryVerifyIdx].mutations) { + if (it.type == MutationRef::SetValue) { + fmt::print(" {}=\n", it.param1.printable().c_str()); + } else { + fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str()); + } } + fmt::print(" Disk(pre-filter): ({})\n", mutations.size()); + for (auto& it : mutations) { + if (it.type == MutationRef::SetValue) { + fmt::print(" {}=\n", it.param1.printable().c_str()); + } else { + fmt::print(" {} - {}\n", it.param1.printable().c_str(), it.param2.printable().c_str()); + } + } + ASSERT(false); } - ASSERT(false); } remainingDurableBytes -= sizeof(KeyValueRef) + @@ -5073,6 +5142,8 @@ void applyChangeFeedMutation(StorageServer* self, MutationRef const& m, Version DEBUG_MUTATION("ChangeFeedWriteSet", version, m, self->thisServerID) .detail("Range", it->range) .detail("ChangeFeedID", it->id); + + ++self->counters.changeFeedMutations; } else { CODE_PROBE(version <= it->emptyVersion, "Skip CF write because version <= emptyVersion"); CODE_PROBE(it->removing, "Skip CF write because removing"); @@ -5098,6 +5169,7 @@ void applyChangeFeedMutation(StorageServer* self, MutationRef const& m, Version DEBUG_MUTATION("ChangeFeedWriteClear", version, m, self->thisServerID) 
.detail("Range", it->range) .detail("ChangeFeedID", it->id); + ++self->counters.changeFeedMutations; } else { CODE_PROBE(version <= it->emptyVersion, "Skip CF clear because version <= emptyVersion"); CODE_PROBE(it->removing, "Skip CF clear because removing"); @@ -5353,22 +5425,27 @@ ACTOR Future tryGetRange(PromiseStream results, Transaction* // We have to store the version the change feed was stopped at in the SS instead of just the stopped status // In addition to simplifying stopping logic, it enables communicating stopped status when fetching change feeds // from other SS correctly -const Value changeFeedSSValue(KeyRangeRef const& range, Version popVersion, Version stopVersion) { +const Value changeFeedSSValue(KeyRangeRef const& range, + Version popVersion, + Version stopVersion, + Version metadataVersion) { BinaryWriter wr(IncludeVersion(ProtocolVersion::withChangeFeed())); wr << range; wr << popVersion; wr << stopVersion; + wr << metadataVersion; return wr.toValue(); } -std::tuple decodeChangeFeedSSValue(ValueRef const& value) { +std::tuple decodeChangeFeedSSValue(ValueRef const& value) { KeyRange range; - Version popVersion, stopVersion; + Version popVersion, stopVersion, metadataVersion; BinaryReader reader(value, IncludeVersion()); reader >> range; reader >> popVersion; reader >> stopVersion; - return std::make_tuple(range, popVersion, stopVersion); + reader >> metadataVersion; + return std::make_tuple(range, popVersion, stopVersion, metadataVersion); } ACTOR Future changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req) { @@ -5402,10 +5479,12 @@ ACTOR Future changeFeedPopQ(StorageServer* self, ChangeFeedPopRequest req) auto& mLV = self->addVersionToMutationLog(durableVersion); self->addMutationToMutationLog( mLV, - MutationRef( - MutationRef::SetValue, - persistChangeFeedKeys.begin.toString() + feed->second->id.toString(), - changeFeedSSValue(feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion))); + 
MutationRef(MutationRef::SetValue, + persistChangeFeedKeys.begin.toString() + feed->second->id.toString(), + changeFeedSSValue(feed->second->range, + feed->second->emptyVersion + 1, + feed->second->stopVersion, + feed->second->metadataVersion))); if (feed->second->storageVersion != invalidVersion) { ++self->counters.kvSystemClearRanges; self->addMutationToMutationLog(mLV, @@ -5497,7 +5576,8 @@ ACTOR Future fetchChangeFeedApplier(StorageServer* data, persistChangeFeedKeys.begin.toString() + changeFeedInfo->id.toString(), changeFeedSSValue(changeFeedInfo->range, changeFeedInfo->emptyVersion + 1, - changeFeedInfo->stopVersion))); + changeFeedInfo->stopVersion, + changeFeedInfo->metadataVersion))); data->addMutationToMutationLog( mLV, MutationRef(MutationRef::ClearRange, @@ -5616,8 +5696,10 @@ ACTOR Future fetchChangeFeedApplier(StorageServer* data, mLV, MutationRef(MutationRef::SetValue, persistChangeFeedKeys.begin.toString() + changeFeedInfo->id.toString(), - changeFeedSSValue( - changeFeedInfo->range, changeFeedInfo->emptyVersion + 1, changeFeedInfo->stopVersion))); + changeFeedSSValue(changeFeedInfo->range, + changeFeedInfo->emptyVersion + 1, + changeFeedInfo->stopVersion, + changeFeedInfo->metadataVersion))); data->addMutationToMutationLog(mLV, MutationRef(MutationRef::ClearRange, changeFeedDurableKey(changeFeedInfo->id, 0), @@ -5714,13 +5796,6 @@ ACTOR Future fetchChangeFeed(StorageServer* data, } } - /*fmt::print("DBG: SS {} Feed {} possibly destroyed {}, {} metadata create, {} desired committed\n", - data->thisServerID.toString().substr(0, 4), - changeFeedInfo->id.printable(), - changeFeedInfo->possiblyDestroyed, - changeFeedInfo->metadataCreateVersion, - data->desiredOldestVersion.get());*/ - // There are two reasons for change_feed_not_registered: // 1. The feed was just created, but the ss mutation stream is ahead of the GRV that fetchChangeFeedApplier // uses to read the change feed data from the database. 
In this case we need to wait and retry @@ -5759,7 +5834,7 @@ ACTOR Future fetchChangeFeed(StorageServer* data, data->changeFeedCleanupDurable[changeFeedInfo->id] = cleanupVersion; } - for (auto& it : data->changeFeedRemovals) { + for (auto& it : data->changeFeedDestroys) { it.second.send(changeFeedInfo->id); } @@ -5775,7 +5850,7 @@ ACTOR Future fetchChangeFeed(StorageServer* data, ACTOR Future> fetchChangeFeedMetadata(StorageServer* data, KeyRange keys, - PromiseStream removals, + PromiseStream destroyedFeeds, UID fetchKeysID) { // Wait for current TLog batch to finish to ensure that we're fetching metadata at a version >= the version of the @@ -5789,82 +5864,55 @@ ACTOR Future> fetchChangeFeedMetadata(StorageServer* data, .detail("FetchVersion", fetchVersion) .detail("FKID", fetchKeysID); - state std::set refreshedFeedIds; - state std::set destroyedFeedIds; - // before fetching feeds from other SS's, refresh any feeds we already have that are being marked as removed + state OverlappingChangeFeedsInfo feedMetadata = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion)); + // rest of this actor needs to happen without waits that might yield to scheduler, to avoid races in feed metadata. + + // Find set of feeds we currently have that were not present in fetch, to infer that they may have been destroyed. + state std::unordered_map missingFeeds; auto ranges = data->keyChangeFeed.intersectingRanges(keys); for (auto& r : ranges) { for (auto& cfInfo : r.value()) { - auto feedCleanup = data->changeFeedCleanupDurable.find(cfInfo->id); - if (feedCleanup != data->changeFeedCleanupDurable.end() && cfInfo->removing && !cfInfo->destroyed) { - CODE_PROBE(true, "re-fetching feed scheduled for deletion! 
Un-mark it as removing"); - destroyedFeedIds.insert(cfInfo->id); - - cfInfo->removing = false; - // because we now have a gap in the metadata, it's possible this feed was destroyed - cfInfo->possiblyDestroyed = true; - // Set refreshInProgress, so that if this actor is replaced by an expanded move actor, the new actor - // picks up the refresh - cfInfo->refreshInProgress = true; - // reset fetch versions because everything previously fetched was cleaned up - cfInfo->fetchVersion = invalidVersion; - - cfInfo->durableFetchVersion = NotifiedVersion(); - - TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID) - .detail("RangeID", cfInfo->id) - .detail("Range", cfInfo->range) - .detail("FetchVersion", fetchVersion) - .detail("EmptyVersion", cfInfo->emptyVersion) - .detail("StopVersion", cfInfo->stopVersion) - .detail("FKID", fetchKeysID); - } else if (cfInfo->refreshInProgress) { - CODE_PROBE(true, "Racing refreshes for same change feed in fetch"); - destroyedFeedIds.insert(cfInfo->id); + if (cfInfo->removing && !cfInfo->destroyed) { + missingFeeds.insert({ cfInfo->id, cfInfo->metadataVersion }); } } } - state std::vector feeds = wait(data->cx->getOverlappingChangeFeeds(keys, fetchVersion)); - // handle change feeds removed while fetching overlapping - while (removals.getFuture().isReady()) { - Key remove = waitNext(removals.getFuture()); - for (int i = 0; i < feeds.size(); i++) { - if (feeds[i].rangeId == remove) { - swapAndPop(&feeds, i--); + // handle change feeds destroyed while fetching overlapping info + while (destroyedFeeds.getFuture().isReady()) { + Key destroyed = waitNext(destroyedFeeds.getFuture()); + for (int i = 0; i < feedMetadata.feeds.size(); i++) { + if (feedMetadata.feeds[i].feedId == destroyed) { + missingFeeds.erase(destroyed); // feed definitely destroyed, no need to infer + swapAndPop(&feedMetadata.feeds, i--); } } } std::vector feedIds; - feedIds.reserve(feeds.size()); + feedIds.reserve(feedMetadata.feeds.size()); // create change feed 
metadata if it does not exist - for (auto& cfEntry : feeds) { - auto cleanupEntry = data->changeFeedCleanupDurable.find(cfEntry.rangeId); + for (auto& cfEntry : feedMetadata.feeds) { + auto cleanupEntry = data->changeFeedCleanupDurable.find(cfEntry.feedId); bool cleanupPending = cleanupEntry != data->changeFeedCleanupDurable.end(); - feedIds.push_back(cfEntry.rangeId); - auto existingEntry = data->uidChangeFeed.find(cfEntry.rangeId); + auto existingEntry = data->uidChangeFeed.find(cfEntry.feedId); bool existing = existingEntry != data->uidChangeFeed.end(); TraceEvent(SevDebug, "FetchedChangeFeedInfo", data->thisServerID) - .detail("RangeID", cfEntry.rangeId) + .detail("RangeID", cfEntry.feedId) .detail("Range", cfEntry.range) .detail("FetchVersion", fetchVersion) .detail("EmptyVersion", cfEntry.emptyVersion) .detail("StopVersion", cfEntry.stopVersion) + .detail("FeedMetadataVersion", cfEntry.feedMetadataVersion) .detail("Existing", existing) + .detail("ExistingMetadataVersion", existing ? existingEntry->second->metadataVersion : invalidVersion) .detail("CleanupPendingVersion", cleanupPending ? cleanupEntry->second : invalidVersion) .detail("FKID", fetchKeysID); bool addMutationToLog = false; Reference changeFeedInfo; - auto fid = destroyedFeedIds.find(cfEntry.rangeId); - if (fid != destroyedFeedIds.end()) { - refreshedFeedIds.insert(cfEntry.rangeId); - destroyedFeedIds.erase(fid); - } - if (!existing) { CODE_PROBE(cleanupPending, "Fetch change feed which is cleanup pending. 
This means there was a move away and a move back, " @@ -5872,24 +5920,51 @@ ACTOR Future> fetchChangeFeedMetadata(StorageServer* data, changeFeedInfo = Reference(new ChangeFeedInfo()); changeFeedInfo->range = cfEntry.range; - changeFeedInfo->id = cfEntry.rangeId; + changeFeedInfo->id = cfEntry.feedId; changeFeedInfo->emptyVersion = cfEntry.emptyVersion; changeFeedInfo->stopVersion = cfEntry.stopVersion; - data->uidChangeFeed[cfEntry.rangeId] = changeFeedInfo; + data->uidChangeFeed[cfEntry.feedId] = changeFeedInfo; auto rs = data->keyChangeFeed.modify(cfEntry.range); for (auto r = rs.begin(); r != rs.end(); ++r) { r->value().push_back(changeFeedInfo); } - data->keyChangeFeed.coalesce(cfEntry.range.contents()); + data->keyChangeFeed.coalesce(cfEntry.range); addMutationToLog = true; } else { changeFeedInfo = existingEntry->second; + CODE_PROBE(cfEntry.feedMetadataVersion > data->version.get(), + "Change Feed fetched future metadata version"); + + auto fid = missingFeeds.find(cfEntry.feedId); + if (fid != missingFeeds.end()) { + TraceEvent(SevDebug, "ResetChangeFeedInfo", data->thisServerID) + .detail("RangeID", changeFeedInfo->id.printable()) + .detail("Range", changeFeedInfo->range) + .detail("FetchVersion", fetchVersion) + .detail("EmptyVersion", changeFeedInfo->emptyVersion) + .detail("StopVersion", changeFeedInfo->stopVersion) + .detail("PreviousMetadataVersion", changeFeedInfo->metadataVersion) + .detail("NewMetadataVersion", cfEntry.feedMetadataVersion) + .detail("FKID", fetchKeysID); + + missingFeeds.erase(fid); + ASSERT(!changeFeedInfo->destroyed); + ASSERT(changeFeedInfo->removing); + CODE_PROBE(true, "re-fetching feed scheduled for deletion! 
Un-mark it as removing"); + + changeFeedInfo->removing = false; + // reset fetch versions because everything previously fetched was cleaned up + changeFeedInfo->fetchVersion = invalidVersion; + changeFeedInfo->durableFetchVersion = NotifiedVersion(); + + addMutationToLog = true; + } + if (changeFeedInfo->destroyed) { - // race where multiple feeds fetched overlapping change feed, one realized feed was missing and marked - // it removed+destroyed, then this one fetched the same info + CODE_PROBE(true, "Change feed fetched and destroyed by other fetch while fetching metadata"); continue; } @@ -5909,82 +5984,63 @@ ACTOR Future> fetchChangeFeedMetadata(StorageServer* data, addMutationToLog = true; } } + feedIds.push_back(cfEntry.feedId); + addMutationToLog |= changeFeedInfo->updateMetadataVersion(cfEntry.feedMetadataVersion); if (addMutationToLog) { ASSERT(changeFeedInfo.isValid()); - auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion()); + Version logV = data->data().getLatestVersion(); + auto& mLV = data->addVersionToMutationLog(logV); data->addMutationToMutationLog( mLV, - MutationRef( - MutationRef::SetValue, - persistChangeFeedKeys.begin.toString() + cfEntry.rangeId.toString(), - changeFeedSSValue(cfEntry.range, changeFeedInfo->emptyVersion + 1, changeFeedInfo->stopVersion))); + MutationRef(MutationRef::SetValue, + persistChangeFeedKeys.begin.toString() + cfEntry.feedId.toString(), + changeFeedSSValue(cfEntry.range, + changeFeedInfo->emptyVersion + 1, + changeFeedInfo->stopVersion, + changeFeedInfo->metadataVersion))); // if we updated pop version, remove mutations while (!changeFeedInfo->mutations.empty() && changeFeedInfo->mutations.front().version <= changeFeedInfo->emptyVersion) { changeFeedInfo->mutations.pop_front(); } + if (BUGGIFY) { + data->maybeInjectTargetedRestart(logV); + } } } - CODE_PROBE(!refreshedFeedIds.empty(), "Feed refreshed between move away and move back"); - CODE_PROBE(!destroyedFeedIds.empty(), "Feed destroyed between 
move away and move back"); - for (auto& feedId : refreshedFeedIds) { - auto existingEntry = data->uidChangeFeed.find(feedId); - if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed || - !existingEntry->second->refreshInProgress) { - CODE_PROBE(true, "feed refreshed"); + for (auto& feed : missingFeeds) { + auto existingEntry = data->uidChangeFeed.find(feed.first); + ASSERT(existingEntry != data->uidChangeFeed.end()); + ASSERT(existingEntry->second->removing); + ASSERT(!existingEntry->second->destroyed); + + Version fetchedMetadataVersion = feedMetadata.getFeedMetadataVersion(existingEntry->second->range); + Version lastMetadataVersion = feed.second; + // Look for case where feed's range was moved away, feed was destroyed, and then feed's range was moved back. + // This happens where feed is removing, the fetch metadata is higher than the moved away version, and the feed + // isn't in the fetched response. In that case, the feed must have been destroyed between lastMetadataVersion + // and fetchedMetadataVersion + if (lastMetadataVersion >= fetchedMetadataVersion) { + CODE_PROBE(true, "Change Feed fetched higher metadata version before moved away"); continue; } - // Since cleanup put a mutation in the log to delete the change feed data, put one in the log to restore - // it - // We may just want to refactor this so updateStorage does explicit deletes based on - // changeFeedCleanupDurable and not use the mutation log at all for the change feed metadata cleanup. 
- // Then we wouldn't have to reset anything here or above - // Do the mutation log update here instead of above to ensure we only add it back to the mutation log if we're - // sure it wasn't deleted in the metadata gap - Version metadataVersion = data->data().getLatestVersion(); - auto& mLV = data->addVersionToMutationLog(metadataVersion); - data->addMutationToMutationLog( - mLV, - MutationRef(MutationRef::SetValue, - persistChangeFeedKeys.begin.toString() + existingEntry->second->id.toString(), - changeFeedSSValue(existingEntry->second->range, - existingEntry->second->emptyVersion + 1, - existingEntry->second->stopVersion))); - TraceEvent(SevDebug, "PersistingResetChangeFeedInfo", data->thisServerID) - .detail("RangeID", existingEntry->second->id) - .detail("Range", existingEntry->second->range) - .detail("FetchVersion", fetchVersion) - .detail("EmptyVersion", existingEntry->second->emptyVersion) - .detail("StopVersion", existingEntry->second->stopVersion) - .detail("FKID", fetchKeysID) - .detail("MetadataVersion", metadataVersion); - existingEntry->second->refreshInProgress = false; - } - for (auto& feedId : destroyedFeedIds) { - auto existingEntry = data->uidChangeFeed.find(feedId); - if (existingEntry == data->uidChangeFeed.end() || existingEntry->second->destroyed) { - CODE_PROBE(true, "feed refreshed but then destroyed elsewhere"); - continue; - } - - /*fmt::print("DBG: SS {} fetching feed {} was refreshed but not present!! 
assuming destroyed\n", - data->thisServerID.toString().substr(0, 4), - feedId.printable());*/ Version cleanupVersion = data->data().getLatestVersion(); + CODE_PROBE(true, "Destroying change feed from fetch metadata"); // TraceEvent(SevDebug, "DestroyingChangeFeedFromFetchMetadata", data->thisServerID) - .detail("RangeID", feedId) + .detail("RangeID", feed.first) .detail("Range", existingEntry->second->range) .detail("Version", cleanupVersion) .detail("FKID", fetchKeysID); if (g_network->isSimulated()) { - ASSERT(g_simulator.validationData.allDestroyedChangeFeedIDs.count(feedId.toString())); + // verify that the feed was actually destroyed and it's not an error in this inference logic + ASSERT(g_simulator.validationData.allDestroyedChangeFeedIDs.count(feed.first.toString())); } - Key beginClearKey = feedId.withPrefix(persistChangeFeedKeys.begin); + Key beginClearKey = feed.first.withPrefix(persistChangeFeedKeys.begin); auto& mLV = data->addVersionToMutationLog(cleanupVersion); data->addMutationToMutationLog(mLV, @@ -5992,15 +6048,18 @@ ACTOR Future> fetchChangeFeedMetadata(StorageServer* data, ++data->counters.kvSystemClearRanges; data->addMutationToMutationLog(mLV, MutationRef(MutationRef::ClearRange, - changeFeedDurableKey(feedId, 0), - changeFeedDurableKey(feedId, cleanupVersion))); + changeFeedDurableKey(feed.first, 0), + changeFeedDurableKey(feed.first, cleanupVersion))); ++data->counters.kvSystemClearRanges; existingEntry->second->destroy(cleanupVersion); - data->changeFeedCleanupDurable[feedId] = cleanupVersion; + data->changeFeedCleanupDurable[feed.first] = cleanupVersion; - for (auto& it : data->changeFeedRemovals) { - it.second.send(feedId); + for (auto& it : data->changeFeedDestroys) { + it.second.send(feed.first); + } + if (BUGGIFY) { + data->maybeInjectTargetedRestart(cleanupVersion); } } return feedIds; @@ -6013,7 +6072,7 @@ ACTOR Future> dispatchChangeFeeds(StorageServer KeyRange keys, Version beginVersion, Version endVersion, - PromiseStream 
removals, + PromiseStream destroyedFeeds, std::vector* feedIds, std::unordered_set newFeedIds) { state std::unordered_map feedMaxFetched; @@ -6042,7 +6101,7 @@ ACTOR Future> dispatchChangeFeeds(StorageServer loop { Future nextFeed = Never(); - if (!removals.getFuture().isReady()) { + if (!destroyedFeeds.getFuture().isReady()) { bool done = true; while (!feedFetches.empty()) { if (feedFetches.begin()->second.isReady()) { @@ -6062,11 +6121,11 @@ ACTOR Future> dispatchChangeFeeds(StorageServer } } choose { - when(state Key remove = waitNext(removals.getFuture())) { + when(state Key destroyed = waitNext(destroyedFeeds.getFuture())) { wait(delay(0)); - feedFetches.erase(remove); + feedFetches.erase(destroyed); for (int i = 0; i < feedIds->size(); i++) { - if ((*feedIds)[i] == remove) { + if ((*feedIds)[i] == destroyed) { swapAndPop(feedIds, i--); } } @@ -6077,7 +6136,7 @@ ACTOR Future> dispatchChangeFeeds(StorageServer } catch (Error& e) { if (!data->shuttingDown) { - data->changeFeedRemovals.erase(fetchKeysID); + data->changeFeedDestroys.erase(fetchKeysID); } throw; } @@ -6090,6 +6149,8 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { state Future warningLogger = logFetchKeysWarning(shard); state const double startTime = now(); state Version fetchVersion = invalidVersion; + + state PromiseStream destroyedFeeds; state FetchKeysMetricReporter metricReporter(fetchKeysID, startTime, keys, @@ -6098,17 +6159,27 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { data->counters.bytesFetched, data->counters.kvFetched); + // need to set this at the very start of the fetch, to handle any private change feed destroy mutations we get for + // this key range, that apply to change feeds we don't know about yet because their metadata hasn't been fetched yet + data->changeFeedDestroys[fetchKeysID] = destroyedFeeds; + // delay(0) to force a return to the run loop before the work of fetchKeys is started. 
// This allows adding->start() to be called inline with CSK. - wait(data->coreStarted.getFuture() && delay(0)); + try { + wait(data->coreStarted.getFuture() && delay(0)); - // On SS Reboot, durableVersion == latestVersion, so any mutations we add to the mutation log would be skipped if - // added before latest version advances. - // To ensure this doesn't happen, we wait for version to increase by one if this fetchKeys was initiated by a - // changeServerKeys from restoreDurableState - if (data->version.get() == data->durableVersion.get()) { - wait(data->version.whenAtLeast(data->version.get() + 1)); - wait(delay(0)); + // On SS Reboot, durableVersion == latestVersion, so any mutations we add to the mutation log would be skipped + // if added before latest version advances. To ensure this doesn't happen, we wait for version to increase by + // one if this fetchKeys was initiated by a changeServerKeys from restoreDurableState + if (data->version.get() == data->durableVersion.get()) { + wait(data->version.whenAtLeast(data->version.get() + 1)); + wait(delay(0)); + } + } catch (Error& e) { + if (!data->shuttingDown) { + data->changeFeedDestroys.erase(fetchKeysID); + } + throw e; } try { @@ -6120,9 +6191,8 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { .detail("Version", data->version.get()) .detail("FKID", fetchKeysID); - state PromiseStream removals; - data->changeFeedRemovals[fetchKeysID] = removals; - state Future> fetchCFMetadata = fetchChangeFeedMetadata(data, keys, removals, fetchKeysID); + state Future> fetchCFMetadata = + fetchChangeFeedMetadata(data, keys, destroyedFeeds, fetchKeysID); validate(data); @@ -6145,6 +6215,9 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { TraceEvent(SevDebug, "FetchKeysVersionSatisfied", data->thisServerID).detail("FKID", interval.pairID); + wait(data->fetchKeysParallelismFullLock.take(TaskPriority::DefaultYield)); + state FlowLock::Releaser 
holdingFullFKPL(data->fetchKeysParallelismFullLock); + wait(data->fetchKeysParallelismLock.take(TaskPriority::DefaultYield)); state FlowLock::Releaser holdingFKPL(data->fetchKeysParallelismLock); @@ -6376,8 +6449,14 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // being recovered. Instead we wait for the updateStorage loop to commit something (and consequently also what // we have written) - state Future> feedFetchMain = dispatchChangeFeeds( - data, fetchKeysID, keys, 0, fetchVersion + 1, removals, &changeFeedsToFetch, std::unordered_set()); + state Future> feedFetchMain = dispatchChangeFeeds(data, + fetchKeysID, + keys, + 0, + fetchVersion + 1, + destroyedFeeds, + &changeFeedsToFetch, + std::unordered_set()); state Future fetchDurable = data->durableVersion.whenAtLeast(data->storageVersion() + 1); state Future dataArrive = data->version.whenAtLeast(fetchVersion); @@ -6440,7 +6519,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { keys, fetchVersion + 1, shard->transferredVersion, - removals, + destroyedFeeds, &changeFeedsToFetch, newChangeFeeds); @@ -6494,7 +6573,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { } } - data->changeFeedRemovals.erase(fetchKeysID); + data->changeFeedDestroys.erase(fetchKeysID); shard->phase = AddingShard::Waiting; @@ -6519,6 +6598,9 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { // Note that since it receives a pointer to FetchInjectionInfo, the thread does not leave this actor until this // point. + // At this point change feed fetching and mutation injection is complete, so full fetch is finished. + holdingFullFKPL.release(); + // Wait for the transferred version (and therefore the shard data) to be committed and durable. 
wait(data->durableVersion.whenAtLeast(feedTransferredVersion)); @@ -6547,7 +6629,7 @@ ACTOR Future fetchKeys(StorageServer* data, AddingShard* shard) { .errorUnsuppressed(e) .detail("Version", data->version.get()); if (!data->shuttingDown) { - data->changeFeedRemovals.erase(fetchKeysID); + data->changeFeedDestroys.erase(fetchKeysID); } if (e.code() == error_code_actor_cancelled && !data->shuttingDown && shard->phase >= AddingShard::Fetching) { if (shard->phase < AddingShard::FetchingCF) { @@ -6800,11 +6882,15 @@ void cleanUpChangeFeeds(StorageServer* data, const KeyRangeRef& keys, Version ve auto feed = data->uidChangeFeed.find(f.first); if (feed != data->uidChangeFeed.end()) { + feed->second->updateMetadataVersion(version); feed->second->removing = true; - feed->second->refreshInProgress = false; feed->second->moved(feed->second->range); feed->second->newMutations.trigger(); } + + if (BUGGIFY) { + data->maybeInjectTargetedRestart(durableVersion); + } } else { // if just part of feed's range is moved away auto feed = data->uidChangeFeed.find(f.first); @@ -7425,7 +7511,7 @@ private: .detail("Status", status); // Because of data moves, we can get mutations operating on a change feed we don't yet know about, because - // the fetch hasn't started yet + // the metadata fetch hasn't started yet bool createdFeed = false; if (feed == data->uidChangeFeed.end() && status != ChangeFeedStatus::CHANGE_FEED_DESTROY) { createdFeed = true; @@ -7457,6 +7543,9 @@ private: } data->keyChangeFeed.coalesce(changeFeedRange.contents()); } + if (feed != data->uidChangeFeed.end()) { + feed->second->updateMetadataVersion(currentVersion); + } bool popMutationLog = false; bool addMutationToLog = false; @@ -7518,22 +7607,29 @@ private: feed->second->destroy(currentVersion); data->changeFeedCleanupDurable[feed->first] = cleanupVersion; + + if (BUGGIFY) { + data->maybeInjectTargetedRestart(cleanupVersion); + } } if (status == ChangeFeedStatus::CHANGE_FEED_DESTROY) { - for (auto& it : 
data->changeFeedRemovals) { + for (auto& it : data->changeFeedDestroys) { it.second.send(changeFeedId); } } if (addMutationToLog) { - auto& mLV = data->addVersionToMutationLog(data->data().getLatestVersion()); + Version logV = data->data().getLatestVersion(); + auto& mLV = data->addVersionToMutationLog(logV); data->addMutationToMutationLog( mLV, MutationRef(MutationRef::SetValue, persistChangeFeedKeys.begin.toString() + changeFeedId.toString(), - changeFeedSSValue( - feed->second->range, feed->second->emptyVersion + 1, feed->second->stopVersion))); + changeFeedSSValue(feed->second->range, + feed->second->emptyVersion + 1, + feed->second->stopVersion, + feed->second->metadataVersion))); if (popMutationLog) { ++data->counters.kvSystemClearRanges; data->addMutationToMutationLog(mLV, @@ -7541,6 +7637,9 @@ private: changeFeedDurableKey(feed->second->id, 0), changeFeedDurableKey(feed->second->id, popVersion))); } + if (BUGGIFY) { + data->maybeInjectTargetedRestart(logV); + } } } else if ((m.type == MutationRef::SetValue || m.type == MutationRef::ClearRange) && m.param1.startsWith(TenantMetadata::tenantMapPrivatePrefix)) { @@ -7753,6 +7852,10 @@ ACTOR Future update(StorageServer* data, bool* pReceivedUpdate) { } } + if (data->maybeInjectDelay()) { + wait(delay(deterministicRandom()->random01() * 10.0)); + } + while (data->byteSampleClearsTooLarge.get()) { wait(data->byteSampleClearsTooLarge.onChange()); } @@ -8394,6 +8497,7 @@ ACTOR Future updateStorage(StorageServer* data) { state std::vector updatedChangeFeeds(modifiedChangeFeeds.begin(), modifiedChangeFeeds.end()); state int curFeed = 0; + state int64_t durableChangeFeedMutations = 0; while (curFeed < updatedChangeFeeds.size()) { auto info = data->uidChangeFeed.find(updatedChangeFeeds[curFeed]); if (info != data->uidChangeFeed.end()) { @@ -8412,6 +8516,7 @@ ACTOR Future updateStorage(StorageServer* data) { // in the stream. 
We should fix this assert to be strictly > and re-enable it ASSERT(it.version >= info->second->storageVersion); info->second->storageVersion = it.version; + durableChangeFeedMutations++; } if (info->second->fetchVersion != invalidVersion && !info->second->removing) { @@ -8500,6 +8605,7 @@ ACTOR Future updateStorage(StorageServer* data) { TraceEvent("RebootWhenDurableTriggered", data->thisServerID) .detail("NewOldestVersion", newOldestVersion) .detail("RebootAfterDurableVersion", data->rebootAfterDurableVersion); + CODE_PROBE(true, "SS rebooting after durable"); // To avoid brokenPromise error, which is caused by the sender of the durableInProgress (i.e., this // process) never sets durableInProgress, we should set durableInProgress before send the // please_reboot() error. Otherwise, in the race situation when storage server receives both reboot and @@ -8571,6 +8677,8 @@ ACTOR Future updateStorage(StorageServer* data) { } } + data->counters.changeFeedMutationsDurable += durableChangeFeedMutations; + durableInProgress.send(Void()); wait(delay(0, TaskPriority::UpdateStorage)); // Setting durableInProgess could cause the storage server to // shut down, so delay to check for cancellation @@ -8646,7 +8754,8 @@ void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available) { // ASSERT( self->debug_inApplyUpdate ); ASSERT(!keys.empty()); - auto& mLV = self->addVersionToMutationLog(self->data().getLatestVersion()); + Version logV = self->data().getLatestVersion(); + auto& mLV = self->addVersionToMutationLog(logV); KeyRange availableKeys = KeyRangeRef(persistShardAvailableKeys.begin.toString() + keys.begin.toString(), persistShardAvailableKeys.begin.toString() + keys.end.toString()); @@ -8682,6 +8791,10 @@ void setAvailableStatus(StorageServer* self, KeyRangeRef keys, bool available) { .detail("DeleteVersion", mLV.version + 1); } } + + if (BUGGIFY) { + self->maybeInjectTargetedRestart(logV); + } } void updateStorageShard(StorageServer* data, 
StorageServerShard shard) { @@ -8718,7 +8831,8 @@ void updateStorageShard(StorageServer* data, StorageServerShard shard) { void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned) { ASSERT(!keys.empty()); - auto& mLV = self->addVersionToMutationLog(self->data().getLatestVersion()); + Version logV = self->data().getLatestVersion(); + auto& mLV = self->addVersionToMutationLog(logV); KeyRange assignedKeys = KeyRangeRef(persistShardAssignedKeys.begin.toString() + keys.begin.toString(), persistShardAssignedKeys.begin.toString() + keys.end.toString()); //TraceEvent("SetAssignedStatus", self->thisServerID).detail("Version", mLV.version).detail("RangeBegin", assignedKeys.begin).detail("RangeEnd", assignedKeys.end); @@ -8735,6 +8849,10 @@ void setAssignedStatus(StorageServer* self, KeyRangeRef keys, bool nowAssigned) assignedKeys.end, endAssigned ? LiteralStringRef("1") : LiteralStringRef("0"))); } + + if (BUGGIFY) { + self->maybeInjectTargetedRestart(logV); + } } void StorageServerDisk::clearRange(KeyRangeRef keys) { @@ -9138,13 +9256,15 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor for (feedLoc = 0; feedLoc < changeFeeds.size(); feedLoc++) { Key changeFeedId = changeFeeds[feedLoc].key.removePrefix(persistChangeFeedKeys.begin); KeyRange changeFeedRange; - Version popVersion, stopVersion; - std::tie(changeFeedRange, popVersion, stopVersion) = decodeChangeFeedSSValue(changeFeeds[feedLoc].value); + Version popVersion, stopVersion, metadataVersion; + std::tie(changeFeedRange, popVersion, stopVersion, metadataVersion) = + decodeChangeFeedSSValue(changeFeeds[feedLoc].value); TraceEvent(SevDebug, "RestoringChangeFeed", data->thisServerID) .detail("RangeID", changeFeedId) .detail("Range", changeFeedRange) .detail("StopVersion", stopVersion) - .detail("PopVer", popVersion); + .detail("PopVer", popVersion) + .detail("MetadataVersion", metadataVersion); Reference changeFeedInfo(new ChangeFeedInfo()); changeFeedInfo->range = 
changeFeedRange; changeFeedInfo->id = changeFeedId; @@ -9152,6 +9272,7 @@ ACTOR Future restoreDurableState(StorageServer* data, IKeyValueStore* stor changeFeedInfo->storageVersion = version; changeFeedInfo->emptyVersion = popVersion - 1; changeFeedInfo->stopVersion = stopVersion; + changeFeedInfo->metadataVersion = metadataVersion; data->uidChangeFeed[changeFeedId] = changeFeedInfo; auto rs = data->keyChangeFeed.modify(changeFeedRange); for (auto r = rs.begin(); r != rs.end(); ++r) { diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 9958fbc57a..a82ddecf12 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -385,7 +385,9 @@ ACTOR Future> getWorkloadIface(WorkloadRequest work, wcx.sharedRandomNumber = work.sharedRandomNumber; workload = IWorkloadFactory::create(testName.toString(), wcx); - wait(workload->initialized()); + if (workload) { + wait(workload->initialized()); + } auto unconsumedOptions = checkAllOptionsConsumed(workload ? workload->options : VectorRef()); if (!workload || unconsumedOptions.size()) { diff --git a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp index 7eaaeb63b1..5d3e2605dd 100644 --- a/fdbserver/workloads/BlobGranuleVerifier.actor.cpp +++ b/fdbserver/workloads/BlobGranuleVerifier.actor.cpp @@ -237,57 +237,64 @@ struct BlobGranuleVerifierWorkload : TestWorkload { while (timeTravelIt != timeTravelChecks.end() && currentTime >= timeTravelIt->first) { state OldRead oldRead = timeTravelIt->second; timeTravelChecksMemory -= oldRead.oldResult.expectedSize(); + // advance iterator before doing read, so if it gets error we don't retry it timeTravelIt = timeTravelChecks.erase(timeTravelIt); if (prevPurgeVersion == -1) { prevPurgeVersion = oldRead.v; } - // advance iterator before doing read, so if it gets error we don't retry it - try { - state Version newPurgeVersion = 0; - state bool doPurging = allowPurging && deterministicRandom()->random01() < 
0.5; - if (doPurging) { - Version maxPurgeVersion = oldRead.v; - for (auto& it : timeTravelChecks) { - maxPurgeVersion = std::min(it.second.v, maxPurgeVersion); - } - if (prevPurgeVersion < maxPurgeVersion) { - newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion); - prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion); - Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false)); - wait(cx->waitPurgeGranulesComplete(purgeKey)); - self->purges++; - } else { - doPurging = false; - } + // before doing read, purge just before read version + state Version newPurgeVersion = 0; + state bool doPurging = allowPurging && deterministicRandom()->random01() < 0.5; + if (doPurging) { + CODE_PROBE(true, "BGV considering purge"); + Version maxPurgeVersion = oldRead.v; + for (auto& it : timeTravelChecks) { + maxPurgeVersion = std::min(it.second.v, maxPurgeVersion); } + if (prevPurgeVersion < maxPurgeVersion) { + CODE_PROBE(true, "BGV doing purge"); + newPurgeVersion = deterministicRandom()->randomInt64(prevPurgeVersion, maxPurgeVersion); + prevPurgeVersion = std::max(prevPurgeVersion, newPurgeVersion); + if (BGV_DEBUG) { + fmt::print("BGV Purging @ {0}\n", newPurgeVersion); + } + try { + Key purgeKey = wait(cx->purgeBlobGranules(normalKeys, newPurgeVersion, {}, false)); + if (BGV_DEBUG) { + fmt::print("BGV Purged @ {0}, waiting\n", newPurgeVersion); + } + wait(cx->waitPurgeGranulesComplete(purgeKey)); + } catch (Error& e) { + if (e.code() == error_code_operation_cancelled) { + throw e; + } + // purging shouldn't error, it should retry. 
+ if (BGV_DEBUG) { + fmt::print("Unexpected error {0} purging @ {1}!\n", e.name(), newPurgeVersion); + } + ASSERT(false); + } + CODE_PROBE(true, "BGV purge complete"); + if (BGV_DEBUG) { + fmt::print("BGV Purge complete @ {0}\n", newPurgeVersion); + } + self->purges++; + } else { + doPurging = false; + } + } + + // do time travel read + try { std::pair>> reReadResult = wait(readFromBlob(cx, self->bstore, oldRead.range, 0, oldRead.v)); if (!compareFDBAndBlob(oldRead.oldResult, reReadResult, oldRead.range, oldRead.v, BGV_DEBUG)) { self->mismatches++; } self->timeTravelReads++; - - if (doPurging) { - wait(self->killBlobWorkers(cx, self)); - std::pair>> versionRead = - wait(readFromBlob(cx, self->bstore, oldRead.range, 0, prevPurgeVersion)); - try { - Version minSnapshotVersion = newPurgeVersion; - for (auto& it : versionRead.second) { - minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion); - } - std::pair>> versionRead = - wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1)); - ASSERT(false); - } catch (Error& e) { - if (e.code() == error_code_actor_cancelled) { - throw; - } - ASSERT(e.code() == error_code_blob_granule_transaction_too_old); - } - } } catch (Error& e) { + fmt::print("Error TT: {0}\n", e.name()); if (e.code() == error_code_blob_granule_transaction_too_old) { self->timeTravelTooOld++; // TODO: add debugging info for when this is a failure @@ -297,6 +304,51 @@ struct BlobGranuleVerifierWorkload : TestWorkload { oldRead.v); } } + + // if purged just before read, verify that purge cleaned up data by restarting blob workers and + // reading older than the purge version + if (doPurging) { + wait(self->killBlobWorkers(cx, self)); + if (BGV_DEBUG) { + fmt::print("BGV Reading post-purge [{0} - {1}) @ {2}\n", + oldRead.range.begin.printable(), + oldRead.range.end.printable(), + prevPurgeVersion); + } + // ensure purge version exactly is still readable + std::pair>> versionRead1 = + wait(readFromBlob(cx, self->bstore, 
oldRead.range, 0, prevPurgeVersion)); + if (BGV_DEBUG) { + fmt::print("BGV Post-purge first read:\n"); + printGranuleChunks(versionRead1.second); + } + try { + // read at purgeVersion - 1, should NOT be readable + Version minSnapshotVersion = newPurgeVersion; + for (auto& it : versionRead1.second) { + minSnapshotVersion = std::min(minSnapshotVersion, it.snapshotVersion); + } + if (BGV_DEBUG) { + fmt::print("BGV Reading post-purge again [{0} - {1}) @ {2}\n", + oldRead.range.begin.printable(), + oldRead.range.end.printable(), + minSnapshotVersion - 1); + } + std::pair>> versionRead2 = + wait(readFromBlob(cx, self->bstore, oldRead.range, 0, minSnapshotVersion - 1)); + if (BGV_DEBUG) { + fmt::print("BGV ERROR: data not purged! Read successful!!\n"); + printGranuleChunks(versionRead2.second); + } + ASSERT(false); + } catch (Error& e) { + if (e.code() == error_code_actor_cancelled) { + throw; + } + ASSERT(e.code() == error_code_blob_granule_transaction_too_old); + CODE_PROBE(true, "BGV verified too old after purge"); + } + } } // pick a random range @@ -471,6 +523,8 @@ struct BlobGranuleVerifierWorkload : TestWorkload { // For some reason simulation is still passing when this fails?.. so assert for now ASSERT(result); + // FIXME: if doPurging was set, possibly do one last purge here, and verify it succeeds with no errors + if (self->clientId == 0 && SERVER_KNOBS->BG_ENABLE_MERGING && deterministicRandom()->random01() < 0.1) { CODE_PROBE(true, "BGV clearing database and awaiting merge"); wait(clearAndAwaitMerge(cx, normalKeys)); diff --git a/fdbserver/workloads/ChangeFeedOperations.actor.cpp b/fdbserver/workloads/ChangeFeedOperations.actor.cpp new file mode 100644 index 0000000000..05b1053ccb --- /dev/null +++ b/fdbserver/workloads/ChangeFeedOperations.actor.cpp @@ -0,0 +1,767 @@ +/* + * ChangeFeedOperations.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2022 Apple Inc. 
and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbclient/FDBOptions.g.h" +#include "fdbclient/ManagementAPI.actor.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/SystemData.h" +#include "fdbserver/TesterInterface.actor.h" +#include "fdbserver/workloads/workloads.actor.h" +#include "fdbserver/workloads/BulkSetup.actor.h" +#include "flow/Arena.h" +#include "flow/IRandom.h" +#include "flow/Trace.h" +#include "flow/Util.h" +#include "flow/serialize.h" +#include +#include + +#include "flow/actorcompiler.h" // This must be the last #include. 
+ +// enable to debug specific operations for a given change feed +#define DEBUG_KEY ""_sr + +#define DEBUG_CF(feedKey) (feedKey.printable() == DEBUG_KEY) + +ACTOR Future doPop(Database cx, Key key, Key feedID, Version version, Version* doneOut) { + wait(cx->popChangeFeedMutations(feedID, version)); + if (*doneOut < version) { + *doneOut = version; + } + if (DEBUG_CF(key)) { + fmt::print("DBG) {0} Popped through {1}\n", key.printable(), version); + } + // TODO: could strengthen pop checking by validating that a read immediately after the pop completes has no data + return Void(); +} + +struct FeedTestData : ReferenceCounted, NonCopyable { + Key key; + KeyRange keyRange; + Key feedID; + int nextVal; + Future liveReader; + bool lastCleared = false; + + std::vector> pops; + Version poppingVersion; + Version poppedVersion; + Optional stopVersion; + bool destroying; + bool destroyed; + bool complete; + + int popWindow; + int popDelayWindow; + + std::deque>> writesByVersion; + + // these were all committed + std::deque>> pendingCheck; + NotifiedVersion checkVersion; + + FeedTestData(Key key, bool doPops) + : key(key), keyRange(KeyRangeRef(key, keyAfter(key))), feedID(key.withPrefix(LiteralStringRef("CF"))), nextVal(0), + lastCleared(false), poppingVersion(0), poppedVersion(0), destroying(false), destroyed(false), complete(false), + checkVersion(0) { + if (doPops) { + popWindow = deterministicRandom()->randomExp(1, 8); + popDelayWindow = deterministicRandom()->randomInt(0, 2) * deterministicRandom()->randomExp(1, 4); + } else { + popWindow = -1; + popDelayWindow = -1; + } + } + + Value nextValue() { + std::string v = std::to_string(nextVal); + nextVal++; + return Value(v); + } + + void update(Version version, Optional value) { + if (!stopVersion.present()) { + // if feed is stopped, value should not get read + writesByVersion.push_back({ version, value }); + pendingCheck.push_back(writesByVersion.back()); + checkVersion.set(version); + } + } + + void testComplete() { + 
complete = true; + checkVersion.set(checkVersion.get() + 1); + } + + void pop(Database cx, Version v) { + if (DEBUG_CF(key)) { + fmt::print("DBG) {0} Popping through {1}\n", key.printable(), v); + } + ASSERT(poppingVersion < v); + poppingVersion = v; + while (!writesByVersion.empty() && v > writesByVersion.front().first) { + writesByVersion.pop_front(); + } + while (!pendingCheck.empty() && v > pendingCheck.front().first) { + pendingCheck.pop_front(); + } + pops.push_back(doPop(cx, key, feedID, v, &poppedVersion)); + } +}; + +static void rollbackFeed(Key key, + std::deque>& buffered, + Version version, + MutationRef rollbackMutation) { + Version rollbackVersion; + BinaryReader br(rollbackMutation.param2, Unversioned()); + br >> rollbackVersion; + TraceEvent("ChangeFeedRollback").detail("Key", key).detail("Ver", version).detail("RollbackVer", rollbackVersion); + if (DEBUG_CF(key)) { + fmt::print("DBG) {0} Rolling back {1} -> {2}\n", key.printable(), version, rollbackVersion); + } + while (!buffered.empty() && buffered.back().version > rollbackVersion) { + TraceEvent("ChangeFeedRollbackVer").detail("Ver", buffered.back().version); + buffered.pop_back(); + } +} + +static void checkNextResult(Key key, + std::deque>& buffered, + std::deque>>& checkData) { + // First asserts are checking data is in the form the test is supposed to produce + ASSERT(!buffered.empty()); + ASSERT(buffered.front().mutations.size() == 1); + ASSERT(buffered.front().mutations[0].param1 == key); + + // Below asserts are correctness of change feed invariants. + + // Handle case where txn retried and wrote same value twice. checkData's version is the committed one, so the same + // update may appear at an earlier version. This is fine, as long as it then actually appears at the committed + // version + // TODO: could strengthen this check a bit and only allow it to appear at the lower version if the txn retried on + // commit_unknown_result? 
+ if (checkData.front().first < buffered.front().version) { + fmt::print("ERROR. {0} Check version {1} != {2}.\n Check: {3} {4}\n Buffered: {5} {6}\n", + key.printable(), + checkData.front().first, + buffered.front().version, + checkData.front().second.present() ? "SET" : "CLEAR", + checkData.front().second.present() ? checkData.front().second.get().printable() + : keyAfter(key).printable(), + buffered.front().mutations[0].type == MutationRef::SetValue ? "SET" : "CLEAR", + buffered.front().mutations[0].param2.printable()); + } + ASSERT(checkData.front().first >= buffered.front().version); + + if (checkData.front().second.present()) { + ASSERT(buffered.front().mutations[0].type == MutationRef::SetValue); + ASSERT(buffered.front().mutations[0].param2 == checkData.front().second.get()); + } else { + ASSERT(buffered.front().mutations[0].type == MutationRef::ClearRange); + ASSERT(buffered.front().mutations[0].param2 == keyAfter(key)); + } + + if (checkData.front().first == buffered.front().version) { + checkData.pop_front(); + } + buffered.pop_front(); +} + +ACTOR Future liveReader(Database cx, Reference data, Version begin) { + state Version lastCheckVersion = 0; + state Version nextCheckVersion = 0; + state std::deque> buffered; + state Reference results = makeReference(); + state Future stream = + cx->getChangeFeedStream(results, data->feedID, begin, std::numeric_limits::max(), data->keyRange); + try { + loop { + if (data->complete && data->pendingCheck.empty()) { + return Void(); + } + nextCheckVersion = data->pendingCheck.empty() ? 
invalidVersion : data->pendingCheck.front().first; + choose { + when(Standalone> res = waitNext(results->mutations.getFuture())) { + for (auto& it : res) { + if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) { + rollbackFeed(data->key, buffered, it.version, it.mutations.back()); + } else { + if (it.mutations.size() == 0) { + // FIXME: THIS SHOULD NOT HAPPEN + // FIXME: these are also getting sent past stopVersion!! + } else { + if (data->stopVersion.present()) { + if (it.version > data->stopVersion.get()) { + fmt::print("DBG) {0} Read data with version {1} > stop version {2} ({3})\n", + data->key.printable(), + it.version, + data->stopVersion.get(), + it.mutations.size()); + } + ASSERT(it.version <= data->stopVersion.get()); + } + buffered.push_back(Standalone(it)); + if (DEBUG_CF(data->key)) { + fmt::print("DBG) {0} Live read through {1} ({2})\n", + data->key.printable(), + it.version, + it.mutations.size()); + } + } + } + } + } + when(wait(data->checkVersion.whenAtLeast(lastCheckVersion + 1))) { + // wake loop and start new whenAtLeast whenever checkVersion is set + lastCheckVersion = data->checkVersion.get(); + } + when(wait(data->pendingCheck.empty() ? 
Never() + : results->whenAtLeast(data->pendingCheck.front().first))) { + + if (data->pendingCheck.empty() || data->pendingCheck.front().first > nextCheckVersion) { + // pendingCheck wasn't empty before whenAtLeast, and nextCheckVersion = the front version, so if + // either of these are true, the data was popped concurrently and we can move on to checking the + // next value + CODE_PROBE(true, "popped while waiting for whenAtLeast to check next value"); + continue; + } + while (!buffered.empty() && buffered.front().version < data->poppingVersion) { + CODE_PROBE(true, "live reader ignoring data that is being popped"); + buffered.pop_front(); + } + if (buffered.empty()) { + if (data->poppingVersion < data->pendingCheck.front().first) { + fmt::print("DBG) {0} Buffered empty after ready for check, and data not popped! popped " + "{1}, popping {2}, check {3}\n", + data->key.printable(), + data->poppedVersion, + data->poppingVersion, + data->pendingCheck.front().first); + } + ASSERT(data->poppingVersion >= data->pendingCheck.front().first); + data->pendingCheck.pop_front(); + } else { + Version v = buffered.front().version; + if (DEBUG_CF(data->key)) { + fmt::print("DBG) {0} Live checking through {1}\n", + data->key.printable(), + data->pendingCheck.front().first); + } + checkNextResult(data->key, buffered, data->pendingCheck); + if (DEBUG_CF(data->key)) { + fmt::print("DBG) {0} Live Checked through {1}\n", data->key.printable(), v); + } + + if (data->popDelayWindow >= 0 && data->popWindow >= 0 && + data->writesByVersion.size() == data->popWindow + data->popDelayWindow) { + data->pop(cx, data->writesByVersion[data->popWindow - 1].first + 1); + ASSERT(data->writesByVersion.size() == data->popDelayWindow); + } + } + } + } + } + } catch (Error& e) { + throw e; + } +} + +ACTOR Future historicReader(Database cx, + Reference data, + Version begin, + Version end, + bool skipPopped) { + state std::deque>> checkData; + state std::deque> buffered; + state Reference results = 
makeReference(); + state Future stream = cx->getChangeFeedStream(results, data->feedID, begin, end, data->keyRange); + state Version poppedVersionAtStart = data->poppedVersion; + + if (DEBUG_CF(data->key)) { + fmt::print("DBG) {0} Starting historical read {1} - {2}\n", data->key.printable(), begin, end); + } + + // TODO could cpu optimize this + for (auto& it : data->writesByVersion) { + if (it.first >= end) { + break; + } + if (it.first >= begin) { + checkData.push_back(it); + } + } + + try { + loop { + Standalone> res = waitNext(results->mutations.getFuture()); + for (auto& it : res) { + if (it.mutations.size() == 1 && it.mutations.back().param1 == lastEpochEndPrivateKey) { + rollbackFeed(data->key, buffered, it.version, it.mutations.back()); + } else { + if (it.mutations.size() == 0) { + // FIXME: THIS SHOULD NOT HAPPEN + // FIXME: these are also getting sent past stopVersion!! + } else { + if (data->stopVersion.present()) { + ASSERT(it.version <= data->stopVersion.get()); + } + buffered.push_back(Standalone(it)); + } + } + } + } + } catch (Error& e) { + if (e.code() != error_code_end_of_stream) { + throw; + } + } + + if (skipPopped) { + while (!buffered.empty() && buffered.front().version < data->poppingVersion) { + // ignore data + buffered.pop_front(); + } + while (!checkData.empty() && checkData.front().first < data->poppingVersion) { + checkData.pop_front(); + } + } + + while (!checkData.empty() && !buffered.empty()) { + checkNextResult(data->key, buffered, checkData); + } + // Change feed missing data it should have + ASSERT(checkData.empty()); + // Change feed read extra data it shouldn't have + ASSERT(buffered.empty()); + + // check pop version of cursor + // TODO: this check might not always work if read is for old data and SS is way behind + // FIXME: this check doesn't work for now, probably due to above comment + /*if (data->poppingVersion != 0) { + ASSERT(results->popVersion >= poppedVersionAtStart && results->popVersion <= data->poppingVersion); + 
}*/ + + return Void(); +} + +enum Op { + CREATE_DELETE = 0, + READ = 1, + UPDATE_CLEAR = 2, + STOP = 3, + POP = 4, + OP_COUNT = 5 /* keep this last */ +}; + +struct ChangeFeedOperationsWorkload : TestWorkload { + // test settings + double testDuration; + int operationsPerSecond; + int targetFeeds; + bool clientsDisjointKeyspace; + bool clearKeyWhenDestroy; + double clearFrequency; + int popMode; + + int opWeights[Op::OP_COUNT]; + int totalOpWeight; + + Future client; + std::unordered_set usedKeys; + std::vector> data; + + ChangeFeedOperationsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) { + testDuration = getOption(options, "testDuration"_sr, 60.0); + operationsPerSecond = getOption(options, "opsPerSecond"_sr, 100.0); + int64_t rand = wcx.sharedRandomNumber; + targetFeeds = deterministicRandom()->randomExp(1, 1 + rand % 10); + targetFeeds *= (0.8 + (deterministicRandom()->random01() * 0.4)); + targetFeeds = std::max(1, targetFeeds / clientCount); + rand /= 10; + clientsDisjointKeyspace = rand % 2; + rand /= 2; + clearKeyWhenDestroy = rand % 2; + rand /= 2; + bool doStops = rand % 2; + rand /= 2; + bool noCreateDelete = rand % 10 == 0; + rand /= 10; + popMode = rand % 3; // 0=none, 1=read-driven, 2=op-driven + rand /= 3; + + ASSERT(clientId >= 0); + ASSERT(clientId < clientCount); + ASSERT(clientCount < 255); + + clearFrequency = deterministicRandom()->random01(); + + for (int i = 0; i < Op::OP_COUNT; i++) { + int randWeight = deterministicRandom()->randomExp(0, 5); + ASSERT(randWeight > 0); + opWeights[i] = randWeight; + } + + if (!doStops) { + opWeights[Op::STOP] = 0; + } + if (noCreateDelete) { + opWeights[Op::CREATE_DELETE] = 0; + } + if (popMode != 2) { + opWeights[Op::POP] = 0; + } + + std::string weightString = "|"; + totalOpWeight = 0; + for (int i = 0; i < Op::OP_COUNT; i++) { + totalOpWeight += opWeights[i]; + weightString += std::to_string(opWeights[i]) + "|"; + } + + TraceEvent("ChangeFeedOperationsInit") + .detail("TargetFeeds", targetFeeds) 
+ .detail("DisjointKeyspace", clientsDisjointKeyspace) + .detail("ClearWhenDestroy", clearKeyWhenDestroy) + .detail("DoStops", doStops) + .detail("NoCreateDelete", noCreateDelete) + .detail("Weights", weightString); + } + + Key unusedNewRandomKey() { + while (true) { + Key k = newRandomKey(); + if (usedKeys.insert(k).second) { + return k; + } + } + } + + Key newRandomKey() { + if (clientsDisjointKeyspace) { + double keyspaceRange = (1.0 / clientCount); + double randPartOfRange = deterministicRandom()->random01() * (keyspaceRange - 0.0001); + double randomDouble = clientId * keyspaceRange + 0.0001 + randPartOfRange; + return doubleToTestKey(randomDouble); + } else { + // this is kinda hacky but it guarantees disjoint keys per client + Key ret = doubleToTestKey(deterministicRandom()->random01()); + std::string str = ret.toString(); + str.back() = (uint8_t)clientId; + return Key(str); + } + } + + // Pick op with weighted average + Op pickRandomOp() { + int r = deterministicRandom()->randomInt(0, totalOpWeight); + int i = 0; + while (i < Op::OP_COUNT && (opWeights[i] <= r || opWeights[i] == 0)) { + r -= opWeights[i]; + i++; + } + ASSERT(i < Op::OP_COUNT); + return (Op)i; + } + + ACTOR Future createNewFeed(Database cx, ChangeFeedOperationsWorkload* self) { + state Transaction tr(cx); + state Key key = self->unusedNewRandomKey(); + state Reference feedData = makeReference(key, self->popMode == 1); + state Value initialValue = feedData->nextValue(); + + if (DEBUG_CF(key)) { + fmt::print("DBG) Creating {0}\n", key.printable()); + } + + loop { + try { + tr.set(key, initialValue); + wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_CREATE, feedData->keyRange)); + wait(tr.commit()); + + Version createVersion = tr.getCommittedVersion(); + if (DEBUG_CF(key)) { + fmt::print("DBG) Created {0} @ {1}\n", key.printable(), createVersion); + } + feedData->update(createVersion, initialValue); + feedData->liveReader = liveReader(cx, feedData, createVersion); + + 
self->data.push_back(feedData); + + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + std::string description() const override { return "ChangeFeedOperationsWorkload"; } + Future setup(Database const& cx) override { return _setup(cx, this); } + + ACTOR Future _setup(Database cx, ChangeFeedOperationsWorkload* self) { + // create initial targetFeeds feeds + TraceEvent("ChangeFeedOperationsSetup").detail("InitialFeeds", self->targetFeeds).log(); + state int i; + for (i = 0; i < self->targetFeeds; i++) { + wait(self->createNewFeed(cx, self)); + } + TraceEvent("ChangeFeedOperationsSetupComplete"); + return Void(); + } + + Future start(Database const& cx) override { + client = changeFeedOperationsClient(cx->clone(), this); + return delay(testDuration); + } + Future check(Database const& cx) override { + client = Future(); + return _check(cx, this); + } + + ACTOR Future checkFeed(Database cx, ChangeFeedOperationsWorkload* self, Reference feedData) { + state int popIdx; + feedData->testComplete(); + + if (DEBUG_CF(feedData->key)) { + fmt::print("Final check {0} waiting on live reader\n", feedData->key.printable()); + } + // wait on live reader and pops to make sure they complete without error + wait(feedData->liveReader); + if (DEBUG_CF(feedData->key)) { + fmt::print("Final check {0} waiting on {1} pops\n", feedData->key.printable(), feedData->pops.size()); + } + for (popIdx = 0; popIdx < feedData->pops.size(); popIdx++) { + wait(feedData->pops[popIdx]); + } + + // do final check, read everything not popped + if (DEBUG_CF(feedData->key)) { + fmt::print("Final check {0} waiting on data check\n", feedData->key.printable(), feedData->pops.size()); + } + wait(self->doRead(cx, feedData, feedData->writesByVersion.size())); + + // ensure reading [0, poppedVersion) returns no results + if (feedData->poppedVersion > 0) { + if (DEBUG_CF(feedData->key)) { + fmt::print( + "Final check {0} waiting on read popped check\n", feedData->key.printable(), 
feedData->pops.size()); + } + // FIXME: re-enable checking for popped data by changing skipPopped back to false! + wait(historicReader(cx, feedData, 0, feedData->poppedVersion, true)); + } + + return Void(); + } + + ACTOR Future _check(Database cx, ChangeFeedOperationsWorkload* self) { + TraceEvent("ChangeFeedOperationsCheck").detail("FeedCount", self->data.size()).log(); + fmt::print("Checking {0} feeds\n", self->data.size()); // TODO REMOVE + state std::vector> feedChecks; + for (int i = 0; i < self->data.size(); i++) { + if (self->data[i]->destroying) { + continue; + } + if (DEBUG_CF(self->data[i]->key)) { + fmt::print("Final check {0}\n", self->data[i]->key.printable()); + } + feedChecks.push_back(self->checkFeed(cx, self, self->data[i])); + } + wait(waitForAll(feedChecks)); + // FIXME: check that all destroyed feeds are actually destroyed? + TraceEvent("ChangeFeedOperationsCheckComplete"); + return true; + } + + void getMetrics(std::vector& m) override {} + + ACTOR Future stopFeed(Database cx, Reference feedData) { + state Transaction tr(cx); + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Stopping\n", feedData->key.printable()); + } + loop { + try { + wait(updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_STOP, feedData->keyRange)); + wait(tr.commit()); + + Version stopVersion = tr.getCommittedVersion(); + if (!feedData->stopVersion.present()) { + feedData->stopVersion = stopVersion; + } + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Stopped @ {1}\n", feedData->key.printable(), stopVersion); + } + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + void popFeed(Database cx, Reference feedData) { + if (!feedData->writesByVersion.empty()) { + feedData->pop(cx, feedData->writesByVersion.front().first + 1); + } + } + + ACTOR Future destroyFeed(Database cx, ChangeFeedOperationsWorkload* self, int feedIdx) { + state Reference feedData = self->data[feedIdx]; + state Transaction tr(cx); + 
feedData->destroying = true; + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Destroying\n", feedData->key.printable()); + } + loop { + try { + wait( + updateChangeFeed(&tr, feedData->feedID, ChangeFeedStatus::CHANGE_FEED_DESTROY, feedData->keyRange)); + if (self->clearKeyWhenDestroy) { + tr.clear(feedData->key); + } + wait(tr.commit()); + + feedData->destroyed = true; + // remove feed from list + ASSERT(self->data[feedIdx]->key == feedData->key); + swapAndPop(&self->data, feedIdx); + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Destroyed @ {1}\n", feedData->key.printable(), tr.getCommittedVersion()); + } + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + ACTOR Future doRead(Database cx, Reference feedData, int targetReadWidth) { + if (feedData->writesByVersion.empty()) { + return Void(); + } + Version beginVersion; + Version endVersion; + if (targetReadWidth >= feedData->writesByVersion.size()) { + beginVersion = feedData->writesByVersion.front().first; + endVersion = feedData->writesByVersion.back().first + 1; + } else { + // either up to or including end + int randStart = deterministicRandom()->randomInt(0, feedData->writesByVersion.size() - targetReadWidth); + beginVersion = feedData->writesByVersion[randStart].first; + int end = randStart + targetReadWidth; + if (end == feedData->writesByVersion.size()) { + endVersion = feedData->writesByVersion.back().first + 1; + } else { + // Make sure last included value (end version -1) is a committed version for checking + endVersion = feedData->writesByVersion[end].first + 1; + } + } + + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Reading @ {1} - {2}\n", feedData->key.printable(), beginVersion, endVersion); + } + + // FIXME: this sometimes reads popped data! 
+ wait(historicReader(cx, feedData, beginVersion, endVersion, true)); + + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Read complete\n", feedData->key.printable()); + } + + return Void(); + } + + ACTOR Future doUpdateClear(Database cx, + ChangeFeedOperationsWorkload* self, + Reference feedData) { + state Transaction tr(cx); + state Optional updateValue; + + // if value is already not set, don't do a clear, otherwise pick either + if (feedData->lastCleared || deterministicRandom()->random01() > self->clearFrequency) { + updateValue = feedData->nextValue(); + if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Setting {1}\n", feedData->key.printable(), updateValue.get().printable()); + } + } else if (DEBUG_CF(feedData->key)) { + fmt::print("DBG) {0} Clearing\n", feedData->key.printable()); + } + loop { + try { + if (updateValue.present()) { + tr.set(feedData->key, updateValue.get()); + } else { + tr.clear(feedData->key); + } + + wait(tr.commit()); + + Version writtenVersion = tr.getCommittedVersion(); + + if (DEBUG_CF(feedData->key) && updateValue.present()) { + fmt::print("DBG) {0} Set {1} @ {2}\n", + feedData->key.printable(), + updateValue.get().printable(), + writtenVersion); + } + if (DEBUG_CF(feedData->key) && !updateValue.present()) { + fmt::print("DBG) {0} Cleared @ {1}\n", feedData->key.printable(), writtenVersion); + } + + feedData->update(writtenVersion, updateValue); + return Void(); + } catch (Error& e) { + wait(tr.onError(e)); + } + } + } + + ACTOR Future changeFeedOperationsClient(Database cx, ChangeFeedOperationsWorkload* self) { + state double last = now(); + loop { + state Future waitNextOp = poisson(&last, 1.0 / self->operationsPerSecond); + Op op = self->pickRandomOp(); + int feedIdx = deterministicRandom()->randomInt(0, self->data.size()); + if (op == Op::CREATE_DELETE) { + // bundle these together so random creates/deletes keep about the target number of feeds + if (deterministicRandom()->random01() < 0.5 || self->data.size() == 1) { 
+ wait(self->createNewFeed(cx, self)); + } else { + wait(self->destroyFeed(cx, self, feedIdx)); + } + } else if (op == Op::READ) { + // relatively small random read + wait(self->doRead(cx, self->data[feedIdx], deterministicRandom()->randomExp(2, 8))); + } else if (op == Op::UPDATE_CLEAR) { + wait(self->doUpdateClear(cx, self, self->data[feedIdx])); + } else if (op == Op::STOP) { + wait(self->stopFeed(cx, self->data[feedIdx])); + } else if (op == Op::POP) { + self->popFeed(cx, self->data[feedIdx]); + } else { + ASSERT(false); + } + + wait(waitNextOp); + } + } +}; + +WorkloadFactory ChangeFeedOperationsWorkloadFactory("ChangeFeedOperations"); diff --git a/fdbserver/workloads/PhysicalShardMove.actor.cpp b/fdbserver/workloads/PhysicalShardMove.actor.cpp index 2c59de681d..8d2b90ff78 100644 --- a/fdbserver/workloads/PhysicalShardMove.actor.cpp +++ b/fdbserver/workloads/PhysicalShardMove.actor.cpp @@ -325,6 +325,7 @@ struct PhysicalShardMoveWorkLoad : TestWorkload { TraceEvent("TestCancelDataMoveEnd").detail("DataMove", dataMove.toString()); } + TraceEvent("TestMoveShardStartMoveKeys").detail("DataMove", dataMoveId); wait(moveKeys(cx, dataMoveId, keys, diff --git a/fdbserver/workloads/SkewedReadWrite.actor.cpp b/fdbserver/workloads/SkewedReadWrite.actor.cpp index 78576f957f..7019fc1065 100644 --- a/fdbserver/workloads/SkewedReadWrite.actor.cpp +++ b/fdbserver/workloads/SkewedReadWrite.actor.cpp @@ -215,7 +215,8 @@ struct SkewedReadWriteWorkload : ReadWriteCommon { self->startReadWriteClients(cx, clients); wait(timeout(waitForAll(clients), self->testDuration / self->skewRound, Void())); clients.clear(); - wait(delay(5.0) >> updateServerShards(cx, self)); + wait(delay(5.0)); + wait(updateServerShards(cx, self)); } return Void(); diff --git a/flow/include/flow/IRandom.h b/flow/include/flow/IRandom.h index 1430fb018d..bc628c64e1 100644 --- a/flow/include/flow/IRandom.h +++ b/flow/include/flow/IRandom.h @@ -173,6 +173,19 @@ public: } bool coinflip() { return (this->random01() 
< 0.5); } + + // Picks a number between 2^minExp and 2^maxExp, but uniformly distributed over exponential buckets 2^n - 2^n+1 + // For example, randomExp(0, 4) would have a 25% chance of returning 1, a 25% chance of returning 2-3, a 25% chance + // of returning 4-7, and a 25% chance of returning 8-15 + // Similar in Expected Value to doing 1 << randomInt(minExp, maxExp+1), except numbers returned aren't just powers + // of 2 + int randomExp(int minExp, int maxExp) { + if (minExp == maxExp) { // N=2, case + return 1 << minExp; + } + int val = 1 << this->randomInt(minExp, maxExp); + return this->randomInt(val, val * 2); + } }; extern FILE* randLog; diff --git a/flow/include/flow/ProtocolVersion.h b/flow/include/flow/ProtocolVersion.h index 98996f2496..f839fce2b0 100644 --- a/flow/include/flow/ProtocolVersion.h +++ b/flow/include/flow/ProtocolVersion.h @@ -174,6 +174,7 @@ public: // introduced features PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, EncryptionAtRest); PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, ShardEncodeLocationMetaData); PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, Tenants); + PROTOCOL_VERSION_FEATURE(0x0FDB00B072000000LL, BlobGranuleFile); }; template <> diff --git a/flow/include/flow/genericactors.actor.h b/flow/include/flow/genericactors.actor.h index 92afe90643..9142945cb6 100644 --- a/flow/include/flow/genericactors.actor.h +++ b/flow/include/flow/genericactors.actor.h @@ -1978,22 +1978,25 @@ Future()(std::declval()).getValue())> runAfter(Fut return res; } -ACTOR template -Future runAfter(Future lhs, Future rhs) { - T val1 = wait(lhs); - U res = wait(rhs); - return res; -} - template auto operator>>=(Future lhs, Fun&& rhs) -> Future()))> { return runAfter(lhs, std::forward(rhs)); } +/* + * NOTE: This implementation doesn't really guarantee the ACTOR execution order. 
See issue #7708 +ACTOR template +Future runAfter(Future lhs, Future rhs) { + T val1 = wait(lhs); + U res = wait(rhs); + return res; +} + template Future operator>>(Future const& lhs, Future const& rhs) { - return runAfter(lhs, rhs); + return runAfter(lhs, rhs); } + */ /* * IAsyncListener is similar to AsyncVar, but it decouples the input and output, so the translation unit diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9efe533f97..28896e4bff 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -130,8 +130,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/BackupToDBCorrectnessClean.toml) add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmall.toml) add_fdb_test(TEST_FILES fast/BlobGranuleVerifySmallClean.toml) - add_fdb_test(TEST_FILES fast/BlobGranuleVerifyAtomicOps.toml) - add_fdb_test(TEST_FILES fast/BlobGranuleVerifyCycle.toml) + add_fdb_test(TEST_FILES fast/BlobGranuleMoveVerifyCycle.toml) add_fdb_test(TEST_FILES fast/CacheTest.toml) add_fdb_test(TEST_FILES fast/CloggedSideband.toml) add_fdb_test(TEST_FILES fast/CompressionUtilsUnit.toml) @@ -140,6 +139,7 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/CycleAndLock.toml) add_fdb_test(TEST_FILES fast/CycleTest.toml) add_fdb_test(TEST_FILES fast/ChangeFeeds.toml) + add_fdb_test(TEST_FILES fast/ChangeFeedOperations.toml) add_fdb_test(TEST_FILES fast/DataLossRecovery.toml) add_fdb_test(TEST_FILES fast/EncryptionOps.toml) # TODO: fix failures and renable the test @@ -199,6 +199,8 @@ if(WITH_PYTHON) add_fdb_test(TEST_FILES fast/PhysicalShardMove.toml IGNORE) add_fdb_test(TEST_FILES fast/StorageServerCheckpointRestore.toml IGNORE) endif() + add_fdb_test(TEST_FILES rare/BlobGranuleVerifyAtomicOps.toml) + add_fdb_test(TEST_FILES rare/BlobGranuleVerifyCycle.toml) add_fdb_test(TEST_FILES rare/CheckRelocation.toml) add_fdb_test(TEST_FILES rare/ClogUnclog.toml) add_fdb_test(TEST_FILES rare/CloggedCycleWithKills.toml) diff --git a/tests/fast/BlobGranuleMoveVerifyCycle.toml 
b/tests/fast/BlobGranuleMoveVerifyCycle.toml new file mode 100644 index 0000000000..43d524d534 --- /dev/null +++ b/tests/fast/BlobGranuleMoveVerifyCycle.toml @@ -0,0 +1,48 @@ +[configuration] +blobGranulesEnabled = true +allowDefaultTenant = false +# FIXME: re-enable rocks at some point +storageEngineExcludeTypes = [4] + +[[knobs]] +bg_range_source = "blobRangeKeys" + +[[test]] +testTitle = 'BlobGranuleMoveVerifyCycle' + + [[test.workload]] + testName = 'Cycle' + transactionsPerSecond = 250.0 + testDuration = 60.0 + expectedRate = 0 + + [[test.workload]] + testName = 'RandomMoveKeys' + testDuration = 60.0 + + [[test.workload]] + testName = 'BlobGranuleVerifier' + testDuration = 60.0 + + [[test.workload]] + testName = 'RandomClogging' + testDuration = 60.0 + + [[test.workload]] + testName = 'Rollback' + meanDelay = 60.0 + testDuration = 60.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 60.0 + + [[test.workload]] + testName = 'Attrition' + machinesToKill = 10 + machinesToLeave = 3 + reboot = true + testDuration = 60.0 diff --git a/tests/fast/BlobGranuleVerifySmall.toml b/tests/fast/BlobGranuleVerifySmall.toml index e7821e5c9c..df2ae14ffd 100644 --- a/tests/fast/BlobGranuleVerifySmall.toml +++ b/tests/fast/BlobGranuleVerifySmall.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: exclude redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [3, 4, 5] diff --git a/tests/fast/BlobGranuleVerifySmallClean.toml b/tests/fast/BlobGranuleVerifySmallClean.toml index ca4b5f81cf..840e6198a4 100644 --- a/tests/fast/BlobGranuleVerifySmallClean.toml +++ b/tests/fast/BlobGranuleVerifySmallClean.toml @@ -3,7 +3,7 @@ blobGranulesEnabled = true allowDefaultTenant = false # FIXME: exclude 
redwood because WriteDuringRead can write massive KV pairs and we don't chunk change feed data on disk yet # FIXME: re-enable rocks at some point -storageEngineExcludeTypes = [3, 4] +storageEngineExcludeTypes = [3, 4, 5] [[knobs]] bg_range_source = "blobRangeKeys" diff --git a/tests/fast/ChangeFeedOperations.toml b/tests/fast/ChangeFeedOperations.toml new file mode 100644 index 0000000000..3ba89beed0 --- /dev/null +++ b/tests/fast/ChangeFeedOperations.toml @@ -0,0 +1,10 @@ +[configuration] +allowDefaultTenant = false + +# TODO add failure events, and then add a version that also supports randomMoveKeys + +[[test]] +testTitle = 'ChangeFeedOperationsTest' + + [[test.workload]] + testName = 'ChangeFeedOperations' diff --git a/tests/fast/BlobGranuleVerifyAtomicOps.toml b/tests/rare/BlobGranuleVerifyAtomicOps.toml similarity index 94% rename from tests/fast/BlobGranuleVerifyAtomicOps.toml rename to tests/rare/BlobGranuleVerifyAtomicOps.toml index 184fac36ce..408a5ded50 100644 --- a/tests/fast/BlobGranuleVerifyAtomicOps.toml +++ b/tests/rare/BlobGranuleVerifyAtomicOps.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [4, 5] diff --git a/tests/fast/BlobGranuleVerifyCycle.toml b/tests/rare/BlobGranuleVerifyCycle.toml similarity index 94% rename from tests/fast/BlobGranuleVerifyCycle.toml rename to tests/rare/BlobGranuleVerifyCycle.toml index a4acbce43b..e0ec5524d7 100644 --- a/tests/fast/BlobGranuleVerifyCycle.toml +++ b/tests/rare/BlobGranuleVerifyCycle.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [4, 5] diff --git a/tests/slow/BlobGranuleCorrectness.toml b/tests/slow/BlobGranuleCorrectness.toml index cd30c64e2c..94b5507689 100644 --- 
a/tests/slow/BlobGranuleCorrectness.toml +++ b/tests/slow/BlobGranuleCorrectness.toml @@ -2,6 +2,8 @@ blobGranulesEnabled = true allowDefaultTenant = false allowDisablingTenants = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [4, 5] diff --git a/tests/slow/BlobGranuleVerifyBalance.toml b/tests/slow/BlobGranuleVerifyBalance.toml index f04d977484..cb138f1350 100644 --- a/tests/slow/BlobGranuleVerifyBalance.toml +++ b/tests/slow/BlobGranuleVerifyBalance.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [4, 5] diff --git a/tests/slow/BlobGranuleVerifyLarge.toml b/tests/slow/BlobGranuleVerifyLarge.toml index 4ad2a806eb..7ee7b52364 100644 --- a/tests/slow/BlobGranuleVerifyLarge.toml +++ b/tests/slow/BlobGranuleVerifyLarge.toml @@ -1,6 +1,8 @@ [configuration] blobGranulesEnabled = true allowDefaultTenant = false +injectTargetedSSRestart = true +injectSSDelay = true # FIXME: re-enable rocks at some point storageEngineExcludeTypes = [4, 5]