From 4351c4d15968b614baf3339cab0b518c89be237d Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 2 Jul 2019 00:58:43 -0700 Subject: [PATCH 001/184] Removed use of the C "struct hack" as it is not valid C++. Replaced zero-length members with functions returning a pointer for arrays or a reference for single members. --- fdbserver/DeltaTree.h | 56 ++++++++++++++++++------------ fdbserver/VersionedBTree.actor.cpp | 50 ++++++++++++++++---------- 2 files changed, 66 insertions(+), 40 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 4a9bee5c98..6797d87a77 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -69,6 +69,7 @@ // // Retrieves the previously stored boolean // bool getPrefixSource() const; // +#pragma pack(push,1) template struct DeltaTree { @@ -76,36 +77,47 @@ struct DeltaTree { return std::numeric_limits::max(); }; -#pragma pack(push,1) struct Node { OffsetT leftChildOffset; OffsetT rightChildOffset; - DeltaT delta[0]; + + inline DeltaT & delta() { + return *(DeltaT *)(this + 1); + }; + + inline const DeltaT & delta() const { + return *(const DeltaT *)(this + 1); + }; Node * rightChild() const { - //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta->size()); - return rightChildOffset == 0 ? nullptr : (Node *)((uint8_t *)delta + rightChildOffset); + //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size()); + return rightChildOffset == 0 ? nullptr : (Node *)((uint8_t *)&delta() + rightChildOffset); } Node * leftChild() const { - //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta->size()); - return leftChildOffset == 0 ? 
nullptr : (Node *)((uint8_t *)delta + leftChildOffset); + //printf("Node(%p): leftOffset=%d rightOffset=%d deltaSize=%d\n", this, (int)leftChildOffset, (int)rightChildOffset, (int)delta().size()); + return leftChildOffset == 0 ? nullptr : (Node *)((uint8_t *)&delta() + leftChildOffset); } int size() const { - return sizeof(Node) + delta->size(); + return sizeof(Node) + delta().size(); } }; -#pragma pack(pop) -#pragma pack(push,1) struct { OffsetT nodeBytes; // Total size of all Nodes including the root uint8_t initialDepth; // Levels in the tree as of the last rebuild - Node root[0]; }; #pragma pack(pop) + inline Node & root() { + return *(Node *)(this + 1); + } + + inline const Node & root() const { + return *(const Node *)(this + 1); + } + int size() const { return sizeof(DeltaTree) + nodeBytes; } @@ -119,18 +131,18 @@ public: struct DecodedNode { DecodedNode(Node *raw, const T *prev, const T *next, Arena &arena) : raw(raw), parent(nullptr), left(nullptr), right(nullptr), prev(prev), next(next), - item(raw->delta->apply(raw->delta->getPrefixSource() ? *prev : *next, arena)) + item(raw->delta().apply(raw->delta().getPrefixSource() ? *prev : *next, arena)) { - //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta->toString().c_str()); + //printf("DecodedNode1 raw=%p delta=%s\n", raw, raw->delta().toString().c_str()); } DecodedNode(Node *raw, DecodedNode *parent, bool left, Arena &arena) : parent(parent), raw(raw), left(nullptr), right(nullptr), prev(left ? parent->prev : &parent->item), next(left ? &parent->item : parent->next), - item(raw->delta->apply(raw->delta->getPrefixSource() ? *prev : *next, arena)) + item(raw->delta().apply(raw->delta().getPrefixSource() ? 
*prev : *next, arena)) { - //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta->toString().c_str()); + //printf("DecodedNode2 raw=%p delta=%s\n", raw, raw->delta().toString().c_str()); } Node *raw; @@ -175,7 +187,7 @@ public: lower = new(arena) T(arena, *lower); upper = new(arena) T(arena, *upper); - root = (tree->nodeBytes == 0) ? nullptr : new (arena) DecodedNode(tree->root, lower, upper, arena); + root = (tree->nodeBytes == 0) ? nullptr : new (arena) DecodedNode(&tree->root(), lower, upper, arena); } const T *lowerBound() const { @@ -330,7 +342,7 @@ public: // The boundary leading to the new page acts as the last time we branched right if(begin != end) { - nodeBytes = build(*root, begin, end, prev, next); + nodeBytes = build(root(), begin, end, prev, next); } else { nodeBytes = 0; @@ -341,7 +353,7 @@ public: private: static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next) { //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); - //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), root.delta); + //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta()); ASSERT(end != begin); int count = end - begin; @@ -370,12 +382,12 @@ private: base = next; } - int deltaSize = item.writeDelta(*root.delta, *base, commonPrefix); - root.delta->setPrefixSource(prefixSourcePrev); - //printf("Serialized %s to %p\n", item.toString().c_str(), root.delta); + int deltaSize = item.writeDelta(root.delta(), *base, commonPrefix); + root.delta().setPrefixSource(prefixSourcePrev); + //printf("Serialized %s to %p\n", item.toString().c_str(), &root.delta()); // Continue writing after the serialized Delta. 
- uint8_t *wptr = (uint8_t *)root.delta + deltaSize; + uint8_t *wptr = (uint8_t *)&root.delta() + deltaSize; // Serialize left child if(count > 1) { @@ -388,7 +400,7 @@ private: // Serialize right child if(count > 2) { - root.rightChildOffset = wptr - (uint8_t *)root.delta; + root.rightChildOffset = wptr - (uint8_t *)&root.delta(); wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next); } else { diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5834687548..a926926d5a 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -431,7 +431,14 @@ struct RedwoodRecordRef { }; uint8_t flags; - byte data[]; + + inline byte * data() { + return (byte *)(this + 1); + } + + inline const byte * data() const { + return (const byte *)(this + 1); + } void setPrefixSource(bool val) { if(val) { @@ -447,7 +454,7 @@ struct RedwoodRecordRef { } RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { - Reader r(data); + Reader r(data()); int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS; int prefixLen = r.readVarInt(); @@ -501,19 +508,19 @@ struct RedwoodRecordRef { } int size() const { - Reader r(data); + Reader r(data()); int intFieldSuffixLen = flags & INT_FIELD_SUFFIX_BITS; r.readVarInt(); // prefixlen int valueLen = (flags & HAS_VALUE) ? r.read() : 0; int keySuffixLen = (flags & HAS_KEY_SUFFIX) ? r.readVarInt() : 0; - return sizeof(Delta) + r.rptr - data + intFieldSuffixLen + valueLen + keySuffixLen; + return sizeof(Delta) + r.rptr - data() + intFieldSuffixLen + valueLen + keySuffixLen; } // Delta can't be determined without the RedwoodRecordRef upon which the Delta is based. 
std::string toString() const { - Reader r(data); + Reader r(data()); std::string flagString = " "; if(flags & PREFIX_SOURCE) flagString += "prefixSource "; @@ -638,7 +645,7 @@ struct RedwoodRecordRef { commonPrefix = getCommonPrefixLen(base, 0); } - Writer w(d.data); + Writer w(d.data()); // prefixLen w.writeVarInt(commonPrefix); @@ -688,7 +695,7 @@ struct RedwoodRecordRef { w.writeString(value.get()); } - return w.wptr - d.data + sizeof(Delta); + return w.wptr - d.data() + sizeof(Delta); } template @@ -737,10 +744,17 @@ struct BTreePage { uint16_t count; uint32_t kvBytes; uint8_t extensionPageCount; - LogicalPageID extensionPages[0]; }; #pragma pack(pop) + inline LogicalPageID * extensionPages() { + return (LogicalPageID *)(this + 1); + } + + inline const LogicalPageID * extensionPages() const { + return (const LogicalPageID *)(this + 1); + } + int size() const { const BinaryTree *t = &tree(); return (uint8_t *)t - (uint8_t *)this + t->size(); @@ -751,15 +765,15 @@ struct BTreePage { } BinaryTree & tree() { - return *(BinaryTree *)(extensionPages + extensionPageCount); + return *(BinaryTree *)(extensionPages() + extensionPageCount); } const BinaryTree & tree() const { - return *(const BinaryTree *)(extensionPages + extensionPageCount); + return *(const BinaryTree *)(extensionPages() + extensionPageCount); } static inline int GetHeaderSize(int extensionPages = 0) { - return sizeof(BTreePage) + extensionPages + sizeof(LogicalPageID); + return sizeof(BTreePage) + (extensionPages * sizeof(LogicalPageID)); } std::string toString(bool write, LogicalPageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { @@ -1603,7 +1617,7 @@ private: for(int e = 0, eEnd = extPages.size(); e < eEnd; ++e) { LogicalPageID eid = m_pager->allocateLogicalPage(); debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages.size(), id); - 
newPage->extensionPages[e] = bigEndian32(eid); + newPage->extensionPages()[e] = bigEndian32(eid); // If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID m_pager->writePage(eid, extPages[e], version, (version == 0) ? id : invalidLogicalPageID); ++counts.extPageWrites; @@ -1620,8 +1634,8 @@ private: // Free the old extension pages now that all replacement pages have been written for(int i = 0; i < originalPage->extensionPageCount; ++i) { - //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages[i])); - //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages[i]), version); + //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i])); + //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version); } return primaryLogicalPageIDs; @@ -1684,8 +1698,8 @@ private: pageGets.push_back(std::move(result)); for(int i = 0; i < pTreePage->extensionPageCount; ++i) { - debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); - pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages[i]))); + debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages()[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); + pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages()[i]))); } std::vector> pages = wait(getAll(pageGets)); @@ -3561,12 +3575,12 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { while(1) { if(fwd.get() != items[i]) { printf("forward iterator i=%d\n %s found\n %s expected\n", i, fwd.get().toString().c_str(), 
items[i].toString().c_str()); - printf("Delta: %s\n", fwd.node->raw->delta->toString().c_str()); + printf("Delta: %s\n", fwd.node->raw->delta().toString().c_str()); ASSERT(false); } if(rev.get() != items[items.size() - 1 - i]) { printf("reverse iterator i=%d\n %s found\n %s expected\n", i, rev.get().toString().c_str(), items[items.size() - 1 - i].toString().c_str()); - printf("Delta: %s\n", rev.node->raw->delta->toString().c_str()); + printf("Delta: %s\n", rev.node->raw->delta().toString().c_str()); ASSERT(false); } ++i; From da9c4e97d34fa430199900ddf9cc76610e0821ce Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 7 Aug 2019 02:36:33 -0700 Subject: [PATCH 002/184] Added new pager interface, IPager2, whose write interface enables forcing the user to handle a page update causing a copy to a new Page ID. Implemented FIFOQueue which uses pages of T stored in a Pager2 instance to implement a FIFO queue. Implemented COWPager, a copy-on-write Pager2 in which all page writes cause a change of Page ID. VersionedBTree, still only operating in single-version mode, now uses Pager2. 
--- fdbserver/DeltaTree.h | 2 +- fdbserver/IPager.h | 65 +- fdbserver/VersionedBTree.actor.cpp | 1164 ++++++++++++++++++++++++---- flow/FastAlloc.h | 8 + tests/CMakeLists.txt | 1 + tests/RedwoodCorrectnessPager.txt | 6 + 6 files changed, 1113 insertions(+), 133 deletions(-) create mode 100644 tests/RedwoodCorrectnessPager.txt diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 6797d87a77..ad05a419b8 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -69,7 +69,6 @@ // // Retrieves the previously stored boolean // bool getPrefixSource() const; // -#pragma pack(push,1) template struct DeltaTree { @@ -77,6 +76,7 @@ struct DeltaTree { return std::numeric_limits::max(); }; +#pragma pack(push,1) struct Node { OffsetT leftChildOffset; OffsetT rightChildOffset; diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 5823588a17..4d449f917a 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -29,7 +29,7 @@ #define REDWOOD_DEBUG 0 -#define debug_printf_always(...) { fprintf(stdout, "%s %f ", g_network->getLocalAddress().toString().c_str(), now()), fprintf(stdout, __VA_ARGS__); fflush(stdout); } +#define debug_printf_always(...) { fprintf(stdout, "%s %f (%s:%d) ", g_network->getLocalAddress().toString().c_str(), now(), __FUNCTION__, __LINE__), fprintf(stdout, __VA_ARGS__); fflush(stdout); } #define debug_printf_noop(...) @@ -81,6 +81,10 @@ public: virtual Future> getPhysicalPage(LogicalPageID pageID) = 0; virtual Version getVersion() const = 0; + virtual Key getMetaKey() const { + return Key(); + } + virtual ~IPagerSnapshot() {} virtual void addref() = 0; @@ -146,4 +150,63 @@ protected: ~IPager() {} // Destruction should be done using close()/dispose() from the IClosable interface }; +class IPager2 : public IClosable { +public: + // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. + virtual Reference newPageBuffer() = 0; + + // Returns the usable size of pages returned by the pager (i.e. 
the size of the page that isn't pager overhead). + // For a given pager instance, separate calls to this function must return the same value. + // Only valid to call after recovery is complete. + virtual int getUsablePageSize() = 0; + + // Allocate a new page ID for a subsequent write. The page will be considered in-use after the next commit + // regardless of whether or not it was written to. + virtual Future newPageID() = 0; + + // Replace the contents of a page with new data. Existing holders of a page reference for pageID + // will see the effects of this write. + virtual void updatePage(LogicalPageID pageID, Reference data) = 0; + + // Try to atomically update the contents of a page as of the next successful commit() + // If the pager is unable to do this at this time, it may choose to write the data to a new page, + // call freePage(pageID), and return the new page id. Otherwise the pageID argument will be returned. + virtual Future atomicUpdatePage(LogicalPageID pageID, Reference data) = 0; + + // Free pageID to be used again after the next commit + virtual void freePage(LogicalPageID pageID) = 0; + + // Returns the data for a page by LogicalPageID + // The data returned will be the later of + // - the most recent committed atomic write + // - the most recent non-atomic write + virtual Future> readPage(LogicalPageID pageID) = 0; + + // Get a snapshot of the metakey and all pages as of the latest committed version. + // When a pager snapshot is created, the pager is guaranteed to not remove or reuse any pages + // that were freed after the creation of this snapshot until the snapshot is destroyed + virtual Reference getReadSnapshot() = 0; + + // Atomically make durable all pending page writes, page frees, and update the metadata string. 
+ virtual Future commit() = 0; + + // Get the latest meta key set or committed + virtual Key getMetaKey() const = 0; + + // Set the metakey which will be stored in the next commit + virtual void setMetaKey(KeyRef metaKey) = 0; + + // Sets the next commit version + virtual void setVersion(Version v) = 0; + + virtual StorageBytes getStorageBytes() = 0; + + // Returns latest committed version + // After the returned future is ready, future calls must not wait. + virtual Future getLatestVersion() = 0; + +protected: + ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface +}; + #endif diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a926926d5a..984c3c2d9a 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -25,6 +25,9 @@ #include "flow/serialize.h" #include "flow/genericactors.actor.h" #include "flow/UnitTest.h" +#include "fdbserver/IPager.h" +#include "fdbrpc/IAsyncFile.h" +#include "flow/ActorCollection.h" #include "fdbserver/MemoryPager.h" #include "fdbserver/IndirectShadowPager.h" #include @@ -35,6 +38,811 @@ #include #include "flow/actorcompiler.h" #include +#include + +// A FIFO queue of T stored as a linked list of pages. 
+template +class FIFOQueue { + static_assert(std::is_trivially_copyable::value); + +public: +#pragma pack(push, 1) + struct QueueState { + LogicalPageID headPageID = invalidLogicalPageID; + LogicalPageID tailPageID = invalidLogicalPageID; + uint16_t headIndex; + uint16_t tailIndex; + int64_t numPages; + int64_t numEntries; + std::string toString() const { + return format("head: %u:%d tail: %u:%d numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, (int)tailIndex, numPages, numEntries); + } + }; +#pragma pack(pop) + + struct Cursor { + // These can change when loading transitions from not ready to ready + LogicalPageID pageID; + int index; + Reference page; + + FIFOQueue *queue; + Future loading; + + Cursor() : queue(nullptr) { + } + + void init(FIFOQueue *q, LogicalPageID p) { + queue = q; + pageID = p; + index = 0; + page = queue->pager->newPageBuffer(); + loading = Void(); + writePage(); + } + + void init(FIFOQueue *q, LogicalPageID p, int i) { + queue = q; + pageID = p; + index = i; + loading = loadPage(p, i); + } + + Cursor(Cursor &) = delete; + void operator=(Cursor &) = delete; + + ~Cursor() { + loading.cancel(); + } + +#pragma pack(push, 1) + struct RawPage { + LogicalPageID next; + + inline T & at(int i) { + return ((T *)(this + 1))[i]; + } + }; +#pragma pack(pop) + + bool end() const { + return index == queue->itemsPerPage; + } + + Future loadPage(LogicalPageID newPageID, int newIndex) { + debug_printf("queue(%p, %s) loading page %u index %d\n", this, queue->name.c_str(), newPageID, newIndex); + return map(queue->pager->readPage(newPageID), [=](Reference p) { + page = p; + pageID = newPageID; + index = newIndex; + return Void(); + }); + } + + Future newPage() { + debug_printf("queue(%p, %s) new page\n", this, queue->name.c_str()); + return map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { + pageID = newPageID; + index = 0; + page = queue->pager->newPageBuffer(); + ++queue->numPages; + return Void(); + }); + 
} + + T & getItem() const { + return ((RawPage *)(page->begin()))->at(index); + } + + bool operator== (const Cursor &rhs) { + return pageID == rhs.pageID && index == rhs.index; + } + + void writePage() { + queue->pager->updatePage(pageID, page); + } + + ACTOR static Future waitThenWriteNext(Cursor *self, T item) { + wait(self->loading); + wait(self->writeNext(item)); + return Void(); + } + + Future writeNext(const T &item) { + // If the cursor is loaded already, write the item and move to the next slot + if(loading.isReady()) { + getItem() = item; + ++queue->numEntries; + ++index; + if(this->end()) { + this->loading = newPage(); + } + return Void(); + } + + return waitThenWriteNext(this, item); + } + + ACTOR static Future> waitThenMoveNext(Cursor *self, Optional upperBound) { + wait(self->loading); + Optional result = wait(self->moveNext(upperBound)); + return result; + } + + // Read and move past the next item if it is < upperBound + Future> moveNext(const Optional &upperBound = {}) { + if(loading.isReady()) { + if(upperBound.present() && getItem() >= upperBound.get()) { + return Optional(); + } + + T result = getItem(); + --queue->numEntries; + ++index; + + // If this page is out of items, start reading the next one + if(end()) { + loading = loadPage(((RawPage *)page->begin())->next, 0); + --queue->numPages; + } + return Optional(result); + } + + return waitThenMoveNext(this, upperBound); + } + }; + +public: + FIFOQueue() : pager(nullptr) { + } + + FIFOQueue(const FIFOQueue &other) = delete; + void operator=(const FIFOQueue &rhs) = delete; + + // Create a new queue at newPageID + void init(IPager2 *p, LogicalPageID newPageID, std::string queueName) { + debug_printf("FIFOQueue::init(%p, %s) from page id %u\n", this, name.c_str(), newPageID); + pager = p; + name = queueName; + numPages = 1; + numEntries = 0; + head.init(this, newPageID); + tail.init(this, newPageID); + stop.init(this, newPageID); + ASSERT(flush().isReady()); + } + + // Load an existing queue from 
its queue state + void init(IPager2 *p, const QueueState &qs, std::string queueName) { + debug_printf("FIFOQueue::init(%p, %s) from queue state %u\n", this, name.c_str(), qs.toString().c_str()); + pager = p; + this->name = name; + name = queueName; + numPages = qs.numPages; + numEntries = qs.numEntries; + head.init(this, qs.headPageID, qs.headIndex); + tail.init(this, qs.tailPageID, qs.tailIndex); + stop.init(this, qs.tailPageID, qs.tailIndex); + ASSERT(flush().isReady()); + } + + Future> pop(Optional upperBound = {}) { + if(head == stop) { + return Optional(); + } + return head.moveNext(upperBound); + } + + QueueState getState() const { + QueueState s; + s.headIndex = head.index; + s.headPageID = head.pageID; + s.tailIndex = tail.index; + s.tailPageID = tail.pageID; + s.numEntries = numEntries; + s.numPages = numPages; + return s; + } + + ACTOR static Future writeActor(FIFOQueue *self, FutureStream queue) { + try { + loop { + state T item = waitNext(queue); + self->tail.writeNext(item); + } + } + catch(Error &e) { + if(e.code() != error_code_end_of_stream) { + throw; + } + } + + self->tail.writePage(); + self->stop.init(self, self->tail.pageID, self->tail.index); + + return self->getState(); + } + + void push(const T &item) { + writeQueue.send(item); + } + + // Flush changes to the pager and return the resulting queue state. 
+ Future flush() { + debug_printf("FIFOQueue::flush %p %s\n", this, name.c_str()); + Future oldWriter = writer; + writeQueue.sendError(end_of_stream()); + writeQueue = PromiseStream(); + writer = writeActor(this, writeQueue.getFuture()); + if(!oldWriter.isValid()) { + debug_printf("FIFOQueue::flush %p oldwriter not valid %s\n", this, name.c_str()); + return getState(); + } + return oldWriter; + } + + IPager2 *pager; + int64_t numPages; + int64_t numEntries; + int itemsPerPage; + + PromiseStream writeQueue; + Future writer; + + // Invariant: head <= stop <= tail + Cursor head; + Cursor stop; + Cursor tail; + + // For debugging + std::string name; +}; + +int nextPowerOf2(uint32_t x) { + return 1 << (32 - clz(x - 1)); +} + +class FastAllocatedPage : public IPage, ReferenceCounted { +public: + // Create a fast-allocated page with size total bytes INCLUDING checksum + FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) { + buffer = (uint8_t *)allocateFast(bufferSize); + VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); + }; + + virtual ~FastAllocatedPage() { + freeFast(bufferSize, buffer); + } + + // Usable size, without checksum + int size() const { + return logicalSize - sizeof(Checksum); + } + + uint8_t const* begin() const { + return buffer; + } + + uint8_t* mutate() { + return buffer; + } + + void addref() const { + ReferenceCounted::addref(); + } + + void delref() const { + ReferenceCounted::delref(); + } + + typedef uint32_t Checksum; + + Checksum & getChecksum() { + return *(Checksum *)(buffer + size()); + } + + Checksum calculateChecksum(LogicalPageID pageID) { + return crc32c_append(pageID, buffer, size()); + } + + void updateChecksum(LogicalPageID pageID) { + getChecksum() = calculateChecksum(pageID); + } + + bool verifyChecksum(LogicalPageID pageID) { + return getChecksum() == calculateChecksum(pageID); + } +private: + int logicalSize; + int bufferSize; + uint8_t *buffer; +}; + +// Holds an index 
of recently used objects. +// ObjectType must have the method +// bool evictable() const; +// indicating if it is safe to evict. +template +class ObjectCache { +public: + ObjectCache(int sizeLimit) : sizeLimit(sizeLimit) { + } + + // Get the object for i or create a new one. + // After a get(), the object for i is the last in evictionOrder. + ObjectType & get(const IndexType &index) { + Entry &entry = cache[index]; + + // If entry is linked into evictionOrder then move it to the back of the order + if(entry.is_linked()) { + // Move the entry to the back of the eviction order + evictionOrder.erase(evictionOrder.iterator_to(entry)); + evictionOrder.push_back(entry); + } + else { + // Finish initializing entry + entry.index = index; + // Insert the newly created Entry at the back of the eviction order + evictionOrder.push_back(entry); + + // If the cache is too big, try to evict the first Entry in the eviction order + if(cache.size() > sizeLimit) { + Entry &toEvict = evictionOrder.front(); + if(toEvict.item.evictable()) { + evictionOrder.pop_front(); + cache.erase(toEvict.index); + } + } + } + + return entry.item; + } + + void clear() { + evictionOrder.clear(); + cache.clear(); + } + +private: + struct Entry : public boost::intrusive::list_base_hook<> { + IndexType index; + ObjectType item; + }; + + int sizeLimit; + boost::intrusive::list evictionOrder; + + // TODO: Use boost intrusive unordered set instead, with a comparator that only considers entry.index + std::unordered_map cache; +}; + + +class COWPager : public IPager2 { +public: + typedef FastAllocatedPage Page; + typedef FIFOQueue LogicalPageQueueT; + + // If the file already exists, pageSize might be different than desiredPageSize + COWPager(int desiredPageSize, std::string filename, int cachedPageLimit) : desiredPageSize(desiredPageSize), filename(filename), pageCache(cachedPageLimit), pHeader(nullptr) { + commitFuture = Void(); + recoverFuture = recover(this); + } + + void setPageSize(int size) { + 
logicalPageSize = size; + physicalPageSize = smallestPhysicalBlock; + while(logicalPageSize > physicalPageSize) { + physicalPageSize += smallestPhysicalBlock; + } + if(pHeader != nullptr) { + pHeader->pageSize = logicalPageSize; + } + } + + ACTOR static Future recover(COWPager *self) { + ASSERT(!self->recoverFuture.isValid()); + + int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; + state bool exists = fileExists(self->filename); + if(!exists) { + flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; + } + + wait(store(self->pageFile, IAsyncFileSystem::filesystem()->open(self->filename, flags, 0644))); + + // Header page is always treated as having a page size of smallestPhysicalBlock + self->setPageSize(smallestPhysicalBlock); + + if(exists) { + debug_printf("File exists, reading header: %s\n", self->filename.c_str()); + + // Read physical page 0 directly + wait(store(self->headerPage, self->readPhysicalPage(self, 0))); + self->pHeader = (Header *)self->headerPage->begin(); + self->setPageSize(self->pHeader->pageSize); + + if(self->logicalPageSize != self->desiredPageSize) { + TraceEvent(SevWarn, "COWPagerPageSizeNotDesired") + .detail("Filename", self->filename) + .detail("ExistingPageSize", self->logicalPageSize) + .detail("DesiredPageSize", self->desiredPageSize); + } + + self->freeList.init(self, self->pHeader->freeList, "FreeListRecovered"); + } + else { + debug_printf("File does not exist, creating header page: %s\n", self->filename.c_str()); + + self->headerPage = self->newPageBuffer(); + self->pHeader = (Header *)self->headerPage->begin(); + + // Now that the header page has been allocated, set page size to desired + self->setPageSize(self->desiredPageSize); + + // Write new header using desiredPageSize + self->pHeader->formatVersion = 1; + self->pHeader->committedVersion = 1; + // No meta key until a user sets one and commits + self->pHeader->setMetaKey(Key()); + + // There will be 2 page 
IDs in use + // Page 0 will be the header + // Page 1 will be the empty free list queue, which won't actually be written to the file as the page has no content + self->pHeader->pageCount = 2; + + // Create a new free list at page 1 + self->freeList.init(self, 1, "FreeListNew"); + + // Flush free list, store state in header + store(self->pHeader->freeList, self->freeList.flush()); + + // Clear remaining bytes of header + memset(self->headerPage->mutate() + self->pHeader->size(), 0, self->headerPage->size() - self->pHeader->size()); + + // Update header page on disk and sync + wait(self->writePhysicalPage(0, self->headerPage)); + + wait(self->commit()); + } + + self->lastCommittedVersion = self->pHeader->committedVersion; + self->lastCommittedMeta = self->pHeader->getMetaKey(); + + debug_printf("Recovered %s\n", self->filename.c_str()); + return Void(); + } + + // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. + Reference newPageBuffer() { + return Reference(new FastAllocatedPage(logicalPageSize, physicalPageSize)); + } + + // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). + // For a given pager instance, separate calls to this function must return the same value. + int getUsablePageSize() { + return logicalPageSize - sizeof(FastAllocatedPage::Checksum); + } + + // Get a new, previously available page ID. The page will be considered in-use after the next commit + // regardless of whether or not it was written to. 
+ Future newPageID() { + Future> nextPageID = freeList.pop(); + if(nextPageID.isReady()) { + if(nextPageID.get().present()) { + return nextPageID.get().get(); + } + return ++pHeader->pageCount; + } + + return map(nextPageID, [=](Optional nextPageID) { + if(nextPageID.present()) { + return nextPageID.get(); + } + return (LogicalPageID)++(pHeader->pageCount); + }); + }; + + Future writePhysicalPage(PhysicalPageID pageID, Reference page) { + ((Page *)page.getPtr())->updateChecksum(pageID); + int physicalSize = (pageID == 0) ? smallestPhysicalBlock : physicalPageSize; + return holdWhile(page, pageFile->write(page->begin(), physicalSize, (int64_t)pageID * physicalSize)); + } + + void updatePage(LogicalPageID pageID, Reference data) { + // Get the cache entry for this page + PageCacheEntry &cacheEntry = pageCache.get(pageID); + debug_printf("COWPager op=write id=%u cached=%d\n", pageID, cacheEntry.page.isValid()); + + // If the cache entry exists and has already been read, copy data over top of the page in the cache + // so any holders of the page reference see the change. + if(cacheEntry.page.isValid()) { + // It should not be the case that we write a page that is still being read. 
+ ASSERT(cacheEntry.page.isReady()); + } + cacheEntry.page = data; + + writes.add(writePhysicalPage(pageID, data)); + } + + Future atomicUpdatePage(LogicalPageID pageID, Reference data) { + freePage(pageID); + return map(newPageID(), [=](LogicalPageID newPageID) { + updatePage(newPageID, data); + return newPageID; + }); + } + + // Free pageID to be used again after the next commit + void freePage(LogicalPageID pageID) { + freeList.push(pageID); + }; + + ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { + state Reference page = self->newPageBuffer(); + + try { + int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); + ASSERT(readBytes == self->physicalPageSize); + ASSERT(((Page *)page.getPtr())->verifyChecksum(pageID)); + } catch(Error &e) { + if(e.code() != error_code_actor_cancelled) { + self->errorPromise.sendError(e); + } + throw; + } + + return page; + } + + // Reads the most recent version of pageID either committed or written using updatePage() + Future> readPage(LogicalPageID pageID) { + PageCacheEntry &cacheEntry = pageCache.get(pageID); + debug_printf("COWPager op=read id=%u cached=%d ready=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.page.isValid() && cacheEntry.page.isReady()); + + if(!cacheEntry.page.isValid()) { + cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); + } + + return cacheEntry.page; + } + + // Get snapshot as of the most recent committed version of the pager + Reference getReadSnapshot(); + + void snapshotDestroyed(Version v) { + auto i = snapshotsInUse.find(v); + ASSERT(i != snapshotsInUse.end()); + ASSERT(i->second > 0); + --i->second; + bool first = i == snapshotsInUse.begin(); + if(i->second == 0) { + snapshotsInUse.erase(i); + if(first) { + leastSnapshotVersionChanged.trigger(); + } + } + } + + ACTOR static Future commit_impl(COWPager *self) { + // Flush the free list queue to the pager + 
LogicalPageQueueT::QueueState freeListState = wait(self->freeList.flush()); + + // Update header in memory + self->pHeader->freeList = freeListState; + + // Wait for all outstanding writes to complete + wait(self->writes.signalAndCollapse()); + + // Sync everything except the header + wait(self->pageFile->sync()); + + // Update header on disk and sync + wait(self->writePhysicalPage(0, self->headerPage)); + wait(self->pageFile->sync()); + + self->lastCommittedVersion = self->pHeader->committedVersion; + self->lastCommittedMeta = self->pHeader->getMetaKey(); + + return Void(); + } + + // Make durable all pending page writes and page frees. + Future commit() { + // Can't have more than one commit outstanding. + ASSERT(commitFuture.isReady()); + commitFuture = commit_impl(this); + return commitFuture; + } + + Key getMetaKey() const { + ASSERT(recoverFuture.isReady()); + return pHeader->getMetaKey(); + } + + void setVersion(Version v) { + pHeader->committedVersion = v; + } + + void setMetaKey(KeyRef metaKey) { + pHeader->setMetaKey(metaKey); + } + + ACTOR void shutdown(COWPager *self, bool dispose) { + self->recoverFuture.cancel(); + + if(self->errorPromise.canBeSet()) + self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + + // Cancel all reads. 
Any in-progress writes will be holding references to their required pages + self->pageCache.clear(); + + wait(ready(self->writes.signal())); + wait(ready(self->commitFuture)); + + self->pageFile.clear(); + + self->closedPromise.send(Void()); + delete self; + } + + void dispose() { + shutdown(this, true); + } + + void close() { + shutdown(this, false); + } + + Future getError() { + return errorPromise.getFuture(); + } + + Future onClosed() { + return closedPromise.getFuture(); + } + + Future onError() { + return errorPromise.getFuture(); + } + + Future onClose() { + return closedPromise.getFuture(); + } + + StorageBytes getStorageBytes() { + ASSERT(recoverFuture.isReady()); + int64_t free; + int64_t total; + g_network->getDiskBytes(parentDirectory(filename), free, total); + int64_t pagerSize = pHeader->pageCount * physicalPageSize; + int64_t reusable = freeList.numEntries * physicalPageSize; + return StorageBytes(free, total, pagerSize, free + reusable); + } + + Future getLatestVersion() { + return map(recoverFuture, [=](Void) { + return lastCommittedVersion; + }); + } + +private: + ~COWPager() {} + +#pragma pack(push, 1) + // Header is the format of page 0 of the database + struct Header { + Version formatVersion; + uint32_t pageSize; + int64_t pageCount; + FIFOQueue::QueueState freeList; + Version committedVersion; + int32_t metaKeySize; + + Key getMetaKey() const { + return KeyRef((const uint8_t *)this + sizeof(Header), metaKeySize); + } + + void setMetaKey(StringRef key) { + ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); + metaKeySize = key.size(); + memcpy((uint8_t *)this + sizeof(Header), key.begin(), key.size()); + } + + int size() const { + return sizeof(Header) + metaKeySize; + } + + private: + Header(); + }; +#pragma pack(pop) + + struct PageCacheEntry { + Future> page; + + bool evictable() const { + // Don't evict if a page is still being read + return page.isReady(); + } + ~PageCacheEntry() { + page.cancel(); + } + }; + + // Physical page 
sizes will always be a multiple of 4k because AsyncFileNonDurable requires + // this in simulation, and it also makes sense for current SSDs. + // Allowing a smaller 'logical' page size is very useful for testing. + static constexpr int smallestPhysicalBlock = 4096; + int physicalPageSize; + int logicalPageSize; // In simulation testing it can be useful to use a small logical page size + + // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. + Reference headerPage; + Header *pHeader; + + int desiredPageSize; + + Version lastCommittedVersion; + Key lastCommittedMeta; + + std::string filename; + + ObjectCache pageCache; + + Promise closedPromise; + Promise errorPromise; + Future commitFuture; + SignalableActorCollection writes; + Future recoverFuture; + AsyncTrigger leastSnapshotVersionChanged; + std::map snapshotsInUse; + + Reference pageFile; + + LogicalPageQueueT freeList; +}; + +// Prevents pager from reusing freed pages from version until the snapshot is destroyed +class COWPagerSnapshot : public IPagerSnapshot, ReferenceCounted { +public: + COWPagerSnapshot(COWPager *pager, Key meta, Version version) : pager(pager), metaKey(meta), version(version) { + } + virtual ~COWPagerSnapshot() { + pager->snapshotDestroyed(version); + } + + Future> getPhysicalPage(LogicalPageID pageID) { + return map(pager->readPage(pageID), [=](Reference p) { + return Reference(p); + }); + } + + Key getMetaKey() const { + return metaKey; + } + + Version getVersion() const { + return version; + } + + void addref() { + ReferenceCounted::addref(); + } + + void delref() { + ReferenceCounted::delref(); + } + +private: + COWPager *pager; + Version version; + Key metaKey; +}; + +Reference COWPager::getReadSnapshot() { + ++snapshotsInUse[lastCommittedVersion]; + return Reference(new COWPagerSnapshot(this, lastCommittedMeta, lastCommittedVersion)); +} // TODO: Move this to a flow header once it is mature. 
struct SplitStringRef { @@ -821,14 +1629,14 @@ struct BTreePage { } }; -static void makeEmptyPage(Reference page, uint8_t newFlags, int pageSize) { - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); +static void makeEmptyPage(Reference page, uint8_t newFlags) { BTreePage *btpage = (BTreePage *)page->begin(); btpage->flags = newFlags; btpage->kvBytes = 0; btpage->count = 0; btpage->extensionPageCount = 0; btpage->tree().build(nullptr, nullptr, nullptr, nullptr); + VALGRIND_MAKE_MEM_DEFINED(page->begin() + btpage->tree().size(), page->size() - btpage->tree().size()); } BTreePage::BinaryTree::Reader * getReader(Reference page) { @@ -837,20 +1645,25 @@ BTreePage::BinaryTree::Reader * getReader(Reference page) { struct BoundaryAndPage { Standalone lowerBound; - // Only firstPage or multiPage will be in use at once Reference firstPage; std::vector> extPages; + + std::string toString() const { + return format("[%s, %d pages]", lowerBound.toString().c_str(), extPages.size() + (firstPage ? 1 : 0)); + } }; // Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages. // TODO: Refactor this as an accumulator you add sorted keys to which makes pages. -template -static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, std::vector entries, uint8_t newFlags, Allocator const &newBlockFn, int usableBlockSize) { - // This is how much space for the binary tree exists in the page, after the header - int pageSize = usableBlockSize - BTreePage::GetHeaderSize(); +static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, std::vector entries, uint8_t newFlags, IPager2 *pager) { + ASSERT(entries.size() > 0); + int usablePageSize = pager->getUsablePageSize(); - // Each new block adds (usableBlockSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize. 
- int netTreeBlockSize = usableBlockSize - sizeof(LogicalPageID); + // This is how much space for the binary tree exists in the page, after the header + int pageSize = usablePageSize - BTreePage::GetHeaderSize(); + + // Each new block adds (usablePageSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize. + int netTreeBlockSize = usablePageSize - sizeof(LogicalPageID); int blockCount = 1; std::vector pages; @@ -958,7 +1771,7 @@ static std::vector buildPages(bool minimalBoundaries, const Red int allocatedSize; if(blockCount == 1) { - Reference page = newBlockFn(); + Reference page = pager->newPageBuffer(); VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); btPageMem = page->mutate(); allocatedSize = page->size(); @@ -966,7 +1779,7 @@ static std::vector buildPages(bool minimalBoundaries, const Red } else { ASSERT(blockCount > 1); - allocatedSize = usableBlockSize * blockCount; + allocatedSize = usablePageSize * blockCount; btPageMem = new uint8_t[allocatedSize]; VALGRIND_MAKE_MEM_DEFINED(btPageMem, allocatedSize); } @@ -983,21 +1796,21 @@ static std::vector buildPages(bool minimalBoundaries, const Red } if(blockCount != 1) { - Reference page = newBlockFn(); + Reference page = pager->newPageBuffer(); VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); const uint8_t *rptr = btPageMem; - memcpy(page->mutate(), rptr, usableBlockSize); - rptr += usableBlockSize; + memcpy(page->mutate(), rptr, usablePageSize); + rptr += usablePageSize; std::vector> extPages; for(int b = 1; b < blockCount; ++b) { - Reference extPage = newBlockFn(); + Reference extPage = pager->newPageBuffer(); VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - //debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usableBlockSize); - memcpy(extPage->mutate(), rptr, usableBlockSize); - rptr += usableBlockSize; + //debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usablePageSize); + memcpy(extPage->mutate(), rptr, 
usablePageSize); + rptr += usablePageSize; extPages.push_back(std::move(extPage)); } @@ -1027,6 +1840,25 @@ public: // A record which is greater than the last possible record in the tree static RedwoodRecordRef dbEnd; + struct LazyDeleteQueueEntry { + Version version; + LogicalPageID pageID; + }; + + typedef FIFOQueue LazyDeleteQueueT; + + struct MetaKey { + LogicalPageID root; + LazyDeleteQueueT::QueueState lazyDeleteQueue; + KeyRef asKeyRef() const { + return KeyRef((uint8_t *)this, sizeof(MetaKey)); + } + void fromKeyRef(KeyRef k) { + ASSERT(k.size() == sizeof(MetaKey)); + memcpy(this, k.begin(), k.size()); + } + }; + struct Counts { Counts() { memset(this, 0, sizeof(Counts)); @@ -1073,7 +1905,7 @@ public: } void close_impl(bool dispose) { - IPager *pager = m_pager; + auto *pager = m_pager; delete this; if(dispose) pager->dispose(); @@ -1168,32 +2000,46 @@ public: return m_lastCommittedVersion; } - VersionedBTree(IPager *pager, std::string name, bool singleVersion = false, int target_page_size = -1) + VersionedBTree(IPager2 *pager, std::string name, bool singleVersion = false) : m_pager(pager), m_writeVersion(invalidVersion), - m_usablePageSizeOverride(pager->getUsablePageSize()), m_lastCommittedVersion(invalidVersion), m_pBuffer(nullptr), m_name(name), singleVersion(singleVersion) { - if(target_page_size > 0 && target_page_size < m_usablePageSizeOverride) - m_usablePageSizeOverride = target_page_size; m_init = init_impl(this); m_latestCommit = m_init; } ACTOR static Future init_impl(VersionedBTree *self) { - self->m_root = 0; state Version latest = wait(self->m_pager->getLatestVersion()); - if(latest == 0) { + debug_printf("Recovered to version %" PRId64 "\n", latest); + + state Key meta = self->m_pager->getMetaKey(); + if(meta.size() == 0) { + LogicalPageID newRoot = wait(self->m_pager->newPageID()); + debug_printf("new root page %u\n", newRoot); + self->m_header.root = newRoot; ++latest; Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, 
BTreePage::IS_LEAF, self->m_usablePageSizeOverride); - self->writePage(self->m_root, page, latest, &dbBegin, &dbEnd); - self->m_pager->setLatestVersion(latest); + makeEmptyPage(page, BTreePage::IS_LEAF); + self->writePage(self->m_header.root, page, latest, &dbBegin, &dbEnd); + self->m_pager->setVersion(latest); + + LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); + debug_printf("new lazy delete queue page %u\n", newQueuePage); + self->m_lazyDeleteQueue.init(self->m_pager, newQueuePage, "LazyDeleteQueueNew"); + self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); + self->m_pager->setMetaKey(self->m_header.asKeyRef()); wait(self->m_pager->commit()); + debug_printf("Committed initial commit.\n"); } + else { + self->m_header.fromKeyRef(meta); + self->m_lazyDeleteQueue.init(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); + } + self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5); self->m_lastCommittedVersion = latest; return Void(); } @@ -1222,7 +2068,9 @@ public: if(singleVersion) { ASSERT(v == m_lastCommittedVersion); } - return Reference(new Cursor(m_pager->getReadSnapshot(v), m_root, recordVersion, m_usablePageSizeOverride)); + Reference snapshot = m_pager->getReadSnapshot(/* v */); + Key m = snapshot->getMetaKey(); + return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root, recordVersion)); } // Must be nondecreasing @@ -1258,11 +2106,9 @@ public: private: void writePage(LogicalPageID id, Reference page, Version ver, const RedwoodRecordRef *pageLowerBound, const RedwoodRecordRef *pageUpperBound) { debug_printf("writePage(): %s\n", ((const BTreePage *)page->begin())->toString(true, id, ver, pageLowerBound, pageUpperBound).c_str()); - m_pager->writePage(id, page, ver); + m_pager->updatePage(id, page); //, ver); } - LogicalPageID m_root; - // TODO: Don't use Standalone struct VersionedChildPageSet { Version version; @@ -1506,18 +2352,21 @@ private: * to be sorted later just 
before being merged into the existing leaf page. */ - IPager *m_pager; + IPager2 *m_pager; MutationBufferT *m_pBuffer; std::map m_mutationBuffers; Version m_writeVersion; Version m_lastCommittedVersion; Future m_latestCommit; - int m_usablePageSizeOverride; Future m_init; std::string m_name; bool singleVersion; + MetaKey m_header; + LazyDeleteQueueT m_lazyDeleteQueue; + int m_maxPartSize; + void printMutationBuffer(MutationBufferT::const_iterator begin, MutationBufferT::const_iterator end) const { #if REDWOOD_DEBUG debug_printf("-------------------------------------\n"); @@ -1565,78 +2414,87 @@ private: return ib; } - void buildNewRoot(Version version, std::vector &pages, std::vector &logicalPageIDs, const BTreePage *pPage) { - //debug_printf("buildNewRoot start %lu\n", pages.size()); + ACTOR static Future buildNewRoot(VersionedBTree *self, Version version, std::vector *pages, std::vector *logicalPageIDs, BTreePage *pPage) { + debug_printf("buildNewRoot start version %" PRId64 ", %lu pages %s\n", version, pages->size()); + // While there are multiple child pages for this version we must write new tree levels. 
- while(pages.size() > 1) { + while(pages->size() > 1) { std::vector childEntries; - for(int i=0; isize(); i++) { + RedwoodRecordRef entry = pages->at(i).lowerBound.withPageID(logicalPageIDs->at(i)); debug_printf("Added new root entry %s\n", entry.toString().c_str()); childEntries.push_back(entry); } - pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, [=](){ return m_pager->newPageBuffer(); }, m_usablePageSizeOverride); + *pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, self->m_pager); - debug_printf("Writing a new root level at version %" PRId64 " with %lu children across %lu pages\n", version, childEntries.size(), pages.size()); + debug_printf("Writing a new root level at version %" PRId64 " with %lu children across %lu pages\n", version, childEntries.size(), pages->size()); - logicalPageIDs = writePages(pages, version, m_root, pPage, &dbEnd, nullptr); + std::vector ids = wait(writePages(self, *pages, version, self->m_header.root, pPage, &dbEnd, nullptr)); + *logicalPageIDs = std::move(ids); } + + return Void(); } - std::vector writePages(std::vector pages, Version version, LogicalPageID originalID, const BTreePage *originalPage, const RedwoodRecordRef *upperBound, void *actor_debug) { + ACTOR static Future> writePages(VersionedBTree *self, std::vector pages, Version version, LogicalPageID originalID, const BTreePage *originalPage, const RedwoodRecordRef *upperBound, void *actor_debug) { debug_printf("%p: writePages(): %u @%" PRId64 " -> %lu replacement pages\n", actor_debug, originalID, version, pages.size()); ASSERT(version != 0 || pages.size() == 1); - std::vector primaryLogicalPageIDs; + state std::vector primaryLogicalPageIDs; + // TODO: Re-enable this once using pager's atomic replacement // Reuse original primary page ID if it's not the root or if only one page is being written. 
- if(originalID != m_root || pages.size() == 1) - primaryLogicalPageIDs.push_back(originalID); + //if(originalID != self->m_root || pages.size() == 1) + // primaryLogicalPageIDs.push_back(originalID); // Allocate a primary page ID for each page to be written while(primaryLogicalPageIDs.size() < pages.size()) { - primaryLogicalPageIDs.push_back(m_pager->allocateLogicalPage()); + LogicalPageID id = wait(self->m_pager->newPageID()); + primaryLogicalPageIDs.push_back(id); } debug_printf("%p: writePages(): Writing %lu replacement pages for %d at version %" PRId64 "\n", actor_debug, pages.size(), originalID, version); - for(int i=0; i> *extPages = &pages[i].extPages; // If there are extension pages, write all pages using pager directly because this->writePage() is for whole primary pages - if(extPages.size() != 0) { - BTreePage *newPage = (BTreePage *)pages[i].firstPage->mutate(); - ASSERT(newPage->extensionPageCount == extPages.size()); + if(extPages->size() != 0) { + state BTreePage *newPage = (BTreePage *)pages[i].firstPage->mutate(); + ASSERT(newPage->extensionPageCount == extPages->size()); - for(int e = 0, eEnd = extPages.size(); e < eEnd; ++e) { - LogicalPageID eid = m_pager->allocateLogicalPage(); - debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages.size(), id); + state int e; + state int eEnd = extPages->size(); + for(e = 0; e < eEnd; ++e) { + LogicalPageID eid = wait(self->m_pager->newPageID()); + debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages->size(), id); newPage->extensionPages()[e] = bigEndian32(eid); // If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID - m_pager->writePage(eid, extPages[e], version, (version == 0) ? 
id : invalidLogicalPageID); + self->m_pager->updatePage(eid, extPages->at(e)); //, version, (version == 0) ? id : invalidLogicalPageID); ++counts.extPageWrites; } - debug_printf("%p: writePages(): Writing primary page op=write id=%u @%" PRId64 " (+%lu extension pages)\n", actor_debug, id, version, extPages.size()); - m_pager->writePage(id, pages[i].firstPage, version); + debug_printf("%p: writePages(): Writing primary page op=write id=%u @%" PRId64 " (+%lu extension pages)\n", actor_debug, id, version, extPages->size()); + self->m_pager->updatePage(id, pages[i].firstPage); // version); } else { debug_printf("%p: writePages(): Writing normal page op=write id=%u @%" PRId64 "\n", actor_debug, id, version); - writePage(id, pages[i].firstPage, version, &pages[i].lowerBound, (i == pages.size() - 1) ? upperBound : &pages[i + 1].lowerBound); + self->writePage(id, pages[i].firstPage, version, &pages[i].lowerBound, (i == pages.size() - 1) ? upperBound : &pages[i + 1].lowerBound); } } // Free the old extension pages now that all replacement pages have been written - for(int i = 0; i < originalPage->extensionPageCount; ++i) { + //for(int i = 0; i < originalPage->extensionPageCount; ++i) { //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i])); //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version); - } + //} return primaryLogicalPageIDs; } @@ -1682,11 +2540,12 @@ private: const int m_size; }; - ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, int usablePageSize, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { + ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { debug_printf("readPage() op=read id=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); 
wait(delay(0, TaskDiskRead)); state Reference result = wait(snapshot->getPhysicalPage(id)); + state int usablePageSize = result->size(); ++counts.pageReads; state const BTreePage *pTreePage = (const BTreePage *)result->begin(); @@ -1781,16 +2640,20 @@ private: } self->counts.commitToPage++; - state Reference rawPage = wait(readPage(snapshot, root, self->m_usablePageSizeOverride, decodeLowerBound, decodeUpperBound)); + state Reference rawPage = wait(readPage(snapshot, root, decodeLowerBound, decodeUpperBound)); state BTreePage *page = (BTreePage *) rawPage->begin(); debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, root, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); state BTreePage::BinaryTree::Cursor cursor = getReader(rawPage)->getCursor(); cursor.moveFirst(); + state std::vector pages; + state std::vector newPageIDs; + state VersionedChildrenT results; + state Version writeVersion; + // Leaf Page if(page->flags & BTreePage::IS_LEAF) { - VersionedChildrenT results; std::vector merged; debug_printf("%s id=%u MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str(), root); @@ -1846,8 +2709,7 @@ private: // Output mutations for the mutation boundary start key while(iMutations != iMutationsEnd) { const SingleKeyMutation &m = iMutations->second; - int maxPartSize = std::min(255, self->m_usablePageSizeOverride / 5); - if(m.isClear() || m.value.size() <= maxPartSize) { + if(m.isClear() || m.value.size() <= self->m_maxPartSize) { if(iMutations->first < minVersion || minVersion == invalidVersion) minVersion = iMutations->first; ++changes; @@ -1862,12 +2724,12 @@ private: int start = 0; RedwoodRecordRef whole(iMutationBoundary->first, iMutations->first, m.value); while(bytesLeft > 0) { - int partSize = std::min(bytesLeft, maxPartSize); + int partSize = std::min(bytesLeft, self->m_maxPartSize); // Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it 
merged.push_back(whole.split(start, partSize)); bytesLeft -= partSize; start += partSize; - debug_printf("%s Added split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + debug_printf("%s Added split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), merged.back().toString().c_str(), bytesLeft); } } ++iMutations; @@ -1950,8 +2812,8 @@ private: return c; } - IPager *pager = self->m_pager; - std::vector pages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, [pager](){ return pager->newPageBuffer(); }, self->m_usablePageSizeOverride); + std::vector newPages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, self->m_pager); + pages = std::move(newPages); if(!self->singleVersion) { ASSERT(false); @@ -1970,13 +2832,14 @@ private: } // Write page(s), get new page IDs - Version writeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : minVersion; - std::vector newPageIDs = self->writePages(pages, writeVersion, root, page, upperBound, THIS); + writeVersion = self->singleVersion ? 
self->getLastCommittedVersion() + 1 : minVersion; + std::vector pageIDs = wait(self->writePages(self, pages, writeVersion, root, page, upperBound, THIS)); + newPageIDs = std::move(pageIDs); // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_root && pages.size() > 1) { + if(root == self->m_header.root && pages.size() > 1) { debug_printf("%s Building new root\n", context.c_str()); - self->buildNewRoot(writeVersion, pages, newPageIDs, page); + wait(self->buildNewRoot(self, writeVersion, &pages, &newPageIDs, page)); } results.push_back({writeVersion, {}, *upperBound}); @@ -2103,7 +2966,7 @@ private: // If we are the root, write a new empty btree if(root == 0) { Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, BTreePage::IS_LEAF, self->m_usablePageSizeOverride); + makeEmptyPage(page, BTreePage::IS_LEAF); RedwoodRecordRef rootEntry = dbBegin.withPageID(0); self->writePage(0, page, self->getLastCommittedVersion() + 1, &dbBegin, &dbEnd); VersionedChildrenT c({ {0, {dbBegin}, dbEnd } }); @@ -2128,14 +2991,16 @@ private: entries.push_back(o); } - std::vector pages = buildPages(false, *lowerBound, *upperBound, entries, 0, [=](){ return self->m_pager->newPageBuffer(); }, self->m_usablePageSizeOverride); + std::vector newPages = buildPages(false, *lowerBound, *upperBound, entries, 0, self->m_pager); + pages = std::move(newPages); - Version writeVersion = self->getLastCommittedVersion() + 1; - std::vector newPageIDs = self->writePages(pages, writeVersion, root, page, upperBound, THIS); + writeVersion = self->getLastCommittedVersion() + 1; + std::vector pageIDs = wait(writePages(self, pages, writeVersion, root, page, upperBound, THIS)); + newPageIDs = std::move(pageIDs); // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_root) { - self->buildNewRoot(writeVersion, pages, 
newPageIDs, page); + if(root == self->m_header.root) { + wait(self->buildNewRoot(self, writeVersion, &pages, &newPageIDs, page)); } VersionedChildrenT vc(1); @@ -2181,16 +3046,25 @@ private: debug_printf("%s: Beginning commit of version %" PRId64 "\n", self->m_name.c_str(), writeVersion); // Get the latest version from the pager, which is what we will read at - Version latestVersion = wait(self->m_pager->getLatestVersion()); - debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); + //Version latestVersion = wait(self->m_pager->getLatestVersion()); + //debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); if(REDWOOD_DEBUG) { self->printMutationBuffer(mutations); } - VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), self->m_root, &dbBegin, &dbEnd, &dbBegin, &dbEnd)); + state RedwoodRecordRef lowerBound = dbBegin.withPageID(self->m_header.root); + VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(/*latestVersion*/), self->m_header.root, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + debug_printf("CommitSubtree(root) returned %s\n", toString(newRoot).c_str()); + ASSERT(newRoot.size() == 1); + + self->m_header.root = newRoot.front().children.front().getPageID(); + self->m_pager->setVersion(writeVersion); + wait(store(self->m_header.lazyDeleteQueue, self->m_lazyDeleteQueue.flush())); + + debug_printf("Setting metakey\n"); + self->m_pager->setMetaKey(self->m_header.asKeyRef()); - self->m_pager->setLatestVersion(writeVersion); debug_printf("%s: Committing pager %" PRId64 "\n", self->m_name.c_str(), writeVersion); wait(self->m_pager->commit()); debug_printf("%s: Committed version %" PRId64 "\n", self->m_name.c_str(), writeVersion); @@ -2202,7 +3076,6 @@ private: self->m_lastCommittedVersion = writeVersion; ++self->counts.commits; -printf("\nCommitted: %s\n", 
self->counts.toString(true).c_str()); committed.send(Void()); return Void(); @@ -2244,13 +3117,13 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return p->isLeaf(); } - Future> getChild(Reference pager, int usablePageSizeOverride) { + Future> getChild(Reference pager) { ASSERT(!isLeaf()); BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); const RedwoodRecordRef &rec = cursor.get(); LogicalPageID id = rec.getPageID(); - Future> child = readPage(pager, id, usablePageSizeOverride, &rec, &next.getOrUpperBound()); + Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); return map(child, [=](Reference page) { return Reference(new PageCursor(id, page, Reference::addRef(this))); }); @@ -2262,7 +3135,6 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); }; LogicalPageID rootPageID; - int usablePageSizeOverride; Reference pager; Reference pageCursor; @@ -2270,8 +3142,8 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); InternalCursor() { } - InternalCursor(Reference pager, LogicalPageID root, int usablePageSizeOverride) - : pager(pager), rootPageID(root), usablePageSizeOverride(usablePageSizeOverride) { + InternalCursor(Reference pager, LogicalPageID root) + : pager(pager), rootPageID(root) { } std::string toString() const { @@ -2334,7 +3206,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } // Otherwise read the root page - Future> root = readPage(pager, rootPageID, usablePageSizeOverride, &dbBegin, &dbEnd); + Future> root = readPage(pager, rootPageID, &dbBegin, &dbEnd); return map(root, [=](Reference p) { pageCursor = Reference(new PageCursor(rootPageID, p)); return Void(); @@ -2368,7 +3240,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return true; } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager)); self->pageCursor = child; } else { 
@@ -2421,7 +3293,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); } } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager)); forward ? child->cursor.moveFirst() : child->cursor.moveLast(); self->pageCursor = child; } @@ -2469,7 +3341,7 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); return true; } - Reference child = wait(self->pageCursor->getChild(self->pager, self->usablePageSizeOverride)); + Reference child = wait(self->pageCursor->getChild(self->pager)); self->pageCursor = child; } else { @@ -2491,9 +3363,9 @@ printf("\nCommitted: %s\n", self->counts.toString(true).c_str()); // KeyValueRefs returned become invalid once the cursor is moved class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { public: - Cursor(Reference pageSource, LogicalPageID root, Version recordVersion, int usablePageSizeOverride) + Cursor(Reference pageSource, LogicalPageID root, Version recordVersion) : m_version(recordVersion), - m_cur1(pageSource, root, usablePageSizeOverride), + m_cur1(pageSource, root), m_cur2(m_cur1) { } @@ -2755,8 +3627,9 @@ class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - IPager *pager = new IndirectShadowPager(filePrefix); - m_tree = new VersionedBTree(pager, filePrefix, true, pager->getUsablePageSize()); + int pageSize = 4096; + IPager2 *pager = new COWPager(4096, filePrefix, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + m_tree = new VersionedBTree(pager, filePrefix, true); m_init = catchError(init_impl(this)); } @@ -3698,29 +4571,30 @@ struct SimpleCounter { TEST_CASE("!/redwood/correctness/btree") { state bool useDisk = true; // MemoryPager is not being maintained currently. 
- state std::string pagerFile = "unittest_pageFile"; - IPager *pager; + state std::string pagerFile = "unittest_pageFile.redwood"; + IPager2 *pager; state bool serialTest = deterministicRandom()->coinflip(); state bool shortTest = deterministicRandom()->coinflip(); state bool singleVersion = true; // Multi-version mode is broken / not finished state double startTime = now(); + state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); + printf("serialTest: %d shortTest: %d singleVersion: %d\n", serialTest, shortTest, singleVersion); if(useDisk) { printf("Deleting existing test data...\n"); deleteFile(pagerFile); - deleteFile(pagerFile + "0.pagerlog"); - deleteFile(pagerFile + "1.pagerlog"); - pager = new IndirectShadowPager(pagerFile); + pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + } + else { + ASSERT(false); + //pager = createMemoryPager(); } - else - pager = createMemoryPager(); printf("Initializing...\n"); - state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? pager->getUsablePageSize() : deterministicRandom()->randomInt(200, 400)); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion, pageSize); + state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); // We must be able to fit at least two any two keys plus overhead in a page to prevent @@ -3728,7 +4602,7 @@ TEST_CASE("!/redwood/correctness/btree") { // TODO: Handle arbitrarily large keys state int maxKeySize = deterministicRandom()->randomInt(4, pageSize * 2); state int maxValueSize = deterministicRandom()->randomInt(0, pageSize * 4); - state int maxCommitSize = shortTest ? 1000 : randomSize(10e6); + state int maxCommitSize = shortTest ? 1000 : 1e5 + randomSize(10e6); state int mutationBytesTarget = shortTest ? 
5000 : randomSize(50e6); state double clearChance = deterministicRandom()->random01() * .1; @@ -3865,6 +4739,7 @@ TEST_CASE("!/redwood/correctness/btree") { Version v = version; // Avoid capture of version as a member of *this commit = map(btree->commit(), [=](Void) { + printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); // Notify the background verifier that version is committed and therefore readable committedVersions.send(v); return Void(); @@ -3901,8 +4776,8 @@ TEST_CASE("!/redwood/correctness/btree") { wait(closedFuture); debug_printf("Reopening btree\n"); - IPager *pager = new IndirectShadowPager(pagerFile); - btree = new VersionedBTree(pager, pagerFile, singleVersion, pageSize); + IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); Version v = wait(btree->getLatestVersion()); @@ -3941,41 +4816,65 @@ TEST_CASE("!/redwood/correctness/btree") { return Void(); } -ACTOR Future randomSeeks(VersionedBTree *btree, int count) { +ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) { state Version readVer = wait(btree->getLatestVersion()); state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); state Reference cur = btree->readAtVersion(readVer); while(c < count) { - state Key k = randomString(20, 'a', 'b'); + wait(yield()); + state Key k = randomString(20, firstChar, lastChar); wait(success(cur->findFirstEqualOrGreater(k, false, 0))); ++c; } double elapsed = timer() - readStart; - printf("Point read speed %d/s\n", int(count / elapsed)); + printf("Random seek speed %d/s\n", int(count / elapsed)); return Void(); } - -TEST_CASE("!/redwood/performance/set") { - state std::string pagerFile = "unittest_pageFile"; +TEST_CASE("!/redwood/correctness/pager/cow") { + state std::string pagerFile = "unittest_pageFile.redwood"; printf("Deleting old test data\n"); 
deleteFile(pagerFile); - deleteFile(pagerFile + "0.pagerlog"); - deleteFile(pagerFile + "1.pagerlog"); - IPager *pager = new IndirectShadowPager(pagerFile); + int pageSize = 4096; + state IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + + wait(success(pager->getLatestVersion())); + state LogicalPageID id = wait(pager->newPageID()); + Reference p = pager->newPageBuffer(); + memset(p->mutate(), (char)id, p->size()); + pager->updatePage(id, p); + pager->setMetaKey(LiteralStringRef("asdfasdf")); + wait(pager->commit()); + Reference p2 = wait(pager->readPage(id)); + printf("%s\n", StringRef(p2->begin(), p2->size()).toHexString().c_str()); + + return Void(); +} + +TEST_CASE("!/redwood/performance/set") { + state std::string pagerFile = "unittest_pageFile.redwood"; + printf("Deleting old test data\n"); + deleteFile(pagerFile); + + int pageSize = 4096; + IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); state bool singleVersion = true; - state VersionedBTree *btree = new VersionedBTree(pager, "unittest_pageFile", singleVersion); + state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); state int nodeCount = 1e9; - state int maxChangesPerVersion = 500000; - state int64_t kvBytesTarget = 200e6; - state int maxKeyPrefixSize = 50; - state int maxValueSize = 100; - state int maxConsecutiveRun = 1; + state int maxChangesPerVersion = 5000; + state int64_t kvBytesTarget = 4000e6; + state int commitTarget = 20e6; + state int maxKeyPrefixSize = 25; + state int maxValueSize = 500; + state int maxConsecutiveRun = 10; + state int minValueSize = 0; + state char firstKeyChar = 'a'; + state char lastKeyChar = 'b'; state int64_t kvBytes = 0; state int64_t kvBytesTotal = 0; state int records = 0; @@ -3987,20 +4886,22 @@ TEST_CASE("!/redwood/performance/set") { state double start = intervalStart; while(kvBytesTotal < kvBytesTarget) { + wait(yield()); + Version 
lastVer = wait(btree->getLatestVersion()); state Version version = lastVer + 1; btree->setWriteVersion(version); int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); - while(changes > 0) { + while(changes > 0 && kvBytes < commitTarget) { KeyValue kv; - kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(sizeof(uint32_t), maxKeyPrefixSize + sizeof(uint32_t) + 1), 'a', 'b'); + kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(sizeof(uint32_t), maxKeyPrefixSize + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); int32_t index = deterministicRandom()->randomInt(0, nodeCount); int runLength = deterministicRandom()->randomInt(1, maxConsecutiveRun + 1); while(runLength > 0 && changes > 0) { *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); - kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(0, value.size())); + kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); btree->set(kv); @@ -4011,7 +4912,7 @@ TEST_CASE("!/redwood/performance/set") { } } - if(kvBytes > 2e6) { + if(kvBytes >= commitTarget) { wait(commit); printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); @@ -4023,6 +4924,7 @@ TEST_CASE("!/redwood/performance/set") { double *pIntervalStart = &intervalStart; commit = map(btree->commit(), [=](Void result) { + printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); double elapsed = timer() - *pIntervalStart; printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); *pIntervalStart = timer(); @@ -4039,7 +4941,7 @@ TEST_CASE("!/redwood/performance/set") { printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); state int reads = 30000; - wait(randomSeeks(btree, reads) && randomSeeks(btree, 
reads) && randomSeeks(btree, reads)); + wait(randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar)); Future closedFuture = btree->onClosed(); btree->close(); diff --git a/flow/FastAlloc.h b/flow/FastAlloc.h index 1959816e54..ddcadc30dd 100644 --- a/flow/FastAlloc.h +++ b/flow/FastAlloc.h @@ -211,6 +211,10 @@ static void* allocateFast(int size) { if (size <= 128) return FastAllocator<128>::allocate(); if (size <= 256) return FastAllocator<256>::allocate(); if (size <= 512) return FastAllocator<512>::allocate(); + if (size <= 1024) return FastAllocator<1024>::allocate(); + if (size <= 2048) return FastAllocator<2048>::allocate(); + if (size <= 4096) return FastAllocator<4096>::allocate(); + if (size <= 8192) return FastAllocator<8192>::allocate(); return new uint8_t[size]; } @@ -222,6 +226,10 @@ static void freeFast(int size, void* ptr) { if (size <= 128) return FastAllocator<128>::release(ptr); if (size <= 256) return FastAllocator<256>::release(ptr); if (size <= 512) return FastAllocator<512>::release(ptr); + if (size <= 1024) return FastAllocator<1024>::release(ptr); + if (size <= 2048) return FastAllocator<2048>::release(ptr); + if (size <= 4096) return FastAllocator<4096>::release(ptr); + if (size <= 8192) return FastAllocator<8192>::release(ptr); delete[](uint8_t*)ptr; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f0913f2a26..f5268aec4a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -63,6 +63,7 @@ add_fdb_test(TEST_FILES ReadAbsent.txt IGNORE) add_fdb_test(TEST_FILES ReadHalfAbsent.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectnessUnits.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectnessBTree.txt IGNORE) +add_fdb_test(TEST_FILES RedwoodCorrectnessPager.txt IGNORE) add_fdb_test(TEST_FILES fast/RedwoodCorrectnessBTree.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectness.txt IGNORE) add_fdb_test(TEST_FILES 
RedwoodPerfTests.txt IGNORE) diff --git a/tests/RedwoodCorrectnessPager.txt b/tests/RedwoodCorrectnessPager.txt new file mode 100644 index 0000000000..1dce72a01a --- /dev/null +++ b/tests/RedwoodCorrectnessPager.txt @@ -0,0 +1,6 @@ +testTitle=UnitTests +testName=UnitTests +startDelay=0 +useDB=false +maxTestCases=0 +testsMatching=!/redwood/correctness/pager From a04e3cce2357169e02a61a208bfa80667ce587d0 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 7 Aug 2019 04:11:33 -0700 Subject: [PATCH 003/184] Bug fixes in COWPager cache. Write while a read is outstanding is supported, the new write will wait on the prior read. New writes also wait on old writes. Cache entries no longer cancel in-progress operations when evicted. --- fdbserver/VersionedBTree.actor.cpp | 48 ++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 984c3c2d9a..47b2c46248 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -553,15 +553,33 @@ public: PageCacheEntry &cacheEntry = pageCache.get(pageID); debug_printf("COWPager op=write id=%u cached=%d\n", pageID, cacheEntry.page.isValid()); - // If the cache entry exists and has already been read, copy data over top of the page in the cache - // so any holders of the page reference see the change. - if(cacheEntry.page.isValid()) { - // It should not be the case that we write a page that is still being read. - ASSERT(cacheEntry.page.isReady()); + // If the page is still being read then it's not also being written because a write places + // the new content in the cache entry when the write is launched, not when it is completed. + // Any waiting readers should not see this write (though this might change) + if(cacheEntry.reading()) { + // Wait for the read to finish, then start the right. 
+ cacheEntry.writeFuture = map(success(cacheEntry.page), [=](Void) { + writePhysicalPage(pageID, data); + return Void(); + }); + } + else { + // If the page is being written, wait for this write before issuing the new write + if(cacheEntry.writing()) { + cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { + writePhysicalPage(pageID, data); + return Void(); + }); + } + else { + cacheEntry.writeFuture = writePhysicalPage(pageID, data); + } } - cacheEntry.page = data; - writes.add(writePhysicalPage(pageID, data)); + writes.add(cacheEntry.writeFuture); + + // Always update the page contents immediately regardless of what happened above. + cacheEntry.page = data; } Future atomicUpdatePage(LogicalPageID pageID, Reference data) { @@ -759,13 +777,19 @@ private: struct PageCacheEntry { Future> page; + Future writeFuture; + + bool reading() const { + return page.isValid() && !page.isReady(); + } + + bool writing() const { + return writeFuture.isValid() && !writeFuture.isReady(); + } bool evictable() const { - // Don't evict if a page is still being read - return page.isReady(); - } - ~PageCacheEntry() { - page.cancel(); + // Don't evict if a page is still being read or written + return page.isReady() && !writing(); } }; From bd8ed07f4d803dd0e956854960455c0b517e51cc Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 7 Aug 2019 04:31:11 -0700 Subject: [PATCH 004/184] Missing header. 
--- fdbserver/VersionedBTree.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 7be0cbf95b..075204e0fc 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -27,6 +27,7 @@ #include "flow/UnitTest.h" #include "fdbserver/IPager.h" #include "fdbrpc/IAsyncFile.h" +#include "fdbrpc/crc32c.h" #include "flow/ActorCollection.h" #include "fdbserver/MemoryPager.h" #include "fdbserver/IndirectShadowPager.h" From 2322571df22623b0bf63764b5f902c8519e3e187 Mon Sep 17 00:00:00 2001 From: mpilman Date: Wed, 7 Aug 2019 08:28:14 -0700 Subject: [PATCH 005/184] Ported flow tutorial to fdb 6 --- documentation/CMakeLists.txt | 1 + documentation/tutorial/CMakeLists.txt | 4 + documentation/tutorial/tutorial.actor.cpp | 470 ++++++++++++++++++++++ 3 files changed, 475 insertions(+) create mode 100644 documentation/tutorial/CMakeLists.txt create mode 100644 documentation/tutorial/tutorial.actor.cpp diff --git a/documentation/CMakeLists.txt b/documentation/CMakeLists.txt index ba4b299433..83fabf20ba 100644 --- a/documentation/CMakeLists.txt +++ b/documentation/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(tutorial) # build a virtualenv set(sphinx_dir ${CMAKE_CURRENT_SOURCE_DIR}/sphinx) set(venv_dir ${CMAKE_CURRENT_BINARY_DIR}/venv) diff --git a/documentation/tutorial/CMakeLists.txt b/documentation/tutorial/CMakeLists.txt new file mode 100644 index 0000000000..5c5e181625 --- /dev/null +++ b/documentation/tutorial/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TUTORIAL_SRCS tutorial.actor.cpp) + +add_flow_target(EXECUTABLE NAME tutorial SRCS "${TUTORIAL_SRCS}") +target_link_libraries(tutorial PUBLIC fdbclient) diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp new file mode 100644 index 0000000000..bf1d5c58b0 --- /dev/null +++ b/documentation/tutorial/tutorial.actor.cpp @@ -0,0 +1,470 @@ +/* + * fdbcli.actor.cpp + * + * This source 
file is part of the FoundationDB open source project + * + * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flow/flow.h" +#include "flow/Platform.h" +#include "flow/DeterministicRandom.h" +#include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/ReadYourWrites.h" +#include +#include +#include +#include +#include "flow/actorcompiler.h" + +NetworkAddress serverAddress; + +// this is a simple actor that will report how long +// it is already running once a second. +ACTOR Future simpleTimer() { + // we need to remember the time when we first + // started. + // This needs to be a state-variable because + // we will use it in different parts of the + // actor. If you don't understand how state + // variables work, it is a good idea to remove + // the state keyword here and look at the + // generated C++ code from the actor compiler. + state double start_time = g_network->now(); + loop { + wait(delay(1.0)); + std::cout << format("Time: %.2f\n", g_network->now() - start_time); + } +} + +// A actor that demonstrates how choose-when +// blocks work. 
+ACTOR Future someFuture(Future ready) { + loop { + choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(int r = wait(ready)) { + std::cout << format("Ready %d\n", r); + wait(delay(double(r))); + std::cout << "Done\n"; + return Void(); + } + } + } +} + +ACTOR Future promiseDemo() { + state Promise promise; + state Future f = someFuture(promise.getFuture()); + wait(delay(3.0)); + promise.send(2); + wait(f); + return Void(); +} + +ACTOR Future eventLoop(AsyncTrigger* trigger) { + loop { + + choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(wait(trigger->onTrigger())) { std::cout << "Triggered!\n"; } + } + } +} + +ACTOR Future triggerDemo() { + state int runs = 1; + state AsyncTrigger trigger; + state Future triggerLoop = eventLoop(&trigger); + while (++runs < 10) { + wait(delay(1.0)); + std::cout << "trigger.."; + trigger.trigger(); + } + std::cout << "Done."; + return Void(); +} + +struct EchoServerInterface { + constexpr static FileIdentifier file_identifier = 3152015; + RequestStream getInterface; + RequestStream echo; + RequestStream reverse; + + template + void serialize(Ar& ar) { + serializer(ar, echo, reverse); + } +}; + +struct GetInterfaceRequest { + constexpr static FileIdentifier file_identifier = 12004156; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct EchoRequest { + constexpr static FileIdentifier file_identifier = 10624019; + std::string message; + // this variable has to be called reply! + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, message, reply); + } +}; + +struct ReverseRequest { + constexpr static FileIdentifier file_identifier = 10765955; + std::string message; + // this variable has to be called reply! 
+ ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, message, reply); + } +}; + +uint64_t tokenCounter = 1; + +ACTOR Future echoServer() { + state EchoServerInterface echoServer; + echoServer.getInterface.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint); + loop { + choose { + when(GetInterfaceRequest req = waitNext(echoServer.getInterface.getFuture())) { + req.reply.send(echoServer); + } + when(EchoRequest req = waitNext(echoServer.echo.getFuture())) { req.reply.send(req.message); } + when(ReverseRequest req = waitNext(echoServer.reverse.getFuture())) { + req.reply.send(std::string(req.message.rbegin(), req.message.rend())); + } + } + } +} + +ACTOR Future echoClient() { + state EchoServerInterface server; + server.getInterface = RequestStream(Endpoint({ serverAddress }, UID(-1, ++tokenCounter))); + EchoServerInterface s = wait(server.getInterface.getReply(GetInterfaceRequest())); + server = s; + EchoRequest echoRequest; + echoRequest.message = "Hello World"; + std::string echoMessage = wait(server.echo.getReply(echoRequest)); + std::cout << format("Sent {} to echo, received %s\n", "Hello World", echoMessage.c_str()); + ReverseRequest reverseRequest; + reverseRequest.message = "Hello World"; + std::string reverseString = wait(server.reverse.getReply(reverseRequest)); + std::cout << format("Sent {} to reverse, received {}\n", "Hello World", reverseString.c_str()); + return Void(); +} + +struct SimpleKeyValueStoreInteface { + constexpr static FileIdentifier file_identifier = 8226647; + RequestStream connect; + RequestStream get; + RequestStream set; + RequestStream clear; + + template + void serialize(Ar& ar) { + serializer(ar, connect, get, set, clear); + } +}; + +struct GetKVInterface { + constexpr static FileIdentifier file_identifier = 8062308; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, reply); + } +}; + +struct GetRequest { + constexpr static FileIdentifier file_identifier 
= 6983506; + std::string key; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, key, reply); + } +}; + +struct SetRequest { + constexpr static FileIdentifier file_identifier = 7554186; + std::string key; + std::string value; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, key, value, reply); + } +}; + +struct ClearRequest { + constexpr static FileIdentifier file_identifier = 8500026; + std::string from; + std::string to; + ReplyPromise reply; + + template + void serialize(Ar& ar) { + serializer(ar, from, to, reply); + } +}; + +ACTOR Future kvStoreServer() { + state SimpleKeyValueStoreInteface inf; + state std::map store; + inf.connect.makeWellKnownEndpoint(UID(-1, ++tokenCounter), TaskPriority::DefaultEndpoint); + loop { + choose { + when(GetKVInterface req = waitNext(inf.connect.getFuture())) { + std::cout << "Received connection attempt\n"; + req.reply.send(inf); + } + when(GetRequest req = waitNext(inf.get.getFuture())) { + auto iter = store.find(req.key); + if (iter == store.end()) { + req.reply.sendError(io_error()); + } else { + req.reply.send(iter->second); + } + } + when(SetRequest req = waitNext(inf.set.getFuture())) { + store[req.key] = req.value; + req.reply.send(Void()); + } + when(ClearRequest req = waitNext(inf.clear.getFuture())) { + auto from = store.lower_bound(req.from); + auto to = store.lower_bound(req.to); + while (from != store.end() && from != to) { + auto next = from; + ++next; + store.erase(from); + from = next; + } + req.reply.send(Void()); + } + } + } +} + +ACTOR Future connect() { + std::cout << format("%ull: Connect...\n", uint64_t(g_network->now())); + SimpleKeyValueStoreInteface c; + c.connect = RequestStream(Endpoint({ serverAddress }, UID(-1, ++tokenCounter))); + SimpleKeyValueStoreInteface result = wait(c.connect.getReply(GetKVInterface())); + std::cout << format("%ull: done..\n", uint64_t(g_network->now())); + return result; +} + +ACTOR Future kvSimpleClient() { + state 
SimpleKeyValueStoreInteface server = wait(connect()); + std::cout << format("Set %s -> %s\n", "foo", "bar"); + SetRequest setRequest; + setRequest.key = "foo"; + setRequest.value = "bar"; + wait(server.set.getReply(setRequest)); + GetRequest getRequest; + getRequest.key = "foo"; + std::string value = wait(server.get.getReply(getRequest)); + std::cout << format("get(%s) -> %s\n", "foo", value.c_str()); + return Void(); +} + +ACTOR Future kvClient(SimpleKeyValueStoreInteface server, std::shared_ptr ops) { + state Future timeout = delay(20); + state int rangeSize = 2 << 12; + loop { + SetRequest setRequest; + setRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize)); + setRequest.value = "foo"; + wait(server.set.getReply(setRequest)); + ++(*ops); + try { + GetRequest getRequest; + getRequest.key = std::to_string(deterministicRandom()->randomInt(0, rangeSize)); + std::string _ = wait(server.get.getReply(getRequest)); + ++(*ops); + } catch (Error& e) { + if (e.code() != error_code_io_error) { + throw e; + } + } + int from = deterministicRandom()->randomInt(0, rangeSize); + ClearRequest clearRequest; + clearRequest.from = std::to_string(from); + clearRequest.to = std::to_string(from + 100); + wait(server.clear.getReply(clearRequest)); + ++(*ops); + if (timeout.isReady()) { + // we are done + return Void(); + } + } +} + +ACTOR Future throughputMeasurement(std::shared_ptr operations) { + loop { + wait(delay(1.0)); + std::cout << format("%ull op/s\n", *operations); + *operations = 0; + } +} + +ACTOR Future multipleClients() { + SimpleKeyValueStoreInteface server = wait(connect()); + auto ops = std::make_shared(0); + std::vector> clients(100); + for (auto& f : clients) { + f = kvClient(server, ops); + } + auto done = waitForAll(clients); + wait(done || throughputMeasurement(ops)); + return Void(); +} + +std::string clusterFile = "fdb.cluster"; + +ACTOR Future fdbClient() { + wait(delay(30)); + state Database db = Database::createDatabase(clusterFile, 
300); + state Transaction tx(db); + state std::string keyPrefix = "/tut/"; + state Key startKey; + state KeyRef endKey = LiteralStringRef("/tut0"); + state int beginIdx = 0; + loop { + try { + tx.reset(); + // this workload is stupidly simple: + // 1. select a random key between 1 + // and 1e8 + // 2. select this key plus the 100 + // next ones + // 3. write 10 values in [k, k+100] + beginIdx = deterministicRandom()->randomInt(0, 1e8 - 100); + startKey = keyPrefix + std::to_string(beginIdx); + Standalone range = wait(tx.getRange(KeyRangeRef(startKey, endKey), 100)); + for (int i = 0; i < 10; ++i) { + Key k = Key(keyPrefix + std::to_string(beginIdx + deterministicRandom()->randomInt(0, 100))); + tx.set(k, LiteralStringRef("foo")); + } + wait(tx.commit()); + std::cout << "Committed\n"; + wait(delay(2.0)); + } catch (Error& e) { + wait(tx.onError(e)); + } + } +} + +ACTOR Future fdbStatusStresser() { + state Database db = Database::createDatabase(clusterFile, 300); + state ReadYourWritesTransaction tx(db); + state Key statusJson(std::string("\xff\xff/status/json")); + loop { + try { + tx.reset(); + Optional _ = wait(tx.get(statusJson)); + } catch (Error& e) { + wait(tx.onError(e)); + } + } +} + +std::unordered_map()>> actors = { { "timer", &simpleTimer }, + { "promiseDemo", &promiseDemo }, + { "triggerDemo", &triggerDemo }, + { "echoServer", &echoServer }, + { "echoClient", &echoClient }, + { "kvStoreServer", &kvStoreServer }, + { "kvSimpleClient", &kvSimpleClient }, + { "multipleClients", &multipleClients }, + { "fdbClient", &fdbClient }, + { "fdbStatusStresser", &fdbStatusStresser } }; + +int main(int argc, char* argv[]) { + bool isServer = false; + std::string port; + std::vector()>> toRun; + // parse arguments + for (int i = 1; i < argc; ++i) { + std::string arg(argv[i]); + if (arg == "-p") { + isServer = true; + if (i + 1 >= argc) { + std::cout << "Excpecting an argument after -p\n"; + return 1; + } + port = std::string(argv[++i]); + continue; + } else if (arg == 
"-s") { + if (i + 1 >= argc) { + std::cout << "Excpecting an argument after -s\n"; + return 1; + } + serverAddress = NetworkAddress::parse(argv[++i]); + continue; + } else if (arg == "-C") { + clusterFile = argv[++i]; + std::cout << "Using cluster file " << clusterFile << std::endl; + continue; + } + auto actor = actors.find(arg); + if (actor == actors.end()) { + std::cout << format("Error: actor %s does not exist\n", arg.c_str()); + return 1; + } + toRun.push_back(actor->second); + } + platformInit(); + g_network = newNet2(false, true); + NetworkAddress publicAddress = NetworkAddress::parse("0.0.0.0:0"); + if (isServer) { + publicAddress = NetworkAddress::parse("0.0.0.0:" + port); + } + // openTraceFile(publicAddress, TRACE_DEFAULT_ROLL_SIZE, + // TRACE_DEFAULT_MAX_LOGS_SIZE); + try { + if (isServer) { + auto listenError = FlowTransport::transport().bind(publicAddress, publicAddress); + if (listenError.isError()) { + listenError.get(); + } + } + } catch (Error& e) { + std::cout << format("Error while binding to address (%d): %s\n", e.code(), e.what()); + } + // now we start the actors + std::vector> all; + for (auto& f : toRun) { + all.emplace_back(f()); + } + auto f = stopAfter(waitForAll(all)); + g_network->run(); + return 0; +} From f81eeea495575994009bc0dc18cd8a73ad9bb773 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 8 Aug 2019 02:57:23 -0700 Subject: [PATCH 006/184] Bug fixes. COWPager initialization was not flushing non-header pages with fsync() before writing and syncing the header. FIFOQueue was writing the initial page of a new queue multiple times. FIFOQueue::writePage() would unnecessarily (and invalidly) attempt to write if the page is not yet loaded. 
--- fdbserver/IPager.h | 1 + fdbserver/VersionedBTree.actor.cpp | 52 ++++++++++++++++-------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 4d449f917a..8eb47283de 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -40,6 +40,7 @@ #endif #define BEACON fprintf(stderr, "%s: %s line %d \n", __FUNCTION__, __FILE__, __LINE__) +#define TRACE fprintf(stderr, "%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); #ifndef VALGRIND #define VALGRIND_MAKE_MEM_UNDEFINED(x, y) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 075204e0fc..c41575dde9 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -73,7 +73,8 @@ public: Cursor() : queue(nullptr) { } - void init(FIFOQueue *q, LogicalPageID p) { + void initNew(FIFOQueue *q, LogicalPageID p) { + debug_printf("New queue cursor at page id=%u write=%d\n", p, write); queue = q; pageID = p; index = 0; @@ -82,7 +83,8 @@ public: writePage(); } - void init(FIFOQueue *q, LogicalPageID p, int i) { + void initExisting(FIFOQueue *q, LogicalPageID p, int i) { + debug_printf("Loading queue cursor at page id=%u index=%d\n", p, i); queue = q; pageID = p; index = i; @@ -140,7 +142,10 @@ public: } void writePage() { - queue->pager->updatePage(pageID, page); + // If the page isn't loaded yet then there can't possibly be anything new to write + if(loading.isReady()) { + queue->pager->updatePage(pageID, page); + } } ACTOR static Future waitThenWriteNext(Cursor *self, T item) { @@ -207,9 +212,9 @@ public: name = queueName; numPages = 1; numEntries = 0; - head.init(this, newPageID); - tail.init(this, newPageID); - stop.init(this, newPageID); + tail.initNew(this, newPageID); + head.initExisting(this, tail.pageID, tail.index); + stop.initExisting(this, tail.pageID, tail.index); ASSERT(flush().isReady()); } @@ -221,9 +226,9 @@ public: name = queueName; numPages 
= qs.numPages; numEntries = qs.numEntries; - head.init(this, qs.headPageID, qs.headIndex); - tail.init(this, qs.tailPageID, qs.tailIndex); - stop.init(this, qs.tailPageID, qs.tailIndex); + head.initExisting(this, qs.headPageID, qs.headIndex); + tail.initExisting(this, qs.tailPageID, qs.tailIndex); + stop.initExisting(this, qs.tailPageID, qs.tailIndex); ASSERT(flush().isReady()); } @@ -246,10 +251,12 @@ public: } ACTOR static Future writeActor(FIFOQueue *self, FutureStream queue) { + state bool modified = false; try { loop { state T item = waitNext(queue); - self->tail.writeNext(item); + wait(self->tail.writeNext(item)); + modified = true; } } catch(Error &e) { @@ -258,8 +265,10 @@ public: } } - self->tail.writePage(); - self->stop.init(self, self->tail.pageID, self->tail.index); + if(modified) { + self->tail.writePage(); + self->stop.initExisting(self, self->tail.pageID, self->tail.index); + } return self->getState(); } @@ -494,15 +503,9 @@ public: // Create a new free list at page 1 self->freeList.init(self, 1, "FreeListNew"); - // Flush free list, store state in header - store(self->pHeader->freeList, self->freeList.flush()); - // Clear remaining bytes of header memset(self->headerPage->mutate() + self->pHeader->size(), 0, self->headerPage->size() - self->pHeader->size()); - // Update header page on disk and sync - wait(self->writePhysicalPage(0, self->headerPage)); - wait(self->commit()); } @@ -552,7 +555,7 @@ public: void updatePage(LogicalPageID pageID, Reference data) { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager op=write id=%u cached=%d\n", pageID, cacheEntry.page.isValid()); + debug_printf("COWPager op=write id=%u cached=%d reading=%d writing=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content in the cache entry when the write is launched, not 
when it is completed. @@ -601,6 +604,7 @@ public: try { int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); + debug_printf("op=read_complete id=%u bytes=%d\n", pageID, readBytes); ASSERT(readBytes == self->physicalPageSize); ASSERT(((Page *)page.getPtr())->verifyChecksum(pageID)); } catch(Error &e) { @@ -616,7 +620,7 @@ public: // Reads the most recent version of pageID either committed or written using updatePage() Future> readPage(LogicalPageID pageID) { PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager op=read id=%u cached=%d ready=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.page.isValid() && cacheEntry.page.isReady()); + debug_printf("COWPager op=read id=%u cached=%d reading=%d writing=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); if(!cacheEntry.page.isValid()) { cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); @@ -644,10 +648,7 @@ public: ACTOR static Future commit_impl(COWPager *self) { // Flush the free list queue to the pager - LogicalPageQueueT::QueueState freeListState = wait(self->freeList.flush()); - - // Update header in memory - self->pHeader->freeList = freeListState; + wait(store(self->pHeader->freeList, self->freeList.flush())); // Wait for all outstanding writes to complete wait(self->writes.signalAndCollapse()); @@ -655,10 +656,11 @@ public: // Sync everything except the header wait(self->pageFile->sync()); - // Update header on disk and sync + // Update header on disk and sync again. wait(self->writePhysicalPage(0, self->headerPage)); wait(self->pageFile->sync()); + // Update last committed state for use in creating snapshots at current version. 
self->lastCommittedVersion = self->pHeader->committedVersion; self->lastCommittedMeta = self->pHeader->getMetaKey(); From 046dd76d73a49c203a6e12204c39830e2c7c0206 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 8 Aug 2019 23:08:08 -0700 Subject: [PATCH 007/184] Cleaned up the Redwood BTree correctness test, tweaked the parameters to favor shorter tests and added a time limit. --- fdbserver/VersionedBTree.actor.cpp | 56 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c41575dde9..19440b9b77 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -4596,44 +4596,46 @@ struct SimpleCounter { }; TEST_CASE("!/redwood/correctness/btree") { - state bool useDisk = true; // MemoryPager is not being maintained currently. - state std::string pagerFile = "unittest_pageFile.redwood"; IPager2 *pager; state bool serialTest = deterministicRandom()->coinflip(); state bool shortTest = deterministicRandom()->coinflip(); state bool singleVersion = true; // Multi-version mode is broken / not finished - state double startTime = now(); state int pageSize = shortTest ? 200 : (deterministicRandom()->coinflip() ? 4096 : deterministicRandom()->randomInt(200, 400)); - printf("serialTest: %d shortTest: %d singleVersion: %d\n", serialTest, shortTest, singleVersion); - - if(useDisk) { - printf("Deleting existing test data...\n"); - deleteFile(pagerFile); - pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); - } - else { - ASSERT(false); - //pager = createMemoryPager(); - } - - printf("Initializing...\n"); - state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); - wait(btree->init()); - // We must be able to fit at least two any two keys plus overhead in a page to prevent // a situation where the tree cannot be grown upward with decreasing level size. 
- // TODO: Handle arbitrarily large keys state int maxKeySize = deterministicRandom()->randomInt(4, pageSize * 2); state int maxValueSize = deterministicRandom()->randomInt(0, pageSize * 4); - state int maxCommitSize = shortTest ? 1000 : 1e5 + randomSize(10e6); - state int mutationBytesTarget = shortTest ? 5000 : randomSize(50e6); - state double clearChance = deterministicRandom()->random01() * .1; + state int maxCommitSize = shortTest ? 1000 : randomSize(std::min((maxKeySize + maxValueSize) * 20000, 10e6)); + state int mutationBytesTarget = shortTest ? 5000 : randomSize(std::min(maxCommitSize * 100, 100e6)); + state double clearProbability = deterministicRandom()->random01() * .1; + state double coldStartProbability = deterministicRandom()->random01(); + state double maxWallClockDuration = 60; - printf("Using page size %d, max key size %d, max value size %d, clearchance %f, total mutation byte target %d\n", pageSize, maxKeySize, maxValueSize, clearChance, mutationBytesTarget); + printf("\n"); + printf("serialTest: %d\n", serialTest); + printf("shortTest: %d\n", shortTest); + printf("singleVersion: %d\n", serialTest); + printf("pageSize: %d\n", pageSize); + printf("maxKeySize: %d\n", maxKeySize); + printf("maxValueSize: %d\n", maxValueSize); + printf("maxCommitSize: %d\n", maxCommitSize); + printf("mutationBytesTarget: %d\n", mutationBytesTarget); + printf("clearProbability: %f\n", clearProbability); + printf("coldStartProbability: %f\n", coldStartProbability); + printf("\n"); + + printf("Deleting existing test data...\n"); + deleteFile(pagerFile); + + printf("Initializing...\n"); + state double startTime = timer(); + pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); + wait(btree->init()); state std::map, Optional> written; state std::set keys; @@ -4660,7 +4662,7 @@ TEST_CASE("!/redwood/correctness/btree") { state Future commit = Void(); - 
while(mutationBytes.get() < mutationBytesTarget) { + while(mutationBytes.get() < mutationBytesTarget && (timer() - startTime) < maxWallClockDuration) { if(now() - startTime > 600) { mutationBytesTarget = mutationBytes.get(); } @@ -4672,7 +4674,7 @@ TEST_CASE("!/redwood/correctness/btree") { } // Sometimes do a clear range - if(deterministicRandom()->random01() < clearChance) { + if(deterministicRandom()->random01() < clearProbability) { Key start = randomKV(maxKeySize, 1).key; Key end = (deterministicRandom()->random01() < .01) ? keyAfter(start) : randomKV(maxKeySize, 1).key; @@ -4786,7 +4788,7 @@ TEST_CASE("!/redwood/correctness/btree") { mutationBytesTargetThisCommit = randomSize(maxCommitSize); // Recover from disk at random - if(!serialTest && useDisk && deterministicRandom()->random01() < .02) { + if(!serialTest && deterministicRandom()->random01() < coldStartProbability) { printf("Recovering from disk.\n"); // Wait for outstanding commit From 2f92cf8c96f4e789984891e2db16c5a2377d1e14 Mon Sep 17 00:00:00 2001 From: Trevor Clinkenbeard Date: Thu, 29 Aug 2019 16:34:49 -0700 Subject: [PATCH 008/184] Use lock aware transaction for pingDatabase --- fdbserver/tester.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 43f9a01cb9..c9cb4f0867 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -360,6 +360,7 @@ ACTOR Future pingDatabase( Database cx ) { loop { try { tr.setOption( FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE ); + tr.setOption( FDBTransactionOptions::LOCK_AWARE ); Optional v = wait( tr.get( StringRef("/Liveness/" + deterministicRandom()->randomUniqueID().toString() ) ) ); tr.makeSelfConflicting(); wait( tr.commit() ); From abc22d261006f311fe930c93298a77e377515e1f Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 11 Aug 2019 03:26:00 -0700 Subject: [PATCH 009/184] COWPager bug fixes involving shut down while operations are in progress. 
--- fdbserver/VersionedBTree.actor.cpp | 68 ++++++++++++++++++------------ 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 19440b9b77..6d2b022af9 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -408,8 +408,12 @@ public: return entry.item; } - void clear() { + // Clears the cache and calls destroy() on each ObjectType + void destroy() { evictionOrder.clear(); + for(auto &entry : cache) { + entry.second.item.destroy(); + } cache.clear(); } @@ -426,6 +430,19 @@ private: std::unordered_map cache; }; +ACTOR template Future forwardError(Future f, Promise target) { + try { + T x = wait(f); + return x; + } + catch(Error &e) { + if(e.code() != error_code_actor_cancelled && target.canBeSet()) { + target.sendError(e); + } + + throw e; + } +} class COWPager : public IPager2 { public: @@ -435,7 +452,7 @@ public: // If the file already exists, pageSize might be different than desiredPageSize COWPager(int desiredPageSize, std::string filename, int cachedPageLimit) : desiredPageSize(desiredPageSize), filename(filename), pageCache(cachedPageLimit), pHeader(nullptr) { commitFuture = Void(); - recoverFuture = recover(this); + recoverFuture = forwardError(recover(this), errorPromise); } void setPageSize(int size) { @@ -538,12 +555,14 @@ public: return ++pHeader->pageCount; } - return map(nextPageID, [=](Optional nextPageID) { + Future f = map(nextPageID, [=](Optional nextPageID) { if(nextPageID.present()) { return nextPageID.get(); } return (LogicalPageID)++(pHeader->pageCount); }); + + return forwardError(f, errorPromise); }; Future writePhysicalPage(PhysicalPageID pageID, Reference page) { @@ -580,18 +599,19 @@ public: } } - writes.add(cacheEntry.writeFuture); + writes.add(forwardError(cacheEntry.writeFuture, errorPromise)); // Always update the page contents immediately regardless of what happened above. 
cacheEntry.page = data; } Future atomicUpdatePage(LogicalPageID pageID, Reference data) { - freePage(pageID); - return map(newPageID(), [=](LogicalPageID newPageID) { + Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); return newPageID; }); + + return forwardError(f, errorPromise); } // Free pageID to be used again after the next commit @@ -601,19 +621,10 @@ public: ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { state Reference page = self->newPageBuffer(); - - try { - int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); - debug_printf("op=read_complete id=%u bytes=%d\n", pageID, readBytes); - ASSERT(readBytes == self->physicalPageSize); - ASSERT(((Page *)page.getPtr())->verifyChecksum(pageID)); - } catch(Error &e) { - if(e.code() != error_code_actor_cancelled) { - self->errorPromise.sendError(e); - } - throw; - } - + int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); + debug_printf("op=read_complete id=%u bytes=%d\n", pageID, readBytes); + ASSERT(readBytes == self->physicalPageSize); + ASSERT(((Page *)page.getPtr())->verifyChecksum(pageID)); return page; } @@ -626,7 +637,7 @@ public: cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); } - return cacheEntry.page; + return forwardError(cacheEntry.page, errorPromise); } // Get snapshot as of the most recent committed version of the pager @@ -671,7 +682,7 @@ public: Future commit() { // Can't have more than one commit outstanding. 
ASSERT(commitFuture.isReady()); - commitFuture = commit_impl(this); + commitFuture = forwardError(commit_impl(this), errorPromise); return commitFuture; } @@ -690,15 +701,15 @@ public: ACTOR void shutdown(COWPager *self, bool dispose) { self->recoverFuture.cancel(); + self->commitFuture.cancel(); if(self->errorPromise.canBeSet()) self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress - // Cancel all reads. Any in-progress writes will be holding references to their required pages - self->pageCache.clear(); + // Destroy the cache, cancelling reads and writes in progress + self->pageCache.destroy(); wait(ready(self->writes.signal())); - wait(ready(self->commitFuture)); self->pageFile.clear(); @@ -722,10 +733,6 @@ public: return closedPromise.getFuture(); } - Future onError() { - return errorPromise.getFuture(); - } - Future onClose() { return closedPromise.getFuture(); } @@ -794,6 +801,11 @@ private: // Don't evict if a page is still being read or written return page.isReady() && !writing(); } + + void destroy() { + page.cancel(); + writeFuture.cancel(); + } }; // Physical page sizes will always be a multiple of 4k because AsyncFileNonDurable requires From e0873e2ba07a3ff230bd928892a4f39c9316ffd8 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 11 Aug 2019 18:33:20 -0700 Subject: [PATCH 010/184] Removed COWPager snapshot lifetime management for now as it's the wrong strategy and causes crashes when snapshot references outlive the pager. 
--- fdbserver/VersionedBTree.actor.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 6d2b022af9..da675b7eff 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -643,20 +643,6 @@ public: // Get snapshot as of the most recent committed version of the pager Reference getReadSnapshot(); - void snapshotDestroyed(Version v) { - auto i = snapshotsInUse.find(v); - ASSERT(i != snapshotsInUse.end()); - ASSERT(i->second > 0); - --i->second; - bool first = i == snapshotsInUse.begin(); - if(i->second == 0) { - snapshotsInUse.erase(i); - if(first) { - leastSnapshotVersionChanged.trigger(); - } - } - } - ACTOR static Future commit_impl(COWPager *self) { // Flush the free list queue to the pager wait(store(self->pHeader->freeList, self->freeList.flush())); @@ -847,7 +833,6 @@ public: COWPagerSnapshot(COWPager *pager, Key meta, Version version) : pager(pager), metaKey(meta), version(version) { } virtual ~COWPagerSnapshot() { - pager->snapshotDestroyed(version); } Future> getPhysicalPage(LogicalPageID pageID) { From 57f55c1e99b55071119d55079405dfb1c66669f1 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 13 Aug 2019 22:41:41 -0700 Subject: [PATCH 011/184] Bug fix - FIFOQueue design changed to not rely on the durability of unchanging bytes in modified pages that are not fsync'd. --- fdbserver/VersionedBTree.actor.cpp | 164 ++++++++++++++++++----------- 1 file changed, 105 insertions(+), 59 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index da675b7eff..f830f0939d 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -42,6 +42,13 @@ #include // A FIFO queue of T stored as a linked list of pages. +// Each page contains some number of T items and a link to the next page. 
+// When the queue is flushed, the final page is ended and linked to a newly allocated +// but not-yet-written-to page, which future writes after the flush will write to. +// Committing changes to a queue involves flushing the queue, calling fsync, and then +// writing the QueueState somewhere and making it durable. +// The write pattern is designed such that non-fsync'd writes are not relied on, to include +// unchanging bytes in a page that was updated but not fsync'd. template class FIFOQueue { static_assert(std::is_trivially_copyable::value); @@ -52,11 +59,11 @@ public: LogicalPageID headPageID = invalidLogicalPageID; LogicalPageID tailPageID = invalidLogicalPageID; uint16_t headIndex; - uint16_t tailIndex; + // Note that there is no tail index because the tail page is always never-before-written and its index will start at 0 int64_t numPages; int64_t numEntries; std::string toString() const { - return format("head: %u:%d tail: %u:%d numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, (int)tailIndex, numPages, numEntries); + return format("head: %u:%d tail: %u:%d numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, numPages, numEntries); } }; #pragma pack(pop) @@ -70,25 +77,44 @@ public: FIFOQueue *queue; Future loading; + // Cursor will not read this page or anything beyond it. 
+ LogicalPageID endPageID; + Cursor() : queue(nullptr) { } - void initNew(FIFOQueue *q, LogicalPageID p) { - debug_printf("New queue cursor at page id=%u write=%d\n", p, write); - queue = q; - pageID = p; - index = 0; - page = queue->pager->newPageBuffer(); - loading = Void(); - writePage(); + void setEnd(Cursor &end) { + endPageID = end.pageID; } - void initExisting(FIFOQueue *q, LogicalPageID p, int i) { + // Point cursor to a page which has never been written before, allocate + // a page buffer and initialize it + void initWrite(FIFOQueue *q, LogicalPageID newPageID) { + debug_printf("New queue cursor at page id=%u write=%d\n", newPageID, write); + queue = q; + pageID = newPageID; + initNewPageBuffer(); + } + + // Point cursor to a page to read from. Begin loading the page if beginLoad is set. + void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID endPageID) { debug_printf("Loading queue cursor at page id=%u index=%d\n", p, i); queue = q; pageID = p; index = i; - loading = loadPage(p, i); + + // If cursor is not pointed at the end page then start loading it. + // The end page will not have been written to disk yet. + loading = (p == endPageID) ? 
Future() : loadPage(); + } + + void initNewPageBuffer() { + index = 0; + page = queue->pager->newPageBuffer(); + auto p = raw(); + p->next = 0; + p->count = 0; + loading = Void(); } Cursor(Cursor &) = delete; @@ -98,9 +124,14 @@ public: loading.cancel(); } + Future ready() { + return loading; + } + #pragma pack(push, 1) struct RawPage { LogicalPageID next; + uint32_t count; inline T & at(int i) { return ((T *)(this + 1))[i]; @@ -108,44 +139,46 @@ public: }; #pragma pack(pop) - bool end() const { - return index == queue->itemsPerPage; + RawPage * raw() const { + return ((RawPage *)(page->begin())); } - Future loadPage(LogicalPageID newPageID, int newIndex) { - debug_printf("queue(%p, %s) loading page %u index %d\n", this, queue->name.c_str(), newPageID, newIndex); - return map(queue->pager->readPage(newPageID), [=](Reference p) { + Future loadPage() { + debug_printf("queue(%p, %s) loading page %u index %d\n", this, queue->name.c_str(), pageID, index); + return map(queue->pager->readPage(pageID), [=](Reference p) { page = p; - pageID = newPageID; - index = newIndex; return Void(); }); } Future newPage() { + ASSERT(page); debug_printf("queue(%p, %s) new page\n", this, queue->name.c_str()); return map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - pageID = newPageID; - index = 0; - page = queue->pager->newPageBuffer(); + auto p = raw(); + p->next = newPageID; + writePage(); ++queue->numPages; + pageID = newPageID; + initNewPageBuffer(); return Void(); }); } - T & getItem() const { - return ((RawPage *)(page->begin()))->at(index); - } - bool operator== (const Cursor &rhs) { return pageID == rhs.pageID && index == rhs.index; } + bool empty() { + return raw()->count == 0; + } + void writePage() { - // If the page isn't loaded yet then there can't possibly be anything new to write - if(loading.isReady()) { - queue->pager->updatePage(pageID, page); - } + // Pages are never written after being read, so if the write cursor is not + // ready then it is getting a 
new page ID which must be written to the next + // page ID of the page behind it. + ASSERT(loading.isReady()); + queue->pager->updatePage(pageID, page); } ACTOR static Future waitThenWriteNext(Cursor *self, T item) { @@ -157,10 +190,12 @@ public: Future writeNext(const T &item) { // If the cursor is loaded already, write the item and move to the next slot if(loading.isReady()) { - getItem() = item; + auto p = raw(); + p->at(index) = item; + ++p->count; ++queue->numEntries; ++index; - if(this->end()) { + if(index == queue->itemsPerPage) { this->loading = newPage(); } return Void(); @@ -177,20 +212,31 @@ public: // Read and moved past the next item if it is < upperBound Future> moveNext(const Optional &upperBound = {}) { + // If loading is not valid then this page cannot be loaded now so return nothing + if(!loading.isValid()) { + return Optional(); + } + + // If loading is ready, read an item and move forward if(loading.isReady()) { - if(upperBound.present() && getItem() >= upperBound.get()) { + auto p = raw(); + if(upperBound.present() && p->at(index) >= upperBound.get()) { return Optional(); } - T result = getItem(); + T result = p->at(index); --queue->numEntries; ++index; // If this page is out of items, start reading the next one - if(end()) { - loading = loadPage(((RawPage *)page->begin())->next, 0); + if(index == p->count) { + queue->pager->freePage(pageID); + pageID = p->next; + index = 0; --queue->numPages; + loading = (pageID == endPageID) ? 
Future() : loadPage(); } + return Optional(result); } @@ -206,44 +252,43 @@ public: void operator=(const FIFOQueue &rhs) = delete; // Create a new queue at newPageID - void init(IPager2 *p, LogicalPageID newPageID, std::string queueName) { + void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { debug_printf("FIFOQueue::init(%p, %s) from page id %u\n", this, name.c_str(), newPageID); pager = p; name = queueName; numPages = 1; numEntries = 0; - tail.initNew(this, newPageID); - head.initExisting(this, tail.pageID, tail.index); - stop.initExisting(this, tail.pageID, tail.index); + itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); + tail.initWrite(this, newPageID); + head.initRead(this, newPageID, 0, newPageID); ASSERT(flush().isReady()); } // Load an existing queue from its queue state - void init(IPager2 *p, const QueueState &qs, std::string queueName) { + void recover(IPager2 *p, const QueueState &qs, std::string queueName) { debug_printf("FIFOQueue::init(%p, %s) from queue state %u\n", this, name.c_str(), qs.toString().c_str()); pager = p; this->name = name; name = queueName; numPages = qs.numPages; numEntries = qs.numEntries; - head.initExisting(this, qs.headPageID, qs.headIndex); - tail.initExisting(this, qs.tailPageID, qs.tailIndex); - stop.initExisting(this, qs.tailPageID, qs.tailIndex); + itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); + tail.initWrite(this, qs.tailPageID); + head.initRead(this, qs.headPageID, qs.headIndex, qs.tailPageID); ASSERT(flush().isReady()); } Future> pop(Optional upperBound = {}) { - if(head == stop) { - return Optional(); - } return head.moveNext(upperBound); } QueueState getState() const { + // It only makes sense to save queue state when the tail cursor points to a new empty page + ASSERT(tail.index == 0); + QueueState s; s.headIndex = head.index; s.headPageID = head.pageID; - s.tailIndex = tail.index; s.tailPageID = 
tail.pageID; s.numEntries = numEntries; s.numPages = numPages; @@ -251,12 +296,10 @@ public: } ACTOR static Future writeActor(FIFOQueue *self, FutureStream queue) { - state bool modified = false; try { loop { state T item = waitNext(queue); wait(self->tail.writeNext(item)); - modified = true; } } catch(Error &e) { @@ -265,11 +308,14 @@ public: } } - if(modified) { - self->tail.writePage(); - self->stop.initExisting(self, self->tail.pageID, self->tail.index); + wait(self->tail.ready()); + + if(!self->tail.empty()) { + wait(self->tail.newPage()); } + self->head.setEnd(self->tail); + return self->getState(); } @@ -299,9 +345,9 @@ public: PromiseStream writeQueue; Future writer; - // Invariant: head <= stop <= tail + // Head points to the next location to read Cursor head; - Cursor stop; + // Tail points to the next location to write Cursor tail; // For debugging @@ -495,7 +541,7 @@ public: .detail("DesiredPageSize", self->desiredPageSize); } - self->freeList.init(self, self->pHeader->freeList, "FreeListRecovered"); + self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); } else { debug_printf("File does not exist, creating header page: %s\n", self->filename.c_str()); @@ -518,7 +564,7 @@ public: self->pHeader->pageCount = 2; // Create a new free list at page 1 - self->freeList.init(self, 1, "FreeListNew"); + self->freeList.create(self, 1, "FreeListNew"); // Clear remaining bytes of header memset(self->headerPage->mutate() + self->pHeader->size(), 0, self->headerPage->size() - self->pHeader->size()); @@ -2053,7 +2099,7 @@ public: LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); debug_printf("new lazy delete queue page %u\n", newQueuePage); - self->m_lazyDeleteQueue.init(self->m_pager, newQueuePage, "LazyDeleteQueueNew"); + self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueueNew"); self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); self->m_pager->setMetaKey(self->m_header.asKeyRef()); 
wait(self->m_pager->commit()); @@ -2061,7 +2107,7 @@ public: } else { self->m_header.fromKeyRef(meta); - self->m_lazyDeleteQueue.init(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); + self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); } self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5); self->m_lastCommittedVersion = latest; From 537b8dc7ace4965014d18805f2c999c0f3bd8c7a Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 14 Aug 2019 03:01:46 -0700 Subject: [PATCH 012/184] Bug fix, COWPager failed to reopen a created but unsync'd pager file. Added proper checksum error handling. --- fdbserver/VersionedBTree.actor.cpp | 35 ++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f830f0939d..5c67fbcb2c 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -525,9 +525,16 @@ public: // Header page is always treated as having a page size of smallestPhysicalBlock self->setPageSize(smallestPhysicalBlock); + state int64_t fileSize = 0; if(exists) { - debug_printf("File exists, reading header: %s\n", self->filename.c_str()); + wait(store(fileSize, self->pageFile->size())); + } + + debug_printf("COWPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); + + if(exists && fileSize >= self->smallestPhysicalBlock) { + debug_printf("COWPager(%s) recovering using existing file\n"); // Read physical page 0 directly wait(store(self->headerPage, self->readPhysicalPage(self, 0))); @@ -544,7 +551,7 @@ public: self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); } else { - debug_printf("File does not exist, creating header page: %s\n", self->filename.c_str()); + debug_printf("COWPager(%s) creating new pager\n"); self->headerPage = self->newPageBuffer(); 
self->pHeader = (Header *)self->headerPage->begin(); @@ -575,7 +582,7 @@ public: self->lastCommittedVersion = self->pHeader->committedVersion; self->lastCommittedMeta = self->pHeader->getMetaKey(); - debug_printf("Recovered %s\n", self->filename.c_str()); + debug_printf("COWPager(%s) recovered. LogicalPageSize=%d PhysicalPageSize=%d\n", self->filename.c_str(), self->logicalPageSize, self->physicalPageSize); return Void(); } @@ -620,7 +627,7 @@ public: void updatePage(LogicalPageID pageID, Reference data) { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager op=write id=%u cached=%d reading=%d writing=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=write id=%u cached=%d reading=%d writing=%d\n", filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content in the cache entry when the write is launched, not when it is completed. 
@@ -668,16 +675,28 @@ public: ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { state Reference page = self->newPageBuffer(); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); - debug_printf("op=read_complete id=%u bytes=%d\n", pageID, readBytes); + debug_printf("COWPager(%s) op=read_complete id=%u bytes=%d\n", self->filename.c_str(), pageID, readBytes); ASSERT(readBytes == self->physicalPageSize); - ASSERT(((Page *)page.getPtr())->verifyChecksum(pageID)); + Page *p = (Page *)page.getPtr(); + if(!p->verifyChecksum(pageID)) { + Error e = checksum_failed(); + TraceEvent(SevError, "COWPagerChecksumFailed") + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + .detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize) + .detail("CalculatedChecksum", p->calculateChecksum(pageID)) + .detail("ChecksumInPage", p->getChecksum()) + .error(e); + throw e; + } return page; } // Reads the most recent version of pageID either committed or written using updatePage() Future> readPage(LogicalPageID pageID) { PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager op=read id=%u cached=%d reading=%d writing=%d\n", pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=read id=%u cached=%d reading=%d writing=%d\n", filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); if(!cacheEntry.page.isValid()) { cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); @@ -698,10 +717,12 @@ public: // Sync everything except the header wait(self->pageFile->sync()); + debug_printf("COWPager(%s) commit sync 1\n", self->filename.c_str()); // Update header on disk and sync again. 
wait(self->writePhysicalPage(0, self->headerPage)); wait(self->pageFile->sync()); + debug_printf("COWPager(%s) commit sync 2\n", self->filename.c_str()); // Update last committed state for use in creating snapshots at current version. self->lastCommittedVersion = self->pHeader->committedVersion; From af14bfc2551952098915c9416fb16bcc76702aa1 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 14 Aug 2019 03:05:37 -0700 Subject: [PATCH 013/184] Changed COWPager page cache size argument to bytes instead of pages and changed initialization to use appropriate knobs in simulation. --- fdbserver/VersionedBTree.actor.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 5c67fbcb2c..b79af597d4 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -421,7 +421,7 @@ private: template class ObjectCache { public: - ObjectCache(int sizeLimit) : sizeLimit(sizeLimit) { + ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit) { } // Get the object for i or create a new one. @@ -496,7 +496,11 @@ public: typedef FIFOQueue LogicalPageQueueT; // If the file already exists, pageSize might be different than desiredPageSize - COWPager(int desiredPageSize, std::string filename, int cachedPageLimit) : desiredPageSize(desiredPageSize), filename(filename), pageCache(cachedPageLimit), pHeader(nullptr) { + // Use pageCacheSizeBytes == 0 for default + COWPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { + if(pageCacheBytes == 0) { + pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? 
FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) : FLOW_KNOBS->PAGE_CACHE_4K; + } commitFuture = Void(); recoverFuture = forwardError(recover(this), errorPromise); } @@ -579,6 +583,7 @@ public: wait(self->commit()); } + self->pageCache = PageCacheT(self->pageCacheBytes / self->physicalPageSize); self->lastCommittedVersion = self->pHeader->committedVersion; self->lastCommittedMeta = self->pHeader->getMetaKey(); @@ -868,6 +873,8 @@ private: int physicalPageSize; int logicalPageSize; // In simulation testing it can be useful to use a small logical page size + int64_t pageCacheBytes; + // The header will be written to / read from disk as a smallestPhysicalBlock sized chunk. Reference headerPage; Header *pHeader; @@ -879,7 +886,8 @@ private: std::string filename; - ObjectCache pageCache; + typedef ObjectCache PageCacheT; + PageCacheT pageCache; Promise closedPromise; Promise errorPromise; @@ -3718,8 +3726,7 @@ class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - int pageSize = 4096; - IPager2 *pager = new COWPager(4096, filePrefix, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + IPager2 *pager = new COWPager(4096, filePrefix, 0); m_tree = new VersionedBTree(pager, filePrefix, true); m_init = catchError(init_impl(this)); } @@ -4697,7 +4704,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("Initializing...\n"); state double startTime = timer(); - pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + pager = new COWPager(pageSize, pagerFile, 0); state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -4869,7 +4876,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(closedFuture); debug_printf("Reopening btree\n"); - IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K 
/ pageSize); + IPager2 *pager = new COWPager(pageSize, pagerFile, 0); btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -4932,7 +4939,7 @@ TEST_CASE("!/redwood/correctness/pager/cow") { deleteFile(pagerFile); int pageSize = 4096; - state IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + state IPager2 *pager = new COWPager(pageSize, pagerFile, 0); wait(success(pager->getLatestVersion())); state LogicalPageID id = wait(pager->newPageID()); From 8c0b9b5111aecc7ee8fb80bf52688906c20f87e7 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 14 Aug 2019 04:41:12 -0700 Subject: [PATCH 014/184] COWPager now uses Page 1 as a write-ahead copy of the header which is written and sync'd before modifying Page 0. --- fdbserver/VersionedBTree.actor.cpp | 62 ++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b79af597d4..38b8232d74 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -212,7 +212,7 @@ public: // Read and moved past the next item if it is < upperBound Future> moveNext(const Optional &upperBound = {}) { - // If loading is not valid then this page cannot be loaded now so return nothing + // If loading is not valid then either the cursor is not initialized or it points to a page not yet durable. if(!loading.isValid()) { return Optional(); } @@ -536,12 +536,36 @@ public: } debug_printf("COWPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); + // TODO: If the file exists but appears to never have been successfully committed is this an error or + // should recovery proceed with a new pager instance? 
- if(exists && fileSize >= self->smallestPhysicalBlock) { + // If there are at least 2 pages then try to recover the existing file + if(exists && fileSize >= (self->smallestPhysicalBlock * 2)) { debug_printf("COWPager(%s) recovering using existing file\n"); - // Read physical page 0 directly - wait(store(self->headerPage, self->readPhysicalPage(self, 0))); + state bool recoveredHeader = false; + + // Read physical page 0 directly, checksum not required + wait(store(self->headerPage, self->readPhysicalPage(self, 0, false))); + + // If the checksum fails for the header page, try to recover it from page 1 + if(!self->headerPage.castTo()->verifyChecksum(0)) { + TraceEvent(SevWarn, "COWPagerRecoveringHeader").detail("Filename", self->filename); + + wait(store(self->headerPage, self->readPhysicalPage(self, 1, false))); + + if(!self->headerPage.castTo()->verifyChecksum(0)) { + if(g_network->isSimulated()) { + // TODO: Detect if process is being restarted and only throw injected if so? + throw io_error().asInjectedFault(); + } + + TraceEvent(SevError, "COWPagerRecoveryFailed").detail("Filename", self->filename); + throw io_error(); + } + recoveredHeader = true; + } + self->pHeader = (Header *)self->headerPage->begin(); self->setPageSize(self->pHeader->pageSize); @@ -553,6 +577,19 @@ public: } self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); + + // If the header was recovered from Page 1 then write and sync it to Page 0 before continuing. 
+ if(recoveredHeader) { + // Write the header to page 0 + wait(self->writePhysicalPage(0, self->headerPage)); + + // Wait for all outstanding writes to complete + wait(self->writes.signalAndCollapse()); + + // Sync header + wait(self->pageFile->sync()); + debug_printf("COWPager(%s) Header recovery complete.\n", self->filename.c_str()); + } } else { debug_printf("COWPager(%s) creating new pager\n"); @@ -569,13 +606,13 @@ public: // No meta key until a user sets one and commits self->pHeader->setMetaKey(Key()); - // There will be 2 page IDs in use - // Page 0 will be the header - // Page 1 will be the empty free list queue, which won't actually be written to the file as the page has no content + // There are 2 reserved pages: + // Page 0 - header + // Page 1 - header write-ahead "log" self->pHeader->pageCount = 2; - // Create a new free list at page 1 - self->freeList.create(self, 1, "FreeListNew"); + // Create a new free list + self->freeList.create(self, self->newPageID().get(), "FreeListNew"); // Clear remaining bytes of header memset(self->headerPage->mutate() + self->pHeader->size(), 0, self->headerPage->size() - self->pHeader->size()); @@ -677,13 +714,13 @@ public: freeList.push(pageID); }; - ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { + ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID, bool verifyChecksum = true) { state Reference page = self->newPageBuffer(); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); debug_printf("COWPager(%s) op=read_complete id=%u bytes=%d\n", self->filename.c_str(), pageID, readBytes); ASSERT(readBytes == self->physicalPageSize); Page *p = (Page *)page.getPtr(); - if(!p->verifyChecksum(pageID)) { + if(verifyChecksum && !p->verifyChecksum(pageID)) { Error e = checksum_failed(); TraceEvent(SevError, "COWPagerChecksumFailed") .detail("Filename", self->filename.c_str()) @@ -717,6 +754,9 @@ public: // 
Flush the free list queue to the pager wait(store(self->pHeader->freeList, self->freeList.flush())); + // Write the header write-ahead "log" at Page 1 + wait(self->writePhysicalPage(1, self->headerPage)); + // Wait for all outstanding writes to complete wait(self->writes.signalAndCollapse()); From 95c80040496ab2856fe09318eb601c0b89cdae2b Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 14 Aug 2019 05:22:08 -0700 Subject: [PATCH 015/184] Bug fixes. COWPager header recovery was using the wrong checksum input and did not work for physical page sizes other than 4k. --- fdbserver/VersionedBTree.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 38b8232d74..89918bb6bb 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -549,12 +549,12 @@ public: wait(store(self->headerPage, self->readPhysicalPage(self, 0, false))); // If the checksum fails for the header page, try to recover it from page 1 - if(!self->headerPage.castTo()->verifyChecksum(0)) { + if(BUGGIFY || !self->headerPage.castTo()->verifyChecksum(0)) { TraceEvent(SevWarn, "COWPagerRecoveringHeader").detail("Filename", self->filename); wait(store(self->headerPage, self->readPhysicalPage(self, 1, false))); - if(!self->headerPage.castTo()->verifyChecksum(0)) { + if(!self->headerPage.castTo()->verifyChecksum(1)) { if(g_network->isSimulated()) { // TODO: Detect if process is being restarted and only throw injected if so? throw io_error().asInjectedFault(); @@ -662,7 +662,7 @@ public: Future writePhysicalPage(PhysicalPageID pageID, Reference page) { ((Page *)page.getPtr())->updateChecksum(pageID); - int physicalSize = (pageID == 0) ? smallestPhysicalBlock : physicalPageSize; + int physicalSize = (pageID == 0 || pageID == 1) ? 
smallestPhysicalBlock : physicalPageSize; return holdWhile(page, pageFile->write(page->begin(), physicalSize, (int64_t)pageID * physicalSize)); } From 65ddae13739542b2c561c033cc3cbc6dc450e96b Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 15 Aug 2019 15:44:54 -0700 Subject: [PATCH 016/184] Bug fix, ObjectCache could evict the object it just added and then return an invalid reference to it. --- fdbserver/VersionedBTree.actor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 89918bb6bb..3e7910a2fa 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -444,7 +444,8 @@ public: // If the cache is too big, try to evict the first Entry in the eviction order if(cache.size() > sizeLimit) { Entry &toEvict = evictionOrder.front(); - if(toEvict.item.evictable()) { + // Don't evict the entry that was just added as then we can't return a reference to it. + if(toEvict.index != index && toEvict.item.evictable()) { evictionOrder.pop_front(); cache.erase(toEvict.index); } From ca118459346a19c7863937cdc3e72f8bf35865bd Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 15 Aug 2019 15:49:18 -0700 Subject: [PATCH 017/184] Debug output tweaks. 
--- fdbserver/VersionedBTree.actor.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 3e7910a2fa..f8d6efdd1b 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -144,7 +144,7 @@ public: } Future loadPage() { - debug_printf("queue(%p, %s) loading page %u index %d\n", this, queue->name.c_str(), pageID, index); + debug_printf("queue(%p, %s) loading page id=%u index=%d\n", this, queue->name.c_str(), pageID, index); return map(queue->pager->readPage(pageID), [=](Reference p) { page = p; return Void(); @@ -153,8 +153,8 @@ public: Future newPage() { ASSERT(page); - debug_printf("queue(%p, %s) new page\n", this, queue->name.c_str()); return map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { + debug_printf("queue(%p, %s) new page id=%u\n", this, queue->name.c_str(), newPageID); auto p = raw(); p->next = newPageID; writePage(); @@ -177,6 +177,7 @@ public: // Pages are never written after being read, so if the write cursor is not // ready then it is getting a new page ID which must be written to the next // page ID of the page behind it. + debug_printf("queue(%p, %s) write page id=%u\n", this, queue->name.c_str(), pageID); ASSERT(loading.isReady()); queue->pager->updatePage(pageID, page); } @@ -662,6 +663,7 @@ public: }; Future writePhysicalPage(PhysicalPageID pageID, Reference page) { + debug_printf("COWPager(%s) op=write id=%u\n", filename.c_str(), pageID); ((Page *)page.getPtr())->updateChecksum(pageID); int physicalSize = (pageID == 0 || pageID == 1) ? 
smallestPhysicalBlock : physicalPageSize; return holdWhile(page, pageFile->write(page->begin(), physicalSize, (int64_t)pageID * physicalSize)); @@ -722,6 +724,7 @@ public: ASSERT(readBytes == self->physicalPageSize); Page *p = (Page *)page.getPtr(); if(verifyChecksum && !p->verifyChecksum(pageID)) { + debug_printf("COWPager(%s) checksum failed id=%u\n", self->filename.c_str(), pageID); Error e = checksum_failed(); TraceEvent(SevError, "COWPagerChecksumFailed") .detail("Filename", self->filename.c_str()) @@ -2159,7 +2162,7 @@ public: state Key meta = self->m_pager->getMetaKey(); if(meta.size() == 0) { LogicalPageID newRoot = wait(self->m_pager->newPageID()); - debug_printf("new root page %u\n", newRoot); + debug_printf("new root page id=%u\n", newRoot); self->m_header.root = newRoot; ++latest; Reference page = self->m_pager->newPageBuffer(); @@ -2168,7 +2171,7 @@ public: self->m_pager->setVersion(latest); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); - debug_printf("new lazy delete queue page %u\n", newQueuePage); + debug_printf("new lazy delete queue page id=%u\n", newQueuePage); self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueueNew"); self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); self->m_pager->setMetaKey(self->m_header.asKeyRef()); From 61054492b624a346c363fac635210b34e373f842 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 16 Aug 2019 03:24:55 -0700 Subject: [PATCH 018/184] Bug fix in the design of the COWPager commit sequence. Page 1 is now used to store a copy of the previous committed header rather than the new one, as recovering to an unsync'd new header from Page 1 is incorrect behavior since other pager writes may not have made it to disk. Also fixed header page size handling which would write unusable backup headers when using >4k pages. 
--- fdbserver/VersionedBTree.actor.cpp | 125 +++++++++++++++++++---------- 1 file changed, 82 insertions(+), 43 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f8d6efdd1b..914882d176 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -63,7 +63,7 @@ public: int64_t numPages; int64_t numEntries; std::string toString() const { - return format("head: %u:%d tail: %u:%d numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, numPages, numEntries); + return format("head: %u:%d tail: %u numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, numPages, numEntries); } }; #pragma pack(pop) @@ -90,7 +90,7 @@ public: // Point cursor to a page which has never been written before, allocate // a page buffer and initialize it void initWrite(FIFOQueue *q, LogicalPageID newPageID) { - debug_printf("New queue cursor at page id=%u write=%d\n", newPageID, write); + debug_printf("FIFOQueue(%s): New write queue cursor at page id=%u\n", q->name.c_str(), newPageID); queue = q; pageID = newPageID; initNewPageBuffer(); @@ -98,7 +98,7 @@ public: // Point cursor to a page to read from. Begin loading the page if beginLoad is set. 
void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID endPageID) { - debug_printf("Loading queue cursor at page id=%u index=%d\n", p, i); + debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u index=%d end page id=%u\n", q->name.c_str(), p, i, endPageID); queue = q; pageID = p; index = i; @@ -144,7 +144,7 @@ public: } Future loadPage() { - debug_printf("queue(%p, %s) loading page id=%u index=%d\n", this, queue->name.c_str(), pageID, index); + debug_printf("FIFOQueue(%s): loading page id=%u index=%d\n", queue->name.c_str(), pageID, index); return map(queue->pager->readPage(pageID), [=](Reference p) { page = p; return Void(); @@ -154,7 +154,7 @@ public: Future newPage() { ASSERT(page); return map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - debug_printf("queue(%p, %s) new page id=%u\n", this, queue->name.c_str(), newPageID); + debug_printf("FIFOQueue(%s): new page id=%u\n", queue->name.c_str(), newPageID); auto p = raw(); p->next = newPageID; writePage(); @@ -177,7 +177,7 @@ public: // Pages are never written after being read, so if the write cursor is not // ready then it is getting a new page ID which must be written to the next // page ID of the page behind it. 
- debug_printf("queue(%p, %s) write page id=%u\n", this, queue->name.c_str(), pageID); + debug_printf("FIFOQueue(%s): write page id=%u\n", queue->name.c_str(), pageID); ASSERT(loading.isReady()); queue->pager->updatePage(pageID, page); } @@ -254,7 +254,7 @@ public: // Create a new queue at newPageID void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { - debug_printf("FIFOQueue::init(%p, %s) from page id %u\n", this, name.c_str(), newPageID); + debug_printf("FIFOQueue(%s): create from page id %u\n", queueName.c_str(), newPageID); pager = p; name = queueName; numPages = 1; @@ -267,9 +267,8 @@ public: // Load an existing queue from its queue state void recover(IPager2 *p, const QueueState &qs, std::string queueName) { - debug_printf("FIFOQueue::init(%p, %s) from queue state %u\n", this, name.c_str(), qs.toString().c_str()); + debug_printf("FIFOQueue(%s): recover from queue state %s\n", queueName.c_str(), qs.toString().c_str()); pager = p; - this->name = name; name = queueName; numPages = qs.numPages; numEntries = qs.numEntries; @@ -293,6 +292,8 @@ public: s.tailPageID = tail.pageID; s.numEntries = numEntries; s.numPages = numPages; + + debug_printf("FIFOQueue(%s): getState(): %s\n", name.c_str(), s.toString().c_str()); return s; } @@ -326,13 +327,13 @@ public: // Flush changes to the pager and return the resulting queue state. 
Future flush() { - debug_printf("FIFOQueue::flush %p %s\n", this, name.c_str()); + debug_printf("FIFOQueue(%s): flush\n", name.c_str()); Future oldWriter = writer; writeQueue.sendError(end_of_stream()); writeQueue = PromiseStream(); writer = writeActor(this, writeQueue.getFuture()); if(!oldWriter.isValid()) { - debug_printf("FIFOQueue::flush %p oldwriter not valid %s\n", this, name.c_str()); + debug_printf("FIFOQueue(%s): flush, oldwriter not valid\n", name.c_str()); return getState(); } return oldWriter; @@ -518,6 +519,10 @@ public: } } + void updateCommittedHeader() { + memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); + } + ACTOR static Future recover(COWPager *self) { ASSERT(!self->recoverFuture.isValid()); @@ -531,8 +536,10 @@ public: // Header page is always treated as having a page size of smallestPhysicalBlock self->setPageSize(smallestPhysicalBlock); - state int64_t fileSize = 0; + self->lastCommittedHeaderPage = self->newPageBuffer(); + self->pLastCommittedHeader = (Header *)self->lastCommittedHeaderPage->begin(); + state int64_t fileSize = 0; if(exists) { wait(store(fileSize, self->pageFile->size())); } @@ -547,14 +554,14 @@ public: state bool recoveredHeader = false; - // Read physical page 0 directly, checksum not required - wait(store(self->headerPage, self->readPhysicalPage(self, 0, false))); + // Read physical page 0 directly + wait(store(self->headerPage, self->readHeaderPage(self, 0))); - // If the checksum fails for the header page, try to recover it from page 1 + // If the checksum fails for the header page, try to recover committed header backup from page 1 if(BUGGIFY || !self->headerPage.castTo()->verifyChecksum(0)) { TraceEvent(SevWarn, "COWPagerRecoveringHeader").detail("Filename", self->filename); - wait(store(self->headerPage, self->readPhysicalPage(self, 1, false))); + wait(store(self->headerPage, self->readHeaderPage(self, 1))); if(!self->headerPage.castTo()->verifyChecksum(1)) { 
if(g_network->isSimulated()) { @@ -562,8 +569,11 @@ public: throw io_error().asInjectedFault(); } - TraceEvent(SevError, "COWPagerRecoveryFailed").detail("Filename", self->filename); - throw io_error(); + Error e = checksum_failed(); + TraceEvent(SevError, "COWPagerRecoveryFailed") + .detail("Filename", self->filename) + .error(e); + throw e; } recoveredHeader = true; } @@ -580,10 +590,11 @@ public: self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); - // If the header was recovered from Page 1 then write and sync it to Page 0 before continuing. + // If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. + // If this fails, the backup header is still in tact for the next recovery attempt. if(recoveredHeader) { // Write the header to page 0 - wait(self->writePhysicalPage(0, self->headerPage)); + wait(self->writeHeaderPage(0, self->headerPage)); // Wait for all outstanding writes to complete wait(self->writes.signalAndCollapse()); @@ -592,8 +603,15 @@ public: wait(self->pageFile->sync()); debug_printf("COWPager(%s) Header recovery complete.\n", self->filename.c_str()); } + + // Update the last committed header with the one that was recovered (which is the last known committed header) + self->updateCommittedHeader(); } else { + // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully committed. + // A new pager will be created in its place. + // TODO: Is the right behavior? 
+ debug_printf("COWPager(%s) creating new pager\n"); self->headerPage = self->newPageBuffer(); @@ -610,21 +628,27 @@ public: // There are 2 reserved pages: // Page 0 - header - // Page 1 - header write-ahead "log" + // Page 1 - header backup self->pHeader->pageCount = 2; // Create a new free list self->freeList.create(self, self->newPageID().get(), "FreeListNew"); - // Clear remaining bytes of header - memset(self->headerPage->mutate() + self->pHeader->size(), 0, self->headerPage->size() - self->pHeader->size()); + // The first commit() below will flush the queue and update the queue state in the header, + // but since the queue will not be used between now and then its state will not change. + // In order to populate lastCommittedHeader, update the header now with the queue's state. + self->pHeader->freeList = self->freeList.getState(); + + // Set remaining header bytes to \xff + memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size()); + + // Since there is no previously committed header use the initial header for the initial commit. + self->updateCommittedHeader(); wait(self->commit()); } self->pageCache = PageCacheT(self->pageCacheBytes / self->physicalPageSize); - self->lastCommittedVersion = self->pHeader->committedVersion; - self->lastCommittedMeta = self->pHeader->getMetaKey(); debug_printf("COWPager(%s) recovered. 
LogicalPageSize=%d PhysicalPageSize=%d\n", self->filename.c_str(), self->logicalPageSize, self->physicalPageSize); return Void(); @@ -662,11 +686,16 @@ public: return forwardError(f, errorPromise); }; + Future writeHeaderPage(PhysicalPageID pageID, Reference page) { + debug_printf("COWPager(%s) header op=write id=%u\n", filename.c_str(), pageID); + ((Page *)page.getPtr())->updateChecksum(pageID); + return holdWhile(page, pageFile->write(page->begin(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); + } + Future writePhysicalPage(PhysicalPageID pageID, Reference page) { debug_printf("COWPager(%s) op=write id=%u\n", filename.c_str(), pageID); ((Page *)page.getPtr())->updateChecksum(pageID); - int physicalSize = (pageID == 0 || pageID == 1) ? smallestPhysicalBlock : physicalPageSize; - return holdWhile(page, pageFile->write(page->begin(), physicalSize, (int64_t)pageID * physicalSize)); + return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, (int64_t)pageID * physicalPageSize)); } void updatePage(LogicalPageID pageID, Reference data) { @@ -717,13 +746,24 @@ public: freeList.push(pageID); }; - ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID, bool verifyChecksum = true) { + // Header pages use a page size of smallestPhysicalBlock + // If the user chosen physical page size is larger, then there will be a gap of unused space after + // between the end of page 1 and the start of page 2. 
+ ACTOR static Future> readHeaderPage(COWPager *self, PhysicalPageID pageID) { + state Reference page(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)); + int readBytes = wait(self->pageFile->read(page->mutate(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); + debug_printf("COWPager(%s) header op=read_complete id=%u bytes=%d\n", self->filename.c_str(), pageID, readBytes); + ASSERT(readBytes == smallestPhysicalBlock); + return page; + } + + ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { state Reference page = self->newPageBuffer(); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); debug_printf("COWPager(%s) op=read_complete id=%u bytes=%d\n", self->filename.c_str(), pageID, readBytes); ASSERT(readBytes == self->physicalPageSize); Page *p = (Page *)page.getPtr(); - if(verifyChecksum && !p->verifyChecksum(pageID)) { + if(!p->verifyChecksum(pageID)) { debug_printf("COWPager(%s) checksum failed id=%u\n", self->filename.c_str(), pageID); Error e = checksum_failed(); TraceEvent(SevError, "COWPagerChecksumFailed") @@ -755,11 +795,11 @@ public: Reference getReadSnapshot(); ACTOR static Future commit_impl(COWPager *self) { - // Flush the free list queue to the pager - wait(store(self->pHeader->freeList, self->freeList.flush())); + // Write old committed header to Page 1 + self->writes.add(forwardError(self->writeHeaderPage(1, self->lastCommittedHeaderPage), self->errorPromise)); - // Write the header write-ahead "log" at Page 1 - wait(self->writePhysicalPage(1, self->headerPage)); + // Flush the free list queue to the pager and get the new queue state into the header + wait(store(self->pHeader->freeList, self->freeList.flush())); // Wait for all outstanding writes to complete wait(self->writes.signalAndCollapse()); @@ -769,13 +809,12 @@ public: debug_printf("COWPager(%s) commit sync 1\n", self->filename.c_str()); // Update header 
on disk and sync again. - wait(self->writePhysicalPage(0, self->headerPage)); + wait(self->writeHeaderPage(0, self->headerPage)); wait(self->pageFile->sync()); debug_printf("COWPager(%s) commit sync 2\n", self->filename.c_str()); - // Update last committed state for use in creating snapshots at current version. - self->lastCommittedVersion = self->pHeader->committedVersion; - self->lastCommittedMeta = self->pHeader->getMetaKey(); + // Update the last committed header for use in the next commit. + self->updateCommittedHeader(); return Void(); } @@ -851,7 +890,7 @@ public: Future getLatestVersion() { return map(recoverFuture, [=](Void) { - return lastCommittedVersion; + return pLastCommittedHeader->committedVersion; }); } @@ -925,8 +964,8 @@ private: int desiredPageSize; - Version lastCommittedVersion; - Key lastCommittedMeta; + Reference lastCommittedHeaderPage; + Header *pLastCommittedHeader; std::string filename; @@ -983,8 +1022,8 @@ private: }; Reference COWPager::getReadSnapshot() { - ++snapshotsInUse[lastCommittedVersion]; - return Reference(new COWPagerSnapshot(this, lastCommittedMeta, lastCommittedVersion)); + ++snapshotsInUse[pLastCommittedHeader->committedVersion]; + return Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion)); } // TODO: Move this to a flow header once it is mature. @@ -2558,7 +2597,7 @@ private: } ACTOR static Future buildNewRoot(VersionedBTree *self, Version version, std::vector *pages, std::vector *logicalPageIDs, BTreePage *pPage) { - debug_printf("buildNewRoot start version %" PRId64 ", %lu pages %s\n", version, pages->size()); + debug_printf("buildNewRoot start version %" PRId64 ", %lu pages\n", version, pages->size()); // While there are multiple child pages for this version we must write new tree levels. 
while(pages->size() > 1) { From 8d2d1f4f24c71107b9f4bb4d284297c363d6d9fa Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 16 Aug 2019 04:17:29 -0700 Subject: [PATCH 019/184] Bug fix, COWPager recovery can't simulate header read failure using buggify anymore because the backup header is now a previous version and it is invalid to not recover with an fsync'd latest header. Debug output improvements. --- fdbserver/VersionedBTree.actor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 914882d176..a6080d0016 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -558,7 +558,7 @@ public: wait(store(self->headerPage, self->readHeaderPage(self, 0))); // If the checksum fails for the header page, try to recover committed header backup from page 1 - if(BUGGIFY || !self->headerPage.castTo()->verifyChecksum(0)) { + if(!self->headerPage.castTo()->verifyChecksum(0)) { TraceEvent(SevWarn, "COWPagerRecoveringHeader").detail("Filename", self->filename); wait(store(self->headerPage, self->readHeaderPage(self, 1))); @@ -650,7 +650,7 @@ public: self->pageCache = PageCacheT(self->pageCacheBytes / self->physicalPageSize); - debug_printf("COWPager(%s) recovered. LogicalPageSize=%d PhysicalPageSize=%d\n", self->filename.c_str(), self->logicalPageSize, self->physicalPageSize); + debug_printf("COWPager(%s) recovered. 
committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); return Void(); } @@ -796,7 +796,7 @@ public: ACTOR static Future commit_impl(COWPager *self) { // Write old committed header to Page 1 - self->writes.add(forwardError(self->writeHeaderPage(1, self->lastCommittedHeaderPage), self->errorPromise)); + self->writes.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); // Flush the free list queue to the pager and get the new queue state into the header wait(store(self->pHeader->freeList, self->freeList.flush())); @@ -806,12 +806,12 @@ public: // Sync everything except the header wait(self->pageFile->sync()); - debug_printf("COWPager(%s) commit sync 1\n", self->filename.c_str()); + debug_printf("COWPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); // Update header on disk and sync again. wait(self->writeHeaderPage(0, self->headerPage)); wait(self->pageFile->sync()); - debug_printf("COWPager(%s) commit sync 2\n", self->filename.c_str()); + debug_printf("COWPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); // Update the last committed header for use in the next commit. self->updateCommittedHeader(); From 1882b58d21211ca71d4bf462d59f959c6781f414 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 17 Aug 2019 05:21:54 -0700 Subject: [PATCH 020/184] COWPager dispose() was not deleting the page file. 
--- fdbserver/VersionedBTree.actor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a6080d0016..a30f213a9a 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -854,6 +854,10 @@ public: self->pageFile.clear(); + if(dispose) { + wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true)); + } + self->closedPromise.send(Void()); delete self; } From 1bb323fa8c921e87bbe91022c3c2f99788638b8d Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 18 Aug 2019 09:24:30 -0700 Subject: [PATCH 021/184] Bug fix in FIFOQueue pop() when freeing an exhausted page causes a recursive pop() from the same queue, which happens when the queue is the freelist itself and the write cursor is also at the end of its page. --- fdbserver/VersionedBTree.actor.cpp | 47 ++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a30f213a9a..3232c801de 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -97,11 +97,12 @@ public: } // Point cursor to a page to read from. Begin loading the page if beginLoad is set. - void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID endPageID) { - debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u index=%d end page id=%u\n", q->name.c_str(), p, i, endPageID); + void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID end) { + debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u index=%d end page id=%u\n", q->name.c_str(), p, i, end); queue = q; pageID = p; index = i; + endPageID = end; // If cursor is not pointed at the end page then start loading it. // The end page will not have been written to disk yet. 
@@ -153,8 +154,10 @@ public: Future newPage() { ASSERT(page); - return map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): new page id=%u\n", queue->name.c_str(), newPageID); + ASSERT(loading.isReady()); + + loading = map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { + debug_printf("FIFOQueue(%s): new page id=%u\n", queue->name.c_str(), newPageID); auto p = raw(); p->next = newPageID; writePage(); @@ -163,6 +166,8 @@ public: initNewPageBuffer(); return Void(); }); + + return loading; } bool operator== (const Cursor &rhs) { @@ -174,11 +179,7 @@ public: } void writePage() { - // Pages are never written after being read, so if the write cursor is not - // ready then it is getting a new page ID which must be written to the next - // page ID of the page behind it. debug_printf("FIFOQueue(%s): write page id=%u\n", queue->name.c_str(), pageID); - ASSERT(loading.isReady()); queue->pager->updatePage(pageID, page); } @@ -197,7 +198,7 @@ public: ++queue->numEntries; ++index; if(index == queue->itemsPerPage) { - this->loading = newPage(); + newPage(); } return Void(); } @@ -222,20 +223,31 @@ public: if(loading.isReady()) { auto p = raw(); if(upperBound.present() && p->at(index) >= upperBound.get()) { + debug_printf("FIFOQueue(%s) pop upperbound limit exceeded\n", queue->name.c_str()); return Optional(); } + debug_printf("FIFOQueue(%s) read cursor pop from page id=%u index=%d count=%d\n", queue->name.c_str(), pageID, index, p->count); T result = p->at(index); --queue->numEntries; ++index; + debug_printf("FIFOQueue(%s) read cursor popped from page id=%u index=%d count=%d\n", queue->name.c_str(), pageID, index, p->count); // If this page is out of items, start reading the next one if(index == p->count) { - queue->pager->freePage(pageID); + LogicalPageID oldPageID = pageID; pageID = p->next; index = 0; --queue->numPages; + debug_printf("FIFOQueue(%s) advancing to next page id=%u endPageID=%u\n", queue->name.c_str(), pageID, 
endPageID); loading = (pageID == endPageID) ? Future() : loadPage(); + + // freePage() must be called after setting the loading future because freePage() might pop from this + // queue recursively if the pager's free list is being stored in this queue. + queue->pager->freePage(oldPageID); + } + else { + debug_printf("FIFOQueue(%s) index and count are not the same %d %u\n", queue->name.c_str(), index, p->count); } return Optional(result); @@ -310,12 +322,15 @@ public: } } + // Wait for tail to be ready to write to a page wait(self->tail.ready()); + // If tail page is not empty, link it to a new unwritten/empty page if(!self->tail.empty()) { wait(self->tail.newPage()); } + // After queue is flushed, head may read everything written so far (which will have been committed) self->head.setEnd(self->tail); return self->getState(); @@ -671,16 +686,24 @@ public: Future> nextPageID = freeList.pop(); if(nextPageID.isReady()) { if(nextPageID.get().present()) { + debug_printf("COWPager(%s) new page id=%u from ready freelist\n", filename.c_str(), nextPageID.get().get()); return nextPageID.get().get(); } - return ++pHeader->pageCount; + LogicalPageID id = pHeader->pageCount; + ++pHeader->pageCount; + debug_printf("COWPager(%s) new page id=%u at end of file\n", filename.c_str(), id); + return id; } Future f = map(nextPageID, [=](Optional nextPageID) { if(nextPageID.present()) { + debug_printf("COWPager(%s) new page id=%u from freelist after wait\n", filename.c_str(), nextPageID.get()); return nextPageID.get(); } - return (LogicalPageID)++(pHeader->pageCount); + LogicalPageID id = pHeader->pageCount; + ++pHeader->pageCount; + debug_printf("COWPager(%s) new page id=%u at end of file\n", filename.c_str(), id); + return id; }); return forwardError(f, errorPromise); From 5384cf8f9cc2942684bc8817a68201b305097e15 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 18 Aug 2019 22:29:24 -0700 Subject: [PATCH 022/184] Bug fixes in FIFOQueue. 
Read cursor would not start loading pages again after its end was pushed forward. Queue flushing of the free list queue would leave tail cursor in a bad state. --- fdbserver/VersionedBTree.actor.cpp | 35 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 3232c801de..a9d6ba193e 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -80,7 +80,7 @@ public: // Cursor will not read this page or anything beyond it. LogicalPageID endPageID; - Cursor() : queue(nullptr) { + Cursor() : queue(nullptr), pageID(0), endPageID(0) { } void setEnd(Cursor &end) { @@ -94,6 +94,7 @@ public: queue = q; pageID = newPageID; initNewPageBuffer(); + loading = Void(); } // Point cursor to a page to read from. Begin loading the page if beginLoad is set. @@ -115,7 +116,6 @@ public: auto p = raw(); p->next = 0; p->count = 0; - loading = Void(); } Cursor(Cursor &) = delete; @@ -125,8 +125,8 @@ public: loading.cancel(); } - Future ready() { - return loading; + Future notLoading() { + return loading.isValid() ? loading : Void(); } #pragma pack(push, 1) @@ -192,6 +192,7 @@ public: Future writeNext(const T &item) { // If the cursor is loaded already, write the item and move to the next slot if(loading.isReady()) { + debug_printf("FIFOQueue(%s): write next to %u:%d\n", queue->name.c_str(), pageID, index); auto p = raw(); p->at(index) = item; ++p->count; @@ -214,9 +215,18 @@ public: // Read and moved past the next item if it is < upperBound Future> moveNext(const Optional &upperBound = {}) { - // If loading is not valid then either the cursor is not initialized or it points to a page not yet durable. + // If loading is not valid then either the cursor is not initialized. + // It may have at one time pointed to a page not yet committed. 
if(!loading.isValid()) { - return Optional(); + // If the pageID isn't the endPageID then start loading the page + if(pageID != endPageID) { + debug_printf("FIFOQueue(%s) starting load of page id=%u which is no longer the end page id=%u\n", queue->name.c_str(), pageID, endPageID); + loading = loadPage(); + } + else { + // Otherwise we can't read anymore so return nothing + return Optional(); + } } // If loading is ready, read an item and move forward @@ -231,7 +241,6 @@ public: T result = p->at(index); --queue->numEntries; ++index; - debug_printf("FIFOQueue(%s) read cursor popped from page id=%u index=%d count=%d\n", queue->name.c_str(), pageID, index, p->count); // If this page is out of items, start reading the next one if(index == p->count) { @@ -246,9 +255,6 @@ public: // queue recursively if the pager's free list is being stored in this queue. queue->pager->freePage(oldPageID); } - else { - debug_printf("FIFOQueue(%s) index and count are not the same %d %u\n", queue->name.c_str(), index, p->count); - } return Optional(result); } @@ -322,8 +328,13 @@ public: } } - // Wait for tail to be ready to write to a page - wait(self->tail.ready()); + // Wait for the head cursor to be done loading because it might free a page, which would add to the + // free list queue, which might be this queue. + wait(self->head.notLoading()); + + // Wait for the final write to the queue to be finished, it may be waiting for a new pageID after + // filling a page to capacity. + wait(self->tail.notLoading()); // If tail page is not empty, link it to a new unwritten/empty page if(!self->tail.empty()) { From b19ef86ab9a89c19e57155e40908a634d5a3ea34 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sun, 1 Sep 2019 23:03:31 -0700 Subject: [PATCH 023/184] Pager2 interface now supports getting a read snapshot at a version and setting the oldest readable version. FIFOQueue now supports pushFront() which is needed for the BTree's incremental tree deletion process. 
--- fdbserver/DeltaTree.h | 6 +- fdbserver/IPager.h | 14 +- fdbserver/VersionedBTree.actor.cpp | 539 +++++++++++++++++++++++------ 3 files changed, 446 insertions(+), 113 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index 6797d87a77..cd6b021e6c 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -1,5 +1,5 @@ /* - * MutablePrefixTree.h + * DeltaTree.h * * This source file is part of the FoundationDB open source project * @@ -20,11 +20,11 @@ #pragma once +#include "fdbserver/PrefixTree.h" #include "flow/flow.h" #include "flow/Arena.h" #include "fdbclient/FDBTypes.h" #include "fdbserver/Knobs.h" -#include "fdbserver/PrefixTree.h" #include // Delta Tree is a memory mappable binary tree of T objects such that each node's item is @@ -209,7 +209,7 @@ public: } }; - // Cursor provides a way to seek into a PrefixTree and iterate over its contents + // Cursor provides a way to seek into a DeltaTree and iterate over its contents // All Cursors from a Reader share the same decoded node 'cache' (tree of DecodedNodes) struct Cursor { Cursor() : reader(nullptr), node(nullptr) { diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 8eb47283de..731b32cc3b 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -183,10 +183,9 @@ public: // - the most recent non-atomic write virtual Future> readPage(LogicalPageID pageID) = 0; - // Get a snapshot of the metakey and all pages as of the latest committed version. - // When a pager snapshot is created, the pager is guaraunteed to not remove or reuse any pages - // that were freed after the creation of this snapshot until the snapshot is destroyed - virtual Reference getReadSnapshot() = 0; + // Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion() + // The snapshot shall be usable until setOldVersion() is called with a version > v. 
+ virtual Reference getReadSnapshot(Version v) = 0; // Atomically make durable all pending page writes, page frees, and update the metadata string. virtual Future commit() = 0; @@ -206,6 +205,13 @@ public: // After the returned future is ready, future calls must not wait. virtual Future getLatestVersion() = 0; + // The pager can invalidate snapshots at versions < v and reuse + // any pages that were freed as of version v + virtual void setOldestVersion(Version v) = 0; + + // Get the oldest readable version + virtual Future getOldestVersion() = 0; + protected: ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface }; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index a9d6ba193e..59e6bb8746 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -42,13 +42,21 @@ #include // A FIFO queue of T stored as a linked list of pages. +// Operations are popFront(), pushBack(), and pushFront(), and flush(). +// Flush() will ensure all queue pages are written to the pager. +// popFront() will only return records that have been flushed. +// // Each page contains some number of T items and a link to the next page. -// When the queue is flushed, the final page is ended and linked to a newly allocated -// but not-yet-written-to page, which future writes after the flush will write to. +// When the queue is flushed, the last page in the chain is ended and linked to a newly allocated +// but not-yet-written-to pageID, which future writes after the flush will write to. +// Items pushed onto the front of the queue are written to a separate linked list until flushed, +// at which point that list becomes the new front of the queue. +// // Committing changes to a queue involves flushing the queue, calling fsync, and then -// writing the QueueState somewhere and making it durable. 
-// The write pattern is designed such that non-fsync'd writes are not relied on, to include -// unchanging bytes in a page that was updated but not fsync'd. +// writing the QueueState which flush() returns somewhere and making it durable. +// +// The write pattern is designed such that no written/updated yet not fsync'd page is ever +// expected to be valid. template class FIFOQueue { static_assert(std::is_trivially_copyable::value); @@ -80,24 +88,49 @@ public: // Cursor will not read this page or anything beyond it. LogicalPageID endPageID; - Cursor() : queue(nullptr), pageID(0), endPageID(0) { + Cursor() : queue(nullptr), pageID(invalidLogicalPageID), endPageID(invalidLogicalPageID) { + } + + Cursor(const Cursor &c) = delete; + + ~Cursor() { + loading.cancel(); + } + + Cursor & operator=(const Cursor &c) { + ASSERT(c.notLoading()); + pageID = c.pageID; + index = c.index; + page = c.page; + queue = c.queue; + endPageID = c.endPageID; + loading = Void(); + return *this; } void setEnd(Cursor &end) { endPageID = end.pageID; } - // Point cursor to a page which has never been written before, allocate - // a page buffer and initialize it - void initWrite(FIFOQueue *q, LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): New write queue cursor at page id=%u\n", q->name.c_str(), newPageID); + // Initializes a cursor that will write to new pages in the forward direction starting from newPageID + void initWriteTail(FIFOQueue *q, LogicalPageID newPageID) { + debug_printf("FIFOQueue(%s): New writeTail queue cursor at page id=%u\n", q->name.c_str(), newPageID); queue = q; - pageID = newPageID; - initNewPageBuffer(); + initNewTailPage(newPageID); loading = Void(); } - // Point cursor to a page to read from. Begin loading the page if beginLoad is set. + // Initializes a cursor that will write to new pages in the reverse direction, allocating pages as needed. 
+ void initWriteHead(FIFOQueue *q) { + debug_printf("FIFOQueue(%s): New writeHead queue cursor\n", q->name.c_str()); + queue = q; + // Initially the page is invalid and the index is 0 + initNewHeadPage(invalidLogicalPageID); + index = 0; + loading = Void(); + } + + // Initializes a cursor that will read in the forward direction starting from pageID p, index i up to but not touching pageID end void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID end) { debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u index=%d end page id=%u\n", q->name.c_str(), p, i, end); queue = q; @@ -110,29 +143,35 @@ public: loading = (p == endPageID) ? Future() : loadPage(); } - void initNewPageBuffer() { + void initNewTailPage(LogicalPageID newPageID) { + pageID = newPageID; index = 0; page = queue->pager->newPageBuffer(); - auto p = raw(); - p->next = 0; - p->count = 0; + setNext(0, 0); + raw()->endIndex = 0; } - Cursor(Cursor &) = delete; - void operator=(Cursor &) = delete; - - ~Cursor() { - loading.cancel(); + void initNewHeadPage(LogicalPageID newPageID) { + page = queue->pager->newPageBuffer(); + setNext(pageID, index); + raw()->endIndex = queue->itemsPerPage; + pageID = newPageID; + index = queue->itemsPerPage; } - Future notLoading() { + Future onNotLoading() const { return loading.isValid() ? 
loading : Void(); } + bool notLoading() const { + return !loading.isValid() || loading.isReady(); + } + #pragma pack(push, 1) struct RawPage { - LogicalPageID next; - uint32_t count; + LogicalPageID nextPageID; + uint16_t nextIndex; + uint16_t endIndex; inline T & at(int i) { return ((T *)(this + 1))[i]; @@ -144,6 +183,16 @@ public: return ((RawPage *)(page->begin())); } + void setNext(LogicalPageID pageID, int index) { + RawPage *p = raw(); + p->nextPageID = pageID; + p->nextIndex = index; + } + + void setNext(const Cursor &cursor) { + setNext(cursor.pageID, cursor.index); + } + Future loadPage() { debug_printf("FIFOQueue(%s): loading page id=%u index=%d\n", queue->name.c_str(), pageID, index); return map(queue->pager->readPage(pageID), [=](Reference p) { @@ -152,18 +201,36 @@ public: }); } - Future newPage() { + // Allocate a new next page for the cursor's old page to link to, write the old page, then point the cursor at the new page. + Future newTailPage() { ASSERT(page); ASSERT(loading.isReady()); loading = map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): new page id=%u\n", queue->name.c_str(), newPageID); - auto p = raw(); - p->next = newPageID; + debug_printf("FIFOQueue(%s): new tail page id=%u\n", queue->name.c_str(), newPageID); + setNext(newPageID, 0); writePage(); ++queue->numPages; - pageID = newPageID; - initNewPageBuffer(); + initNewTailPage(newPageID); + return Void(); + }); + + return loading; + } + + // Allocate a new previous page which links to the cursor's old page, write the old page if first is false, and then point the cursor at the new page. 
+ Future newHeadPage() { + ASSERT(page); + ASSERT(loading.isReady()); + + loading = map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { + debug_printf("FIFOQueue(%s): new head page id=%u\n", queue->name.c_str(), newPageID); + // Write the page if it has a valid ID and a valid nextPageID + if(pageID != invalidLogicalPageID && raw()->nextPageID != invalidLogicalPageID) { + writePage(); + } + initNewHeadPage(newPageID); + ++queue->numPages; return Void(); }); @@ -175,7 +242,7 @@ public: } bool empty() { - return raw()->count == 0; + return raw()->endIndex == 0; } void writePage() { @@ -183,28 +250,53 @@ public: queue->pager->updatePage(pageID, page); } - ACTOR static Future waitThenWriteNext(Cursor *self, T item) { + ACTOR static Future waitThenWriteTail(Cursor *self, T item) { wait(self->loading); - wait(self->writeNext(item)); + wait(self->writeTail(item)); return Void(); } - Future writeNext(const T &item) { + Future writeTail(const T &item) { // If the cursor is loaded already, write the item and move to the next slot if(loading.isReady()) { - debug_printf("FIFOQueue(%s): write next to %u:%d\n", queue->name.c_str(), pageID, index); + debug_printf("FIFOQueue(%s): writeTail to %u:%d\n", queue->name.c_str(), pageID, index); auto p = raw(); p->at(index) = item; - ++p->count; ++queue->numEntries; ++index; + p->endIndex = index; if(index == queue->itemsPerPage) { - newPage(); + newTailPage(); } return Void(); } - return waitThenWriteNext(this, item); + return waitThenWriteTail(this, item); + } + + ACTOR static Future waitThenWriteHead(Cursor *self, T item) { + wait(self->loading); + wait(self->writeHead(item)); + return Void(); + } + + Future writeHead(const T &item) { + // If the cursor is loaded already, write the item and move to the next slot + if(loading.isReady()) { + debug_printf("FIFOQueue(%s): writeHead to %u:%d\n", queue->name.c_str(), pageID, index); + if(index == 0) { + newHeadPage(); + } + else { + --index; + auto p = raw(); + p->at(index) = item; 
+ ++queue->numEntries; + return Void(); + } + } + + return waitThenWriteHead(this, item); } ACTOR static Future> waitThenMoveNext(Cursor *self, Optional upperBound) { @@ -232,21 +324,22 @@ public: // If loading is ready, read an item and move forward if(loading.isReady()) { auto p = raw(); - if(upperBound.present() && p->at(index) >= upperBound.get()) { - debug_printf("FIFOQueue(%s) pop upperbound limit exceeded\n", queue->name.c_str()); + T result = p->at(index); + + if(upperBound.present() && upperBound.get() < result) { + debug_printf("FIFOQueue(%s) read cursor page id=%u index=%d endIndex=%d exceeds upper bound\n", queue->name.c_str(), pageID, index, p->endIndex); return Optional(); } - debug_printf("FIFOQueue(%s) read cursor pop from page id=%u index=%d count=%d\n", queue->name.c_str(), pageID, index, p->count); - T result = p->at(index); + debug_printf("FIFOQueue(%s) read cursor pop from page id=%u index=%d endIndex=%d\n", queue->name.c_str(), pageID, index, p->endIndex); --queue->numEntries; ++index; // If this page is out of items, start reading the next one - if(index == p->count) { + if(index == p->endIndex) { LogicalPageID oldPageID = pageID; - pageID = p->next; - index = 0; + pageID = p->nextPageID; + index = p->nextIndex; --queue->numPages; debug_printf("FIFOQueue(%s) advancing to next page id=%u endPageID=%u\n", queue->name.c_str(), pageID, endPageID); loading = (pageID == endPageID) ? 
Future() : loadPage(); @@ -278,8 +371,8 @@ public: numPages = 1; numEntries = 0; itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); - tail.initWrite(this, newPageID); - head.initRead(this, newPageID, 0, newPageID); + tailWriter.initWriteTail(this, newPageID); + headReader.initRead(this, newPageID, 0, newPageID); ASSERT(flush().isReady()); } @@ -291,23 +384,23 @@ public: numPages = qs.numPages; numEntries = qs.numEntries; itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); - tail.initWrite(this, qs.tailPageID); - head.initRead(this, qs.headPageID, qs.headIndex, qs.tailPageID); + tailWriter.initWriteTail(this, qs.tailPageID); + headReader.initRead(this, qs.headPageID, qs.headIndex, qs.tailPageID); ASSERT(flush().isReady()); } Future> pop(Optional upperBound = {}) { - return head.moveNext(upperBound); + return headReader.moveNext(upperBound); } QueueState getState() const { // It only makes sense to save queue state when the tail cursor points to a new empty page - ASSERT(tail.index == 0); + ASSERT(tailWriter.index == 0); QueueState s; - s.headIndex = head.index; - s.headPageID = head.pageID; - s.tailPageID = tail.pageID; + s.headIndex = headReader.index; + s.headPageID = headReader.pageID; + s.tailPageID = tailWriter.pageID; s.numEntries = numEntries; s.numPages = numPages; @@ -315,11 +408,11 @@ public: return s; } - ACTOR static Future writeActor(FIFOQueue *self, FutureStream queue) { + ACTOR static Future pushBackActor(FIFOQueue *self, FutureStream input) { try { loop { - state T item = waitNext(queue); - wait(self->tail.writeNext(item)); + state T item = waitNext(input); + wait(self->tailWriter.writeTail(item)); } } catch(Error &e) { @@ -330,39 +423,118 @@ public: // Wait for the head cursor to be done loading because it might free a page, which would add to the // free list queue, which might be this queue. 
- wait(self->head.notLoading()); + wait(self->headReader.onNotLoading()); // Wait for the final write to the queue to be finished, it may be waiting for a new pageID after // filling a page to capacity. - wait(self->tail.notLoading()); + wait(self->tailWriter.onNotLoading()); // If tail page is not empty, link it to a new unwritten/empty page - if(!self->tail.empty()) { - wait(self->tail.newPage()); + if(!self->tailWriter.empty()) { + wait(self->tailWriter.newTailPage()); + } + + // We should not reach here until the pushFrontActor has already finished + ASSERT(self->pushFrontFuture.isReady()); + ASSERT(self->headWriterFront.notLoading()); + ASSERT(self->headWriterBack.notLoading()); + + // If any new pages were pushed on the front of the queue, link the tail page of the new front pages + // to the current head and write the page, then update head to point to the head of the new front pages. + if(self->headWriterBack.pageID != invalidLogicalPageID) { + self->headWriterBack.setNext(self->headReader); + self->headWriterBack.writePage(); + self->headReader = self->headWriterFront; } // After queue is flushed, head may read everything written so far (which will have been committed) - self->head.setEnd(self->tail); + self->headReader.setEnd(self->tailWriter); return self->getState(); } - void push(const T &item) { - writeQueue.send(item); + // Create pages to prepend to the front of the queue. + ACTOR static Future pushFrontActor(FIFOQueue *self, FutureStream input) { + self->headWriterFront.initWriteHead(self); + self->headWriterBack.initWriteHead(self); + + state bool first = true; + + try { + loop { + state T item = waitNext(input); + wait(self->headWriterFront.writeHead(item)); + if(first) { + self->headWriterBack = self->headWriterFront; + first = false; + } + } + } + catch(Error &e) { + if(e.code() != error_code_end_of_stream) { + throw; + } + } + + // If any items were written, then at least one page was written. 
+ if(!first) { + // If the head is on a different page than the tail then write the head page + if(self->headWriterFront.pageID != self->headWriterBack.pageID) { + self->headWriterFront.writePage(); + } + } + + return Void(); + } + + void pushBack(const T &item) { + debug_printf("FIFOQueue(%s): pushBack\n", name.c_str()); + pushBackQueue.send(item); + } + + void pushFront(const T &item) { + debug_printf("FIFOQueue(%s): pushFront\n", name.c_str()); + pushFrontQueue.send(item); } // Flush changes to the pager and return the resulting queue state. - Future flush() { - debug_printf("FIFOQueue(%s): flush\n", name.c_str()); - Future oldWriter = writer; - writeQueue.sendError(end_of_stream()); - writeQueue = PromiseStream(); - writer = writeActor(this, writeQueue.getFuture()); - if(!oldWriter.isValid()) { - debug_printf("FIFOQueue(%s): flush, oldwriter not valid\n", name.c_str()); - return getState(); + ACTOR static Future flush_impl(FIFOQueue *self) { + debug_printf("FIFOQueue(%s): flush\n", self->name.c_str()); + + // Signal head writer to flush and wait for it + // This must be done first in case this queue is the freelist itself, since + // flushing the head writer might require getting a new pageID. 
+ if(self->pushFrontFuture.isValid()) { + debug_printf("FIFOQueue(%s): headWriter valid\n", self->name.c_str()); + self->pushFrontQueue.sendError(end_of_stream()); + wait(self->pushFrontFuture); } - return oldWriter; + + state QueueState qstate; + + // Signal tail writer to flush and wait for it + if(self->pushBackFuture.isValid()) { + debug_printf("FIFOQueue(%s): tailWriter valid\n", self->name.c_str()); + self->pushBackQueue.sendError(end_of_stream()); + wait(store(qstate, self->pushBackFuture)); + } + else { + qstate = self->getState(); + } + + // Start new tail writer + self->pushBackQueue = PromiseStream(); + self->pushBackFuture = pushBackActor(self, self->pushBackQueue.getFuture()); + + // Start new head writer + self->pushFrontQueue = PromiseStream(); + self->pushFrontFuture = pushFrontActor(self, self->pushFrontQueue.getFuture()); + + return qstate; + } + + Future flush() { + return flush_impl(this); } IPager2 *pager; @@ -370,13 +542,21 @@ public: int64_t numEntries; int itemsPerPage; - PromiseStream writeQueue; - Future writer; + PromiseStream pushBackQueue; + PromiseStream pushFrontQueue; + Future pushBackFuture; + Future pushFrontFuture; - // Head points to the next location to read - Cursor head; - // Tail points to the next location to write - Cursor tail; + // Head points to the next location to pop(). + // pop() will only return committed records. 
+ Cursor headReader; + // Tail points to the next location to pushBack() to + Cursor tailWriter; + + // These cursors point to the front and back of the queue block + // chain being created for items sent to pushFront() + Cursor headWriterFront; + Cursor headWriterBack; // For debugging std::string name; @@ -524,6 +704,17 @@ public: typedef FastAllocatedPage Page; typedef FIFOQueue LogicalPageQueueT; + struct DelayedFreePage { + Version version; + LogicalPageID pageID; + + bool operator<(const DelayedFreePage &rhs) const { + return version < rhs.version; + } + }; + + typedef FIFOQueue VersionedLogicalPageQueueT; + // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default COWPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { @@ -615,6 +806,7 @@ public: } self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); + self->delayedFreeList.recover(self, self->pHeader->delayedFreeList, "DelayedFreeListRecovered"); // If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. // If this fails, the backup header is still in tact for the next recovery attempt. @@ -623,7 +815,7 @@ public: wait(self->writeHeaderPage(0, self->headerPage)); // Wait for all outstanding writes to complete - wait(self->writes.signalAndCollapse()); + wait(self->operations.signalAndCollapse()); // Sync header wait(self->pageFile->sync()); @@ -632,6 +824,7 @@ public: // Update the last committed header with the one that was recovered (which is the last known committed header) self->updateCommittedHeader(); + self->addLatestSnapshot(); } else { // Note: If the file contains less than 2 pages but more than 0 bytes then the pager was never successfully committed. 
@@ -659,11 +852,13 @@ public:
 
 		// Create a new free list
 		self->freeList.create(self, self->newPageID().get(), "FreeListNew");
+		self->delayedFreeList.create(self, self->newPageID().get(), "DelayedFreeListNew");
 
-		// The first commit() below will flush the queue and update the queue state in the header,
-		// but since the queue will not be used between now and then its state will not change.
-		// In order to populate lastCommittedHeader, update the header now with the queue's state.
+		// The first commit() below will flush the queues and update the queue states in the header,
+		// but since the queues will not be used between now and then their states will not change.
+		// In order to populate lastCommittedHeader, update the header now with the queue states.
 		self->pHeader->freeList = self->freeList.getState();
+		self->pHeader->delayedFreeList = self->delayedFreeList.getState();
 
 		// Set remaining header bytes to \xff
 		memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size());
@@ -741,7 +936,7 @@ public:
 		// the new content in the cache entry when the write is launched, not when it is completed.
 		// Any waiting readers should not see this write (though this might change)
 		if(cacheEntry.reading()) {
-			// Wait for the read to finish, then start the right.
+			// Wait for the read to finish, then start the write.
 			cacheEntry.writeFuture = map(success(cacheEntry.page), [=](Void) {
 				writePhysicalPage(pageID, data);
 				return Void();
@@ -760,7 +955,7 @@ public:
 		}
 	}
 
-	writes.add(forwardError(cacheEntry.writeFuture, errorPromise));
+	operations.add(forwardError(cacheEntry.writeFuture, errorPromise));
 
 	// Always update the page contents immediately regardless of what happened above.
cacheEntry.page = data; @@ -777,7 +972,7 @@ public: // Free pageID to be used again after the next commit void freePage(LogicalPageID pageID) { - freeList.push(pageID); + freeList.pushBack(pageID); }; // Header pages use a page size of smallestPhysicalBlock @@ -826,17 +1021,32 @@ public: } // Get snapshot as of the most recent committed version of the pager - Reference getReadSnapshot(); + Reference getReadSnapshot(Version v); + void addLatestSnapshot(); + + void setOldestVersion(Version v) { + oldestVersion.set(v); + }; + + Future getOldestVersion() { + return map(recoverFuture, [=](Void) { + return oldestVersion.get(); + }); + }; ACTOR static Future commit_impl(COWPager *self) { // Write old committed header to Page 1 - self->writes.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); + self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); + + // Flush the delayed free list queue to the pager and get the new queue state into the header + // This must be done before flushing the free list as it may free or allocate pages. + wait(store(self->pHeader->delayedFreeList, self->delayedFreeList.flush())); // Flush the free list queue to the pager and get the new queue state into the header wait(store(self->pHeader->freeList, self->freeList.flush())); // Wait for all outstanding writes to complete - wait(self->writes.signalAndCollapse()); + wait(self->operations.signalAndCollapse()); // Sync everything except the header wait(self->pageFile->sync()); @@ -849,6 +1059,7 @@ public: // Update the last committed header for use in the next commit. 
self->updateCommittedHeader(); + self->addLatestSnapshot(); return Void(); } @@ -884,7 +1095,7 @@ public: // Destroy the cache, cancelling reads and writes in progress self->pageCache.destroy(); - wait(ready(self->writes.signal())); + wait(ready(self->operations.signal())); self->pageFile.clear(); @@ -935,6 +1146,39 @@ public: private: ~COWPager() {} + // Expire snapshots up to but not including v + void expireSnapshots(Version v) { + while(snapshots.size() > 1 && snapshots.at(1).version <= v) { + snapshots.front().expired.sendError(transaction_too_old()); + snapshots.pop_front(); + } + } + + ACTOR Future expireActor(COWPager *self) { + state DelayedFreePage upperBound; + + loop { + state Version v = self->oldestVersion.get(); + upperBound.version = v; + self->expireSnapshots(v); + + // Pop things from the delayed free queue until a version >= v is reached + loop { + Optional dfp = wait(self->delayedFreeList.pop(upperBound)); + + if(!dfp.present()) { + break; + } + + self->freeList.pushBack(dfp.get().pageID); + } + + if(self->oldestVersion.get() == v) { + wait(self->oldestVersion.onChange()); + } + } + } + #pragma pack(push, 1) // Header is the format of page 0 of the database struct Header { @@ -942,6 +1186,7 @@ private: uint32_t pageSize; int64_t pageCount; FIFOQueue::QueueState freeList; + FIFOQueue::QueueState delayedFreeList; Version committedVersion; int32_t metaKeySize; @@ -1013,25 +1258,48 @@ private: Promise closedPromise; Promise errorPromise; Future commitFuture; - SignalableActorCollection writes; + SignalableActorCollection operations; Future recoverFuture; - AsyncTrigger leastSnapshotVersionChanged; - std::map snapshotsInUse; + + // The oldest readable snapshot version + AsyncVar oldestVersion; Reference pageFile; LogicalPageQueueT freeList; + VersionedLogicalPageQueueT delayedFreeList; + + struct SnapshotEntry { + Version version; + Promise expired; + Reference snapshot; + }; + + struct SnapshotEntryLessThanVersion { + bool operator() (Version v, 
const SnapshotEntry &snapshot) { + return v < snapshot.version; + } + + bool operator() (const SnapshotEntry &snapshot, Version v) { + return snapshot.version < v; + } + }; + + std::deque snapshots; }; // Prevents pager from reusing freed pages from version until the snapshot is destroyed class COWPagerSnapshot : public IPagerSnapshot, ReferenceCounted { public: - COWPagerSnapshot(COWPager *pager, Key meta, Version version) : pager(pager), metaKey(meta), version(version) { + COWPagerSnapshot(COWPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { } virtual ~COWPagerSnapshot() { } Future> getPhysicalPage(LogicalPageID pageID) { + if(expired.isError()) { + throw expired.getError(); + } return map(pager->readPage(pageID), [=](Reference p) { return Reference(p); }); @@ -1053,17 +1321,34 @@ public: ReferenceCounted::delref(); } -private: COWPager *pager; + Future expired; Version version; Key metaKey; }; -Reference COWPager::getReadSnapshot() { - ++snapshotsInUse[pLastCommittedHeader->committedVersion]; - return Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion)); +// TODO: Add version parameter and search snapshots for result +Reference COWPager::getReadSnapshot(Version v) { + ASSERT(!snapshots.empty()); + + auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); + if(i == snapshots.begin()) { + throw version_invalid(); + } + --i; + return i->snapshot; } +void COWPager::addLatestSnapshot() { + Promise expired; + snapshots.push_back({ + pLastCommittedHeader->committedVersion, + expired, + Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) + }); +} + + // TODO: Move this to a flow header once it is mature. 
struct SplitStringRef { StringRef a; @@ -1490,10 +1775,12 @@ struct RedwoodRecordRef { StringRef k; + // Separate the borrowed key string byte count from the borrowed int field byte count int keyPrefixLen = std::min(prefixLen, base.key.size()); int intFieldPrefixLen = prefixLen - keyPrefixLen; int keySuffixLen = (flags & HAS_KEY_SUFFIX) ? r.readVarInt() : 0; + // If there is a key suffix, reconstitute the complete key into a contiguous string if(keySuffixLen > 0) { k = makeString(keyPrefixLen + keySuffixLen, arena); memcpy(mutateString(k), base.key.begin(), keyPrefixLen); @@ -1565,6 +1852,30 @@ struct RedwoodRecordRef { size(), flagString.c_str(), prefixLen, keySuffixLen, intFieldSuffixLen, valueLen, StringRef((const uint8_t *)this, size()).toHexString().c_str()); } }; + + // Using this class as an alternative for Delta enables reading a DeltaTree while only decoding + // its values, so the Reader does not require the original prev/next ancestors. + struct DeltaValueOnly : Delta { + RedwoodRecordRef apply(const RedwoodRecordRef &base, Arena &arena) const { + Reader r(data()); + + // Skip prefix length + r.readVarInt(); + + // Get value length + int valueLen = (flags & HAS_VALUE) ? r.read() : 0; + + // Skip key suffix length and bytes if exists + if(flags & HAS_KEY_SUFFIX) { + r.readString(r.readVarInt()); + } + + // Skip int field suffix if present + r.readBytes(flags & INT_FIELD_SUFFIX_BITS); + + return RedwoodRecordRef(StringRef(), 0, (flags & HAS_VALUE ? r.readString(valueLen) : Optional()) ); + } + }; #pragma pack(pop) // Compares and orders by key, version, chunk.start, chunk.total. 
@@ -2288,7 +2599,7 @@ public: if(singleVersion) { ASSERT(v == m_lastCommittedVersion); } - Reference snapshot = m_pager->getReadSnapshot(/* v */); + Reference snapshot = m_pager->getReadSnapshot(v); Key m = snapshot->getMetaKey(); return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root, recordVersion)); } @@ -3266,15 +3577,15 @@ private: debug_printf("%s: Beginning commit of version %" PRId64 "\n", self->m_name.c_str(), writeVersion); // Get the latest version from the pager, which is what we will read at - //Version latestVersion = wait(self->m_pager->getLatestVersion()); - //debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); + Version latestVersion = wait(self->m_pager->getLatestVersion()); + debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); if(REDWOOD_DEBUG) { self->printMutationBuffer(mutations); } state RedwoodRecordRef lowerBound = dbBegin.withPageID(self->m_header.root); - VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(/*latestVersion*/), self->m_header.root, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), self->m_header.root, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); debug_printf("CommitSubtree(root) returned %s\n", toString(newRoot).c_str()); ASSERT(newRoot.size() == 1); @@ -4661,7 +4972,11 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { DeltaTree::Cursor fwd = r.getCursor(); DeltaTree::Cursor rev = r.getCursor(); + DeltaTree::Reader rValuesOnly(tree, &prev, &next); + DeltaTree::Cursor fwdValueOnly = rValuesOnly.getCursor(); + ASSERT(fwd.moveFirst()); + ASSERT(fwdValueOnly.moveFirst()); ASSERT(rev.moveLast()); int i = 0; while(1) { @@ -4675,9 +4990,21 @@ TEST_CASE("!/redwood/correctness/unit/deltaTree/RedwoodRecordRef") { printf("Delta: %s\n", 
rev.node->raw->delta().toString().c_str()); ASSERT(false); } + if(fwdValueOnly.get().value != items[i].value) { + printf("forward values-only iterator i=%d\n %s found\n %s expected\n", i, fwdValueOnly.get().toString().c_str(), items[i].toString().c_str()); + printf("Delta: %s\n", fwdValueOnly.node->raw->delta().toString().c_str()); + ASSERT(false); + } ++i; - ASSERT(fwd.moveNext() == rev.movePrev()); - ASSERT(fwd.valid() == rev.valid()); + + bool more = fwd.moveNext(); + ASSERT(fwdValueOnly.moveNext() == more); + ASSERT(rev.movePrev() == more); + + ASSERT(fwd.valid() == more); + ASSERT(fwdValueOnly.valid() == more); + ASSERT(rev.valid() == more); + if(!fwd.valid()) { break; } From be37a7c01d76ac5582a831747f246a584ca72013 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 5 Sep 2019 00:47:57 -0700 Subject: [PATCH 024/184] Added format versioning to COWPager page, header, BTreePage, BTree meta record. Added height to BTree pages. --- fdbserver/IPager.h | 4 +- fdbserver/VersionedBTree.actor.cpp | 120 +++++++++++++++++++++++------ 2 files changed, 100 insertions(+), 24 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 731b32cc3b..74131cb3fe 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -174,8 +174,8 @@ public: // call freePage(pageID), and return the new page id. Otherwise the pageID argument will be returned. 
virtual Future atomicUpdatePage(LogicalPageID pageID, Reference data) = 0; - // Free pageID to be used again after the next commit - virtual void freePage(LogicalPageID pageID) = 0; + // Free pageID to be used again after version v is durable + virtual void freePage(LogicalPageID pageID, Version v) = 0; // Returns the data for a page by LogicalPageID // The data returned will be the later of diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 59e6bb8746..b27140cb20 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -148,13 +148,17 @@ public: index = 0; page = queue->pager->newPageBuffer(); setNext(0, 0); - raw()->endIndex = 0; + auto p = raw(); + p->formatVersion = RawPage::FORMAT_VERSION; + p->endIndex = 0; } void initNewHeadPage(LogicalPageID newPageID) { page = queue->pager->newPageBuffer(); setNext(pageID, index); - raw()->endIndex = queue->itemsPerPage; + auto p = raw(); + p->formatVersion = RawPage::FORMAT_VERSION; + p->endIndex = queue->itemsPerPage; pageID = newPageID; index = queue->itemsPerPage; } @@ -169,6 +173,8 @@ public: #pragma pack(push, 1) struct RawPage { + static constexpr int FORMAT_VERSION = 1; + uint16_t formatVersion; LogicalPageID nextPageID; uint16_t nextIndex; uint16_t endIndex; @@ -197,6 +203,7 @@ public: debug_printf("FIFOQueue(%s): loading page id=%u index=%d\n", queue->name.c_str(), pageID, index); return map(queue->pager->readPage(pageID), [=](Reference p) { page = p; + ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); return Void(); }); } @@ -346,7 +353,7 @@ public: // freePage() must be called after setting the loading future because freePage() might pop from this // queue recursively if the pager's free list is being stored in this queue. 
- queue->pager->freePage(oldPageID); + queue->pager->freePage(oldPageID, 0); } return Optional(result); @@ -840,7 +847,7 @@ public: self->setPageSize(self->desiredPageSize); // Write new header using desiredPageSize - self->pHeader->formatVersion = 1; + self->pHeader->formatVersion = Header::FORMAT_VERSION; self->pHeader->committedVersion = 1; // No meta key until a user sets one and commits self->pHeader->setMetaKey(Key()); @@ -971,8 +978,15 @@ public: } // Free pageID to be used again after the next commit - void freePage(LogicalPageID pageID) { - freeList.pushBack(pageID); + void freePage(LogicalPageID pageID, Version v) { + // If v is older than the oldest version still readable then mark pageID as free as of the next commit + if(v < oldestVersion.get()) { + freeList.pushBack(pageID); + } + else { + // Otherwise add it to the delayed free list + delayedFreeList.pushBack({v, pageID}); + } }; // Header pages use a page size of smallestPhysicalBlock @@ -1035,6 +1049,28 @@ public: }; ACTOR static Future commit_impl(COWPager *self) { + state int addFront = 10 * deterministicRandom()->randomInt(0, 10); + state int addBack = 10 * deterministicRandom()->randomInt(0, 10); + state int remove = 10 * deterministicRandom()->randomInt(0, 20); + state int i; + + for(i = 0; i < addBack; ++i) { + LogicalPageID id = wait(self->newPageID()); + self->freeList.pushBack(id); + } + + for(i = 0; i < addFront; ++i) { + LogicalPageID id = wait(self->newPageID()); + self->freeList.pushFront(id); + } + + for(i = 0; i < remove; ++i) { + Optional id = wait(self->freeList.pop()); + if(!id.present()) { + break; + } + } + // Write old committed header to Page 1 self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); @@ -1182,7 +1218,8 @@ private: #pragma pack(push, 1) // Header is the format of page 0 of the database struct Header { - Version formatVersion; + static constexpr int FORMAT_VERSION = 1; + uint16_t formatVersion; uint32_t pageSize; int64_t pageCount; 
FIFOQueue::QueueState freeList; @@ -1198,6 +1235,7 @@ private: ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); metaKeySize = key.size(); memcpy((uint8_t *)this + sizeof(Header), key.begin(), key.size()); + ASSERT(formatVersion == FORMAT_VERSION); } int size() const { @@ -1267,6 +1305,8 @@ private: Reference pageFile; LogicalPageQueueT freeList; + // The delayed free list will be approximately in Version order. + // TODO: Make this an ordered container some day. VersionedLogicalPageQueueT delayedFreeList; struct SnapshotEntry { @@ -1619,7 +1659,7 @@ struct RedwoodRecordRef { uint32_t start; } chunk; - // If the value is a page ID it will be stored here + // If the value is a single page ID it will be stored here uint8_t bigEndianPageIDSpace[sizeof(LogicalPageID)]; int expectedSize() const { @@ -2077,10 +2117,13 @@ struct BTreePage { typedef DeltaTree BinaryTree; + static constexpr int FORMAT_VERSION = 1; #pragma pack(push,1) struct { + uint16_t formatVersion; uint8_t flags; - uint16_t count; + uint8_t height; + uint16_t itemCount; uint32_t kvBytes; uint8_t extensionPageCount; }; @@ -2117,11 +2160,11 @@ struct BTreePage { std::string toString(bool write, LogicalPageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { std::string r; - r += format("BTreePage op=%s id=%d ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d extPages=%d\n lowerBound: %s\n upperBound: %s\n", - write ? "write" : "read", id, ver, this, (int)flags, (int)count, (int)kvBytes, (int)extensionPageCount, + r += format("BTreePage op=%s id=%d ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + write ? 
"write" : "read", id, ver, this, (int)flags, (int)itemCount, (int)kvBytes, lowerBound->toString().c_str(), upperBound->toString().c_str()); try { - if(count > 0) { + if(itemCount > 0) { // This doesn't use the cached reader for the page but it is only for debugging purposes BinaryTree::Reader reader(&tree(), lowerBound, upperBound); BinaryTree::Cursor c = reader.getCursor(); @@ -2162,10 +2205,12 @@ struct BTreePage { static void makeEmptyPage(Reference page, uint8_t newFlags) { BTreePage *btpage = (BTreePage *)page->begin(); + btpage->formatVersion = BTreePage::FORMAT_VERSION; btpage->flags = newFlags; + btpage->height = 1; btpage->kvBytes = 0; - btpage->count = 0; btpage->extensionPageCount = 0; + btpage->itemCount = 0; btpage->tree().build(nullptr, nullptr, nullptr, nullptr); VALGRIND_MAKE_MEM_DEFINED(page->begin() + btpage->tree().size(), page->size() - btpage->tree().size()); } @@ -2186,7 +2231,7 @@ struct BoundaryAndPage { // Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages. // TODO: Refactor this as an accumulator you add sorted keys to which makes pages. 
-static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, std::vector entries, uint8_t newFlags, IPager2 *pager) { +static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, const std::vector &entries, uint8_t newFlags, int height, IPager2 *pager) { ASSERT(entries.size() > 0); int usablePageSize = pager->getUsablePageSize(); @@ -2315,10 +2360,12 @@ static std::vector buildPages(bool minimalBoundaries, const Red VALGRIND_MAKE_MEM_DEFINED(btPageMem, allocatedSize); } + btPage->formatVersion = BTreePage::FORMAT_VERSION; btPage->flags = newFlags; + btPage->height = height; btPage->kvBytes = kvBytes; - btPage->count = i - start; btPage->extensionPageCount = blockCount - 1; + btPage->itemCount = i - start; int written = btPage->tree().build(&entries[start], &entries[i], &pageLowerBound, &pageUpperBound); if(written > pageSize) { @@ -2378,17 +2425,25 @@ public: typedef FIFOQueue LazyDeleteQueueT; +#pragma pack(push, 1) struct MetaKey { + static constexpr int FORMAT_VERSION = 1; + uint16_t formatVersion; LogicalPageID root; + uint8_t height; LazyDeleteQueueT::QueueState lazyDeleteQueue; + KeyRef asKeyRef() const { return KeyRef((uint8_t *)this, sizeof(MetaKey)); } + void fromKeyRef(KeyRef k) { ASSERT(k.size() == sizeof(MetaKey)); memcpy(this, k.begin(), k.size()); + ASSERT(formatVersion == FORMAT_VERSION); } }; +#pragma pack(pop) struct Counts { Counts() { @@ -2545,13 +2600,15 @@ public: ACTOR static Future init_impl(VersionedBTree *self) { state Version latest = wait(self->m_pager->getLatestVersion()); - debug_printf("Recovered to version %" PRId64 "\n", latest); + debug_printf("Recovered pager to version %" PRId64 "\n", latest); state Key meta = self->m_pager->getMetaKey(); if(meta.size() == 0) { + self->m_header.formatVersion = MetaKey::FORMAT_VERSION; LogicalPageID newRoot = wait(self->m_pager->newPageID()); debug_printf("new root 
page id=%u\n", newRoot); self->m_header.root = newRoot; + self->m_header.height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); makeEmptyPage(page, BTreePage::IS_LEAF); @@ -2570,6 +2627,9 @@ public: self->m_header.fromKeyRef(meta); self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); } + + debug_printf("Recovered btree at version %" PRId64 " height=%d\n", latest, self->m_header.); + self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5); self->m_lastCommittedVersion = latest; return Void(); @@ -2771,6 +2831,19 @@ private: return r + " }"; } + template + static std::string toString(const VectorRef &v) { + std::string r = "{ "; + for(auto &o : v) { + r += toString(o) + ", "; + } + return r + " }"; + } + + static std::string toString(LogicalPageID id) { + return format("%" PRId64, id); + } + // Represents a change to a single key - set, clear, or atomic op struct SingleKeyMutation { // Clear @@ -2957,10 +3030,11 @@ private: childEntries.push_back(entry); } - *pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, self->m_pager); - - debug_printf("Writing a new root level at version %" PRId64 " with %lu children across %lu pages\n", version, childEntries.size(), pages->size()); + int newHeight = pPage->height + 1; + self->m_header.height = newHeight; + *pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, newHeight, self->m_pager); + debug_printf_always("Writing a new root level at version %" PRId64 " height %d with %lu children across %lu pages\n", version, newHeight, childEntries.size(), pages->size()); std::vector ids = wait(writePages(self, *pages, version, self->m_header.root, pPage, &dbEnd, nullptr)); *logicalPageIDs = std::move(ids); } @@ -2968,6 +3042,7 @@ private: return Void(); } + // Write replacement pages for the given originalID, return a set of internal page records that point to the pages. 
ACTOR static Future> writePages(VersionedBTree *self, std::vector pages, Version version, LogicalPageID originalID, const BTreePage *originalPage, const RedwoodRecordRef *upperBound, void *actor_debug) { debug_printf("%p: writePages(): %u @%" PRId64 " -> %lu replacement pages\n", actor_debug, originalID, version, pages.size()); @@ -3072,13 +3147,14 @@ private: }; ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { - debug_printf("readPage() op=read id=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + debug_printf("readPage() op=read id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); wait(delay(0, TaskPriority::DiskRead)); state Reference result = wait(snapshot->getPhysicalPage(id)); state int usablePageSize = result->size(); ++counts.pageReads; state const BTreePage *pTreePage = (const BTreePage *)result->begin(); + ASSERT(pTreePage->formatVersion == BTreePage::FORMAT_VERSION); if(pTreePage->extensionPageCount == 0) { debug_printf("readPage() Found normal page for op=read id=%u @%" PRId64 "\n", id, snapshot->getVersion()); @@ -3343,7 +3419,7 @@ private: return c; } - std::vector newPages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, self->m_pager); + std::vector newPages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, page->height, self->m_pager); pages = std::move(newPages); if(!self->singleVersion) { @@ -3522,7 +3598,7 @@ private: entries.push_back(o); } - std::vector newPages = buildPages(false, *lowerBound, *upperBound, entries, 0, self->m_pager); + std::vector newPages = buildPages(false, *lowerBound, *upperBound, entries, 0, page->height, self->m_pager); pages = std::move(newPages); writeVersion = self->getLastCommittedVersion() + 1; From 
bb280e76db54f6762351528c5ecfb125ad9ad97e Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 27 Sep 2019 15:08:05 -0700 Subject: [PATCH 025/184] Major refactor primarily to change BTree page ids from a single LogicalPageID to multiple, but also refactored write path data structures and memory lifetimes to use Refs and Arenas and carefully avoid unnecessary copying as it involved much of the same code. Pager reads can now explicitly avoid cache pollution. Refactored toString() helpers for easier debug output using common container types. --- fdbserver/IPager.h | 36 +- fdbserver/IndirectShadowPager.actor.cpp | 2 +- fdbserver/IndirectShadowPager.h | 2 +- fdbserver/MemoryPager.actor.cpp | 6 +- fdbserver/VersionedBTree.actor.cpp | 1136 +++++++++++------------ 5 files changed, 566 insertions(+), 616 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 74131cb3fe..e2805770f9 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -33,10 +33,15 @@ #define debug_printf_noop(...) -#if REDWOOD_DEBUG - #define debug_printf debug_printf_always +#if defined(NO_INTELLISENSE) + #if REDWOOD_DEBUG + #define debug_printf debug_printf_always + #else + #define debug_printf debug_printf_noop + #endif #else -#define debug_printf debug_printf_noop + // To get error-checking on debug_printf statements in IDE + #define debug_printf printf #endif #define BEACON fprintf(stderr, "%s: %s line %d \n", __FUNCTION__, __FILE__, __LINE__) @@ -79,7 +84,7 @@ public: class IPagerSnapshot { public: - virtual Future> getPhysicalPage(LogicalPageID pageID) = 0; + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable) = 0; virtual Version getVersion() const = 0; virtual Key getMetaKey() const { @@ -165,25 +170,28 @@ public: // regardless of whether or not it was written to. virtual Future newPageID() = 0; - // Replace the contents of a page with new data. Existing holders of a page reference for pageID - // will see the effects of this write. 
+ // Replace the contents of a page with new data across *all* versions. + // Existing holders of a page reference for pageID, read from any version, + // may see the effects of this write. virtual void updatePage(LogicalPageID pageID, Reference data) = 0; - // Try to atomically update the contents of a page as of the next successful commit() - // If the pager is unable to do this at this time, it may choose to write the data to a new page, - // call freePage(pageID), and return the new page id. Otherwise the pageID argument will be returned. - virtual Future atomicUpdatePage(LogicalPageID pageID, Reference data) = 0; + // Try to atomically update the contents of a page as of version v in the next commit. + // If the pager is unable to do this at this time, it may choose to write the data to a new page ID + // instead and return the new page ID to the caller. Otherwise the original pageID argument will be returned. + // If a new page ID is returned, the old page ID will be freed as of version v + virtual Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) = 0; - // Free pageID to be used again after version v is durable + // Free pageID to be used again after the commit that moves oldestVersion past v virtual void freePage(LogicalPageID pageID, Version v) = 0; - // Returns the data for a page by LogicalPageID + // Returns the latest data (regardless of version) for a page by LogicalPageID // The data returned will be the later of - // - the most recent committed atomic write + // - the most recent committed atomic // - the most recent non-atomic write - virtual Future> readPage(LogicalPageID pageID) = 0; + virtual Future> readPage(LogicalPageID pageID, bool cacheable) = 0; // Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion() + // Note that snapshots at any version may still see the results of updatePage() calls. // The snapshot shall be usable until setOldVersion() is called with a version > v. 
virtual Reference getReadSnapshot(Version v) = 0; diff --git a/fdbserver/IndirectShadowPager.actor.cpp b/fdbserver/IndirectShadowPager.actor.cpp index 7a6457a3f8..5a525b17af 100644 --- a/fdbserver/IndirectShadowPager.actor.cpp +++ b/fdbserver/IndirectShadowPager.actor.cpp @@ -108,7 +108,7 @@ IndirectShadowPagerSnapshot::IndirectShadowPagerSnapshot(IndirectShadowPager *pa { } -Future> IndirectShadowPagerSnapshot::getPhysicalPage(LogicalPageID pageID) { +Future> IndirectShadowPagerSnapshot::getPhysicalPage(LogicalPageID pageID, bool cacheable) { if(pagerError.isReady()) pagerError.get(); return pager->getPage(Reference::addRef(this), pageID, version); diff --git a/fdbserver/IndirectShadowPager.h b/fdbserver/IndirectShadowPager.h index a711c7ba63..1b097df639 100644 --- a/fdbserver/IndirectShadowPager.h +++ b/fdbserver/IndirectShadowPager.h @@ -70,7 +70,7 @@ class IndirectShadowPagerSnapshot : public IPagerSnapshot, ReferenceCounted> getPhysicalPage(LogicalPageID pageID); + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable); virtual Version getVersion() const { return version; diff --git a/fdbserver/MemoryPager.actor.cpp b/fdbserver/MemoryPager.actor.cpp index 52876ae397..9e6474dd01 100644 --- a/fdbserver/MemoryPager.actor.cpp +++ b/fdbserver/MemoryPager.actor.cpp @@ -61,7 +61,7 @@ private: class MemoryPagerSnapshot : public IPagerSnapshot, ReferenceCounted { public: MemoryPagerSnapshot(MemoryPager *pager, Version version) : pager(pager), version(version) {} - virtual Future> getPhysicalPage(LogicalPageID pageID); + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable); virtual Version getVersion() const { return version; } @@ -155,7 +155,7 @@ int MemoryPage::size() const { const int MemoryPage::PAGE_BYTES = 4096; -Future> MemoryPagerSnapshot::getPhysicalPage(LogicalPageID pageID) { +Future> MemoryPagerSnapshot::getPhysicalPage(LogicalPageID pageID, bool cacheable) { return pager->getPage(pageID, version); } @@ -367,7 +367,7 @@ ACTOR 
Future read(IPager *pager, LogicalPageID pageID, Version version, Ve state int myRead = readNum++; state Reference readSnapshot = pager->getReadSnapshot(version); debug_printf("Read%d\n", myRead); - Reference readPage = wait(readSnapshot->getPhysicalPage(pageID)); + Reference readPage = wait(readSnapshot->getPhysicalPage(pageID, true)); debug_printf("FinishedRead%d\n", myRead); ASSERT(validatePage(readPage, pageID, expectedVersion >= 0 ? expectedVersion : version)); return Void(); diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b27140cb20..f3c577df65 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -41,6 +41,50 @@ #include #include +// Some convenience functions for debugging to stringify various structures +template +std::string toString(const T &o) { + return o.toString(); +} + +std::string toString(LogicalPageID id) { + return format("%" PRId64, id); +} + +template +std::string toString(const Standalone &s) { + return toString((T)s); +} + +template +std::string toString(const T *begin, const T *end) { + std::string r = "{"; + + bool comma = false; + while(begin != end) { + if(comma) { + r += ", "; + } + else { + comma = true; + } + r += toString(*begin++); + } + + r += "}"; + return r; +} + +template +std::string toString(const std::vector &v) { + return toString(v.begin(), v.end()); +} + +template +std::string toString(const VectorRef &v) { + return toString(v.begin(), v.end()); +} + // A FIFO queue of T stored as a linked list of pages. // Operations are popFront(), pushBack(), and pushFront(), and flush(). // Flush() will ensure all queue pages are written to the pager. 
@@ -201,7 +245,7 @@ public: Future loadPage() { debug_printf("FIFOQueue(%s): loading page id=%u index=%d\n", queue->name.c_str(), pageID, index); - return map(queue->pager->readPage(pageID), [=](Reference p) { + return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); return Void(); @@ -573,7 +617,7 @@ int nextPowerOf2(uint32_t x) { return 1 << (32 - clz(x - 1)); } -class FastAllocatedPage : public IPage, ReferenceCounted { +class FastAllocatedPage : public IPage, public FastAllocated, ReferenceCounted { public: // Create a fast-allocated page with size total bytes INCLUDING checksum FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) { @@ -882,20 +926,19 @@ public: return Void(); } - // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. - Reference newPageBuffer() { + Reference newPageBuffer() override { return Reference(new FastAllocatedPage(logicalPageSize, physicalPageSize)); } // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). // For a given pager instance, separate calls to this function must return the same value. - int getUsablePageSize() { + int getUsablePageSize() override { return logicalPageSize - sizeof(FastAllocatedPage::Checksum); } // Get a new, previously available page ID. The page will be considered in-use after the next commit // regardless of whether or not it was written to. 
- Future newPageID() { + Future newPageID() override { Future> nextPageID = freeList.pop(); if(nextPageID.isReady()) { if(nextPageID.get().present()) { @@ -934,7 +977,7 @@ public: return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, (int64_t)pageID * physicalPageSize)); } - void updatePage(LogicalPageID pageID, Reference data) { + void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); debug_printf("COWPager(%s) op=write id=%u cached=%d reading=%d writing=%d\n", filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); @@ -968,17 +1011,18 @@ public: cacheEntry.page = data; } - Future atomicUpdatePage(LogicalPageID pageID, Reference data) { + Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { + // This pager does not support atomic update, so it always allocates and uses a new pageID Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); + freePage(pageID, v); return newPageID; }); return forwardError(f, errorPromise); } - // Free pageID to be used again after the next commit - void freePage(LogicalPageID pageID, Version v) { + void freePage(LogicalPageID pageID, Version v) override { // If v is older than the oldest version still readable then mark pageID as free as of the next commit if(v < oldestVersion.get()) { freeList.pushBack(pageID); @@ -1023,7 +1067,12 @@ public: } // Reads the most recent version of pageID either committed or written using updatePage() - Future> readPage(LogicalPageID pageID) { + Future> readPage(LogicalPageID pageID, bool cacheable) override { + if(!cacheable) { + // TODO: use cached page if present, otherwise read the page and return it but don't add it to the cache + ASSERT(false); + } + PageCacheEntry &cacheEntry = pageCache.get(pageID); debug_printf("COWPager(%s) op=read id=%u cached=%d reading=%d writing=%d\n", 
filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); @@ -1035,14 +1084,14 @@ public: } // Get snapshot as of the most recent committed version of the pager - Reference getReadSnapshot(Version v); - void addLatestSnapshot(); + Reference getReadSnapshot(Version v) override; + void addLatestSnapshot() override; - void setOldestVersion(Version v) { + void setOldestVersion(Version v) override { oldestVersion.set(v); }; - Future getOldestVersion() { + Future getOldestVersion() override { return map(recoverFuture, [=](Void) { return oldestVersion.get(); }); @@ -1100,24 +1149,23 @@ public: return Void(); } - // Make durable all pending page writes and page frees. - Future commit() { + Future commit() override { // Can't have more than one commit outstanding. ASSERT(commitFuture.isReady()); commitFuture = forwardError(commit_impl(this), errorPromise); return commitFuture; } - Key getMetaKey() const { + Key getMetaKey() const override { ASSERT(recoverFuture.isReady()); return pHeader->getMetaKey(); } - void setVersion(Version v) { + void setVersion(Version v) override { pHeader->committedVersion = v; } - void setMetaKey(KeyRef metaKey) { + void setMetaKey(KeyRef metaKey) override { pHeader->setMetaKey(metaKey); } @@ -1143,27 +1191,27 @@ public: delete self; } - void dispose() { + void dispose() override { shutdown(this, true); } - void close() { + void close() override { shutdown(this, false); } - Future getError() { + Future getError() override { return errorPromise.getFuture(); } - Future onClosed() { + Future onClosed() override { return closedPromise.getFuture(); } - Future onClose() { + Future onClose() override { return closedPromise.getFuture(); } - StorageBytes getStorageBytes() { + StorageBytes getStorageBytes() override { ASSERT(recoverFuture.isReady()); int64_t free; int64_t total; @@ -1173,7 +1221,7 @@ public: return StorageBytes(free, total, pagerSize, free + reusable); } - Future getLatestVersion() { + Future 
getLatestVersion() override { return map(recoverFuture, [=](Void) { return pLastCommittedHeader->committedVersion; }); @@ -1227,14 +1275,14 @@ private: Version committedVersion; int32_t metaKeySize; - Key getMetaKey() const { - return KeyRef((const uint8_t *)this + sizeof(Header), metaKeySize); + KeyRef getMetaKey() const { + return KeyRef((const uint8_t *)(this + 1), metaKeySize); } void setMetaKey(StringRef key) { ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); metaKeySize = key.size(); - memcpy((uint8_t *)this + sizeof(Header), key.begin(), key.size()); + memcpy(this + 1, key.begin(), key.size()); ASSERT(formatVersion == FORMAT_VERSION); } @@ -1336,28 +1384,28 @@ public: virtual ~COWPagerSnapshot() { } - Future> getPhysicalPage(LogicalPageID pageID) { + Future> getPhysicalPage(LogicalPageID pageID, bool cacheable) override { if(expired.isError()) { throw expired.getError(); } - return map(pager->readPage(pageID), [=](Reference p) { + return map(pager->readPage(pageID, cacheable), [=](Reference p) { return Reference(p); }); } - Key getMetaKey() const { + Key getMetaKey() const override { return metaKey; } - Version getVersion() const { + Version getVersion() const override { return version; } - void addref() { + void addref() override { ReferenceCounted::addref(); } - void delref() { + void delref() override { ReferenceCounted::delref(); } @@ -1499,6 +1547,10 @@ struct SplitStringRef { }; +// A BTree "page id" is actually a list of LogicalPageID's whose contents should be concatenated together. 
+// NOTE: Uses host byte order +typedef VectorRef BTreePageID; + #define STR(x) LiteralStringRef(x) struct RedwoodRecordRef { typedef uint8_t byte; @@ -1512,12 +1564,7 @@ struct RedwoodRecordRef { : key(arena, toCopy.key), version(toCopy.version), chunk(toCopy.chunk) { if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = ValueRef(arena, toCopy.value.get()); - } + value = ValueRef(arena, toCopy.value.get()); } } @@ -1527,54 +1574,24 @@ struct RedwoodRecordRef { deserializeIntFields(intFields); } - RedwoodRecordRef(const RedwoodRecordRef &toCopy) : key(toCopy.key), version(toCopy.version), chunk(toCopy.chunk) { - if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = toCopy.value; - } - } - } - - RedwoodRecordRef & operator= (const RedwoodRecordRef &toCopy) { - key = toCopy.key; - version = toCopy.version; - chunk = toCopy.chunk; - if(toCopy.value.present()) { - if(toCopy.localValue()) { - setPageID(toCopy.getPageID()); - } - else { - value = toCopy.value; - } - } - - return *this; - } - - bool localValue() const { - return value.get().begin() == bigEndianPageIDSpace; - } - // RedwoodRecordRefs are used for both internal and leaf pages of the BTree. // Boundary records in internal pages are made from leaf records. // These functions make creating and working with internal page records more convenient. 
- inline LogicalPageID getPageID() const { + inline BTreePageID getChildPage() const { ASSERT(value.present()); - return bigEndian32(*(LogicalPageID *)value.get().begin()); + return BTreePageID((LogicalPageID *)value.get().begin(), value.get().size() / sizeof(LogicalPageID)); } - inline void setPageID(LogicalPageID id) { - *(LogicalPageID *)bigEndianPageIDSpace = bigEndian32(id); - value = ValueRef(bigEndianPageIDSpace, sizeof(bigEndianPageIDSpace)); + inline void setChildPage(BTreePageID id) { + value = ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); } - inline RedwoodRecordRef withPageID(LogicalPageID id) const { - RedwoodRecordRef rec(key, version, {}, chunk.total, chunk.start); - rec.setPageID(id); - return rec; + inline void setChildPage(Arena &arena, BTreePageID id) { + value = ValueRef(arena, (const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)); + } + + inline RedwoodRecordRef withPageID(BTreePageID id) const { + return RedwoodRecordRef(key, version, ValueRef((const uint8_t *)id.begin(), id.size() * sizeof(LogicalPageID)), chunk.total, chunk.start); } inline RedwoodRecordRef withoutValue() const { @@ -2098,7 +2115,7 @@ struct RedwoodRecordRef { if(value.present()) { // Assume that values the size of a page ID are page IDs. It's not perfect but it's just for debugging. 
if(value.get().size() == sizeof(LogicalPageID)) { - r += format("[PageID=%u]", getPageID()); + r += format("[PageID=%s]", ::toString(getChildPage()).c_str()); } else { r += format("'%s'", kvformat(value.get(), hexLimit).c_str()); @@ -2125,18 +2142,9 @@ struct BTreePage { uint8_t height; uint16_t itemCount; uint32_t kvBytes; - uint8_t extensionPageCount; }; #pragma pack(pop) - inline LogicalPageID * extensionPages() { - return (LogicalPageID *)(this + 1); - } - - inline const LogicalPageID * extensionPages() const { - return (const LogicalPageID *)(this + 1); - } - int size() const { const BinaryTree *t = &tree(); return (uint8_t *)t - (uint8_t *)this + t->size(); @@ -2147,21 +2155,17 @@ struct BTreePage { } BinaryTree & tree() { - return *(BinaryTree *)(extensionPages() + extensionPageCount); + return *(BinaryTree *)(this + 1); } const BinaryTree & tree() const { - return *(const BinaryTree *)(extensionPages() + extensionPageCount); + return *(const BinaryTree *)(this + 1); } - static inline int GetHeaderSize(int extensionPages = 0) { - return sizeof(BTreePage) + (extensionPages * sizeof(LogicalPageID)); - } - - std::string toString(bool write, LogicalPageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { + std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { std::string r; - r += format("BTreePage op=%s id=%d ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", - write ? "write" : "read", id, ver, this, (int)flags, (int)itemCount, (int)kvBytes, + r += format("BTreePage op=%s id=%s ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + write ? 
"write" : "read", ::toString(id).c_str(), ver, this, (int)flags, (int)itemCount, (int)kvBytes, lowerBound->toString().c_str(), upperBound->toString().c_str()); try { if(itemCount > 0) { @@ -2209,7 +2213,6 @@ static void makeEmptyPage(Reference page, uint8_t newFlags) { btpage->flags = newFlags; btpage->height = 1; btpage->kvBytes = 0; - btpage->extensionPageCount = 0; btpage->itemCount = 0; btpage->tree().build(nullptr, nullptr, nullptr, nullptr); VALGRIND_MAKE_MEM_DEFINED(page->begin() + btpage->tree().size(), page->size() - btpage->tree().size()); @@ -2219,7 +2222,7 @@ BTreePage::BinaryTree::Reader * getReader(Reference page) { return (BTreePage::BinaryTree::Reader *)page->userData; } -struct BoundaryAndPage { +struct BoundaryRefAndPage { Standalone lowerBound; Reference firstPage; std::vector> extPages; @@ -2229,187 +2232,44 @@ struct BoundaryAndPage { } }; -// Returns a std::vector of pairs of lower boundary key indices within kvPairs and encoded pages. -// TODO: Refactor this as an accumulator you add sorted keys to which makes pages. -static std::vector buildPages(bool minimalBoundaries, const RedwoodRecordRef &lowerBound, const RedwoodRecordRef &upperBound, const std::vector &entries, uint8_t newFlags, int height, IPager2 *pager) { - ASSERT(entries.size() > 0); - int usablePageSize = pager->getUsablePageSize(); +#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } - // This is how much space for the binary tree exists in the page, after the header - int pageSize = usablePageSize - BTreePage::GetHeaderSize(); +#pragma pack(push, 1) +template +struct InPlaceArray { + SizeT count; - // Each new block adds (usablePageSize - sizeof(LogicalPageID)) more net usable space *for the binary tree* to pageSize. 
- int netTreeBlockSize = usablePageSize - sizeof(LogicalPageID); - - int blockCount = 1; - std::vector pages; - - int kvBytes = 0; - int compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - - int start = 0; - int i = 0; - const int iEnd = entries.size(); - // Lower bound of the page being added to - RedwoodRecordRef pageLowerBound = lowerBound.withoutValue(); - RedwoodRecordRef pageUpperBound; - - while(i <= iEnd) { - bool end = i == iEnd; - bool flush = end; - - // If not the end, add i to the page if necessary - if(end) { - pageUpperBound = upperBound.withoutValue(); - } - else { - // Get delta from previous record - const RedwoodRecordRef &entry = entries[i]; - int deltaSize = entry.deltaSize((i == start) ? pageLowerBound : entries[i - 1]); - int keySize = entry.key.size(); - int valueSize = entry.value.present() ? entry.value.get().size() : 0; - - int spaceNeeded = sizeof(BTreePage::BinaryTree::Node) + deltaSize; - - debug_printf("Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s\n", - i + 1, entries.size(), i, keySize, valueSize, deltaSize, - spaceNeeded, compressedBytes, pageSize, entry.toString().c_str()); - - int spaceAvailable = pageSize - compressedBytes; - - // Does it fit? - bool fits = spaceAvailable >= spaceNeeded; - - // If it doesn't fit, either end the current page or increase the page size - if(!fits) { - // For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor - int minimumEntries = minimalBoundaries ? 
1 : 4; - int count = i - start; - - // If not enough entries or page less than half full, increase page size to make the entry fit - if(count < minimumEntries || spaceAvailable > pageSize / 2) { - // Figure out how many additional whole or partial blocks are needed - int newBlocks = 1 + (spaceNeeded - spaceAvailable - 1) / netTreeBlockSize; - int newPageSize = pageSize + (newBlocks * netTreeBlockSize); - if(newPageSize <= BTreePage::BinaryTree::MaximumTreeSize()) { - blockCount += newBlocks; - pageSize = newPageSize; - fits = true; - } - } - if(!fits) { - pageUpperBound = entry.withoutValue(); - } - } - - // If the record fits then add it to the page set - if(fits) { - kvBytes += keySize + valueSize; - compressedBytes += spaceNeeded; - ++i; - } - - flush = !fits; - } - - // If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above. - if(flush) { - end = i == iEnd; // i could have been moved above - - int count = i - start; - // If not writing the final page, reduce entry count of page by a third - if(!end) { - i -= count / 3; - pageUpperBound = entries[i].withoutValue(); - } - - // If this isn't the final page, shorten the upper boundary - if(!end && minimalBoundaries) { - int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); - pageUpperBound.truncate(commonPrefix + 1); - } - - debug_printf("Flushing page start=%d i=%d count=%d\nlower: %s\nupper: %s\n", start, i, count, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); -#if REDWOOD_DEBUG - for(int j = start; j < i; ++j) { - debug_printf(" %3d: %s\n", j, entries[j].toString().c_str()); - if(j > start) { - //ASSERT(entries[j] > entries[j - 1]); - } - } - ASSERT(pageLowerBound.key <= pageUpperBound.key); -#endif - - union { - BTreePage *btPage; - uint8_t *btPageMem; - }; - - int allocatedSize; - if(blockCount == 1) { - Reference page = pager->newPageBuffer(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - 
btPageMem = page->mutate(); - allocatedSize = page->size(); - pages.push_back({pageLowerBound, page}); - } - else { - ASSERT(blockCount > 1); - allocatedSize = usablePageSize * blockCount; - btPageMem = new uint8_t[allocatedSize]; - VALGRIND_MAKE_MEM_DEFINED(btPageMem, allocatedSize); - } - - btPage->formatVersion = BTreePage::FORMAT_VERSION; - btPage->flags = newFlags; - btPage->height = height; - btPage->kvBytes = kvBytes; - btPage->extensionPageCount = blockCount - 1; - btPage->itemCount = i - start; - - int written = btPage->tree().build(&entries[start], &entries[i], &pageLowerBound, &pageUpperBound); - if(written > pageSize) { - fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); - ASSERT(false); - } - - if(blockCount != 1) { - Reference page = pager->newPageBuffer(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - - const uint8_t *rptr = btPageMem; - memcpy(page->mutate(), rptr, usablePageSize); - rptr += usablePageSize; - - std::vector> extPages; - for(int b = 1; b < blockCount; ++b) { - Reference extPage = pager->newPageBuffer(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); - - //debug_printf("block %d write offset %d\n", b, firstBlockSize + (b - 1) * usablePageSize); - memcpy(extPage->mutate(), rptr, usablePageSize); - rptr += usablePageSize; - extPages.push_back(std::move(extPage)); - } - - pages.push_back({std::move(pageLowerBound), std::move(page), std::move(extPages)}); - delete btPageMem; - } - - if(end) - break; - start = i; - kvBytes = 0; - compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - pageLowerBound = pageUpperBound.withoutValue(); - } + const T * begin() const { + return (T *)(this + 1); + } + + T * begin() { + return (T *)(this + 1); } - //debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size()); - return pages; -} + const T * end() const { + 
return begin() + count; + } + + T * end() { + return begin() + count; + } -#define NOT_IMPLEMENTED { UNSTOPPABLE_ASSERT(false); } + VectorRef get() { + return VectorRef(begin(), count); + } + + void set(VectorRef v, int availableSpace) { + ASSERT(sizeof(T) * v.size() <= availableSpace); + count = v.size(); + memcpy(begin(), v.begin(), sizeof(T) * v.size()); + } + + int extraSize() const { + return count * sizeof(T); + } +}; +#pragma pack(pop) class VersionedBTree : public IVersionedStore { public: @@ -2429,16 +2289,15 @@ public: struct MetaKey { static constexpr int FORMAT_VERSION = 1; uint16_t formatVersion; - LogicalPageID root; uint8_t height; LazyDeleteQueueT::QueueState lazyDeleteQueue; + InPlaceArray root; KeyRef asKeyRef() const { - return KeyRef((uint8_t *)this, sizeof(MetaKey)); + return KeyRef((uint8_t *)this, sizeof(MetaKey) + root.extraSize()); } void fromKeyRef(KeyRef k) { - ASSERT(k.size() == sizeof(MetaKey)); memcpy(this, k.begin(), k.size()); ASSERT(formatVersion == FORMAT_VERSION); } @@ -2605,14 +2464,15 @@ public: state Key meta = self->m_pager->getMetaKey(); if(meta.size() == 0) { self->m_header.formatVersion = MetaKey::FORMAT_VERSION; - LogicalPageID newRoot = wait(self->m_pager->newPageID()); - debug_printf("new root page id=%u\n", newRoot); - self->m_header.root = newRoot; + LogicalPageID id = wait(self->m_pager->newPageID()); + BTreePageID newRoot((LogicalPageID *)&id, 1); + debug_printf("new root page id=%s\n", toString(newRoot).c_str()); + self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header)); self->m_header.height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); makeEmptyPage(page, BTreePage::IS_LEAF); - self->writePage(self->m_header.root, page, latest, &dbBegin, &dbEnd); + self->m_pager->updatePage(id, page); self->m_pager->setVersion(latest); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); @@ -2628,7 +2488,7 @@ public: self->m_lazyDeleteQueue.recover(self->m_pager, 
self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); } - debug_printf("Recovered btree at version %" PRId64 " height=%d\n", latest, self->m_header.); + debug_printf("Recovered btree at version %" PRId64 " height=%d\n", latest); self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5); self->m_lastCommittedVersion = latest; @@ -2661,7 +2521,8 @@ public: } Reference snapshot = m_pager->getReadSnapshot(v); Key m = snapshot->getMetaKey(); - return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root, recordVersion)); + + return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root.get(), recordVersion)); } // Must be nondecreasing @@ -2695,19 +2556,29 @@ public: } private: - void writePage(LogicalPageID id, Reference page, Version ver, const RedwoodRecordRef *pageLowerBound, const RedwoodRecordRef *pageUpperBound) { - debug_printf("writePage(): %s\n", ((const BTreePage *)page->begin())->toString(true, id, ver, pageLowerBound, pageUpperBound).c_str()); - m_pager->updatePage(id, page); //, ver); - } + struct VersionAndChildrenRef { + VersionAndChildrenRef(Version v, VectorRef children, RedwoodRecordRef upperBound) + : version(v), children(children), upperBound(upperBound) { + } + + VersionAndChildrenRef(Arena &arena, const VersionAndChildrenRef &toCopy) + : version(toCopy.version), children(arena, toCopy.children), upperBound(arena, toCopy.upperBound) { + } + + int expectedSize() const { + return children.expectedSize() + upperBound.expectedSize(); + } + + std::string toString() const { + return format("{version=%" PRId64 " upperBound=%s children=%s}", version, ::toString(children).c_str(), upperBound.toString().c_str()); + } - // TODO: Don't use Standalone - struct VersionedChildPageSet { Version version; - std::vector> children; - Standalone upperBound; + VectorRef children; + RedwoodRecordRef upperBound; }; - typedef std::vector VersionedChildrenT; + typedef VectorRef VersionedChildrenT; // Utility class for building a vector of 
internal page entries. // Entries must be added in version order. Modified will be set to true @@ -2721,6 +2592,8 @@ private: { } + private: + // This must be called internally, on records whose arena has already been added to the entries arena inline void addEntry(const RedwoodRecordRef &rec) { if(rec.value.present()) { ++childPageCount; @@ -2744,10 +2617,11 @@ private: } } - entries.push_back(rec); + entries.push_back(entries.arena(), rec); } - - void addEntries(const VersionedChildPageSet &newSet) { + public: + // Add the child entries from newSet into entries + void addEntries(VersionAndChildrenRef newSet) { // If there are already entries, the last one links to a child page, and its upper bound is not the same // as the first lowerBound in newSet (or newSet is empty, as the next newSet is necessarily greater) // then add the upper bound of the previous set as a value-less record so that on future reads @@ -2805,45 +2679,12 @@ private: } BTreePage::BinaryTree::Cursor cursor; - std::vector> entries; - Standalone lastUpperBound; + Standalone> entries; + RedwoodRecordRef lastUpperBound; bool modified; int childPageCount; - Arena arena; }; - - template - static std::string toString(const T &o) { - return o.toString(); - } - - static std::string toString(const VersionedChildPageSet &c) { - return format("Version=%" PRId64 " children=%s upperBound=%s", c.version, toString(c.children).c_str(), c.upperBound.toString().c_str()); - } - - template - static std::string toString(const std::vector &v) { - std::string r = "{ "; - for(auto &o : v) { - r += toString(o) + ", "; - } - return r + " }"; - } - - template - static std::string toString(const VectorRef &v) { - std::string r = "{ "; - for(auto &o : v) { - r += toString(o) + ", "; - } - return r + " }"; - } - - static std::string toString(LogicalPageID id) { - return format("%" PRId64, id); - } - // Represents a change to a single key - set, clear, or atomic op struct SingleKeyMutation { // Clear @@ -2967,7 +2808,12 @@ 
private: std::string m_name; bool singleVersion; - MetaKey m_header; + // MetaKey changes size so allocate space for it to expand into + union { + uint8_t headerSpace[sizeof(MetaKey) + sizeof(LogicalPageID) * 20]; + MetaKey m_header; + }; + LazyDeleteQueueT m_lazyDeleteQueue; int m_maxPartSize; @@ -3018,102 +2864,231 @@ private: return ib; } - ACTOR static Future buildNewRoot(VersionedBTree *self, Version version, std::vector *pages, std::vector *logicalPageIDs, BTreePage *pPage) { - debug_printf("buildNewRoot start version %" PRId64 ", %lu pages\n", version, pages->size()); + // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) + // TODO: Maybe refactor this as an accumulator you add sorted keys to which precomputes adjacent common prefixes and makes pages. + ACTOR static Future>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, uint8_t newFlags, int height, Version v, BTreePageID previousID) { + ASSERT(entries.size() > 0); + state Standalone> records; - // While there are multiple child pages for this version we must write new tree levels. 
- while(pages->size() > 1) { - std::vector childEntries; - for(int i=0; i < pages->size(); i++) { - RedwoodRecordRef entry = pages->at(i).lowerBound.withPageID(logicalPageIDs->at(i)); - debug_printf("Added new root entry %s\n", entry.toString().c_str()); - childEntries.push_back(entry); - } + // This is how much space for the binary tree exists in the page, after the header + state int blockSize = self->m_pager->getUsablePageSize(); + state int pageSize = blockSize - sizeof(BTreePage); + state int blockCount = 1; - int newHeight = pPage->height + 1; - self->m_header.height = newHeight; - *pages = buildPages(false, dbBegin, dbEnd, childEntries, 0, newHeight, self->m_pager); + state int kvBytes = 0; + state int compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); - debug_printf_always("Writing a new root level at version %" PRId64 " height %d with %lu children across %lu pages\n", version, newHeight, childEntries.size(), pages->size()); - std::vector ids = wait(writePages(self, *pages, version, self->m_header.root, pPage, &dbEnd, nullptr)); - *logicalPageIDs = std::move(ids); - } + state int start = 0; + state int i = 0; + state bool end; - return Void(); - } + // For leaf level where minimal boundaries are used require at least 1 entry, otherwise require 4 to enforce a minimum branching factor + state int minimumEntries = minimalBoundaries ? 1 : 4; + + // Lower bound of the page being added to + state RedwoodRecordRef pageLowerBound = lowerBound->withoutValue(); + state RedwoodRecordRef pageUpperBound; - // Write replacement pages for the given originalID, return a set of internal page records that point to the pages. 
- ACTOR static Future> writePages(VersionedBTree *self, std::vector pages, Version version, LogicalPageID originalID, const BTreePage *originalPage, const RedwoodRecordRef *upperBound, void *actor_debug) { - debug_printf("%p: writePages(): %u @%" PRId64 " -> %lu replacement pages\n", actor_debug, originalID, version, pages.size()); + while(i <= entries.size()) { + end = i == entries.size(); + bool flush = end; - ASSERT(version != 0 || pages.size() == 1); - - state std::vector primaryLogicalPageIDs; - - // TODO: Re-enable this once using pager's atomic replacement - // Reuse original primary page ID if it's not the root or if only one page is being written. - //if(originalID != self->m_root || pages.size() == 1) - // primaryLogicalPageIDs.push_back(originalID); - - // Allocate a primary page ID for each page to be written - while(primaryLogicalPageIDs.size() < pages.size()) { - LogicalPageID id = wait(self->m_pager->newPageID()); - primaryLogicalPageIDs.push_back(id); - } - - debug_printf("%p: writePages(): Writing %lu replacement pages for %d at version %" PRId64 "\n", actor_debug, pages.size(), originalID, version); - state int i; - for(i=0; i> *extPages = &pages[i].extPages; - // If there are extension pages, write all pages using pager directly because this->writePage() is for whole primary pages - if(extPages->size() != 0) { - state BTreePage *newPage = (BTreePage *)pages[i].firstPage->mutate(); - ASSERT(newPage->extensionPageCount == extPages->size()); - - state int e; - state int eEnd = extPages->size(); - for(e = 0; e < eEnd; ++e) { - LogicalPageID eid = wait(self->m_pager->newPageID()); - debug_printf("%p: writePages(): Writing extension page op=write id=%u @%" PRId64 " (%d of %lu) referencePageID=%u\n", actor_debug, eid, version, e + 1, extPages->size(), id); - newPage->extensionPages()[e] = bigEndian32(eid); - // If replacing the primary page below (version == 0) then pass the primary page's ID as the reference page ID - self->m_pager->updatePage(eid, 
extPages->at(e)); //, version, (version == 0) ? id : invalidLogicalPageID); - ++counts.extPageWrites; - } - - debug_printf("%p: writePages(): Writing primary page op=write id=%u @%" PRId64 " (+%lu extension pages)\n", actor_debug, id, version, extPages->size()); - self->m_pager->updatePage(id, pages[i].firstPage); // version); + // If not the end, add i to the page if necessary + if(end) { + pageUpperBound = upperBound->withoutValue(); } else { - debug_printf("%p: writePages(): Writing normal page op=write id=%u @%" PRId64 "\n", actor_debug, id, version); - self->writePage(id, pages[i].firstPage, version, &pages[i].lowerBound, (i == pages.size() - 1) ? upperBound : &pages[i + 1].lowerBound); + // Get delta from previous record + const RedwoodRecordRef &entry = entries[i]; + int deltaSize = entry.deltaSize((i == start) ? pageLowerBound : entries[i - 1]); + int keySize = entry.key.size(); + int valueSize = entry.value.present() ? entry.value.get().size() : 0; + + int spaceNeeded = sizeof(BTreePage::BinaryTree::Node) + deltaSize; + + debug_printf("Trying to add record %3d of %3lu (i=%3d) klen %4d vlen %3d deltaSize %4d spaceNeeded %4d compressed %4d / page %4d bytes %s\n", + i + 1, entries.size(), i, keySize, valueSize, deltaSize, + spaceNeeded, compressedBytes, pageSize, entry.toString().c_str()); + + int spaceAvailable = pageSize - compressedBytes; + + // Does it fit? 
+ bool fits = spaceAvailable >= spaceNeeded; + + // If it doesn't fit, either end the current page or increase the page size + if(!fits) { + int count = i - start; + + // If not enough entries or page less than half full, increase page size to make the entry fit + if(count < minimumEntries || spaceAvailable > pageSize / 2) { + // Figure out how many additional whole or partial blocks are needed + // newBlocks = ceil ( additional space needed / block size) + int newBlocks = 1 + (spaceNeeded - spaceAvailable - 1) / blockSize; + int newPageSize = pageSize + (newBlocks * blockSize); + if(newPageSize <= BTreePage::BinaryTree::MaximumTreeSize()) { + blockCount += newBlocks; + pageSize = newPageSize; + fits = true; + } + } + if(!fits) { + pageUpperBound = entry.withoutValue(); + } + } + + // If the record fits then add it to the page set + if(fits) { + kvBytes += keySize + valueSize; + compressedBytes += spaceNeeded; + ++i; + } + + flush = !fits; + } + + // If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above. 
+ if(flush) { + end = i == entries.size(); // i could have been moved above + + int count = i - start; + // If not writing the final page, reduce entry count of page by a third + if(!end) { + i -= count / 3; + pageUpperBound = entries[i].withoutValue(); + } + + // If this isn't the final page, shorten the upper boundary + if(!end && minimalBoundaries) { + int commonPrefix = pageUpperBound.getCommonPrefixLen(entries[i - 1], 0); + pageUpperBound.truncate(commonPrefix + 1); + } + + state std::vector> pages; + BTreePage *btPage; + + if(blockCount == 1) { + Reference page = self->m_pager->newPageBuffer(); + VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); + btPage = (BTreePage *)page->mutate(); + pages.push_back(std::move(page)); + } + else { + ASSERT(blockCount > 1); + int size = blockSize * blockCount; + btPage = (BTreePage *)new uint8_t[size]; + VALGRIND_MAKE_MEM_DEFINED(btPageMem, size); + } + + btPage->formatVersion = BTreePage::FORMAT_VERSION; + btPage->flags = newFlags; + btPage->height = height; + btPage->kvBytes = kvBytes; + btPage->itemCount = i - start; + + int written = btPage->tree().build(&entries[start], &entries[i], &pageLowerBound, &pageUpperBound); + if(written > pageSize) { + fprintf(stderr, "ERROR: Wrote %d bytes to %d byte page (%d blocks). recs %d kvBytes %d compressed %d\n", written, pageSize, blockCount, i - start, kvBytes, compressedBytes); + ASSERT(false); + } + + // Create chunked pages + // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. + if(blockCount != 1) { + const uint8_t *rptr = (const uint8_t *)btPage; + for(int b = 0; b < blockCount; ++b) { + Reference page = self->m_pager->newPageBuffer(); + VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); + memcpy(page->mutate(), rptr, blockSize); + rptr += blockSize; + pages.push_back(std::move(page)); + } + delete (uint8_t *)btPage; + } + + // Write this btree page, which is made of 1 or more pager pages. 
+ state int p; + state BTreePageID childPageID; + + // If there's still just 1 page, and it's the same size as the original, then reuse original page id(s) + if(end && records.empty() && previousID.size() == pages.size()) { + for(p = 0; p < pages.size(); ++p) { + LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v)); + childPageID.push_back(records.arena(), id); + } + } + else { + // Can't reused the old page IDs, so free the old ones (once) as of version and allocate new ones. + if(records.empty()) { + for(LogicalPageID id : previousID) { + self->m_pager->freePage(id, v); + } + } + for(p = 0; p < pages.size(); ++p) { + LogicalPageID id = wait(self->m_pager->newPageID()); + self->m_pager->updatePage(id, pages[p]); + childPageID.push_back(records.arena(), id); + } + } + + // Update activity counts + ++counts.pageWrites; + if(pages.size() > 1) { + counts.extPageWrites += pages.size() - 1; + } + + debug_printf("Flushing page id=%s original=%s start=%d i=%d count=%d\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), toString(previousID).c_str(), start, i, i - start, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); + if(REDWOOD_DEBUG) { + for(int j = start; j < i; ++j) { + debug_printf(" %3d: %s\n", j, entries[j].toString().c_str()); + } + ASSERT(pageLowerBound.key <= pageUpperBound.key); + } + + // Push a new record onto the results set, without the child page, copying it into the records arena + records.push_back_deep(records.arena(), pageLowerBound.withoutValue()); + // Set the child page value of the inserted record to childPageID, which has already been allocated in records.arena() above + records.back().setChildPage(childPageID); + + if(end) { + break; + } + + start = i; + kvBytes = 0; + compressedBytes = BTreePage::BinaryTree::GetTreeOverhead(); + pageLowerBound = pageUpperBound.withoutValue(); } } - // Free the old extension pages now that all replacement pages have been written - //for(int i = 0; i < 
originalPage->extensionPageCount; ++i) { - //debug_printf("%p: writePages(): Freeing old extension op=del id=%u @latest\n", actor_debug, bigEndian32(originalPage->extensionPages()[i])); - //m_pager->freeLogicalPage(bigEndian32(originalPage->extensionPages()[i]), version); - //} + //debug_printf("buildPages: returning pages.size %lu, kvpairs %lu\n", pages.size(), kvPairs.size()); + return records; + } - return primaryLogicalPageIDs; + ACTOR static Future>> buildNewRoot(VersionedBTree *self, Version version, Standalone> records, int height) { + debug_printf("buildNewRoot start version %" PRId64 ", %lu records\n", version, records.size()); + + // While there are multiple child pages for this version we must write new tree levels. + while(records.size() > 1) { + self->m_header.height = ++height; + Standalone> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, 0, height, version, BTreePageID())); + debug_printf_always("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); + records = newRecords; + } + + return records; } class SuperPage : public IPage, ReferenceCounted { public: - SuperPage(std::vector> pages, int usablePageSize) - : m_size(pages.size() * usablePageSize) { + SuperPage(std::vector> pages) { + int blockSize = pages.front()->size(); + m_size = blockSize * pages.size(); m_data = new uint8_t[m_size]; uint8_t *wptr = m_data; for(auto &p : pages) { - memcpy(wptr, p->begin(), usablePageSize); - wptr += usablePageSize; + ASSERT(p->size() == blockSize); + memcpy(wptr, p->begin(), blockSize); + wptr += blockSize; } } @@ -3143,41 +3118,41 @@ private: private: uint8_t *m_data; - const int m_size; + int m_size; }; - ACTOR static Future> readPage(Reference snapshot, LogicalPageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { - debug_printf("readPage() op=read id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), 
lowerBound->toString().c_str(), upperBound->toString().c_str()); + ACTOR static Future> readPage(Reference snapshot, BTreePageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { + debug_printf("readPage() op=read page id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); wait(delay(0, TaskPriority::DiskRead)); - state Reference result = wait(snapshot->getPhysicalPage(id)); - state int usablePageSize = result->size(); - ++counts.pageReads; - state const BTreePage *pTreePage = (const BTreePage *)result->begin(); - ASSERT(pTreePage->formatVersion == BTreePage::FORMAT_VERSION); + std::vector>> reads; - if(pTreePage->extensionPageCount == 0) { - debug_printf("readPage() Found normal page for op=read id=%u @%" PRId64 "\n", id, snapshot->getVersion()); + for(auto &pageID : id) { + reads.push_back(snapshot->getPhysicalPage(pageID, true)); + } + + ++counts.pageReads; + std::vector> pages = wait(getAll(reads)); + ASSERT(!pages.empty()); + + Reference page; + + if(pages.size() == 1) { + page = pages.front(); } else { - std::vector>> pageGets; - pageGets.push_back(std::move(result)); - - for(int i = 0; i < pTreePage->extensionPageCount; ++i) { - debug_printf("readPage() Reading extension page op=read id=%u @%" PRId64 " ext=%d/%d\n", bigEndian32(pTreePage->extensionPages()[i]), snapshot->getVersion(), i + 1, (int)pTreePage->extensionPageCount); - pageGets.push_back(snapshot->getPhysicalPage(bigEndian32(pTreePage->extensionPages()[i]))); - } - - std::vector> pages = wait(getAll(pageGets)); - counts.extPageReads += pTreePage->extensionPageCount; - result = Reference(new SuperPage(pages, usablePageSize)); - pTreePage = (const BTreePage *)result->begin(); + counts.extPageReads += (pages.size() - 1); + // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. 
+ page = Reference(new SuperPage(pages)); } - if(result->userData == nullptr) { - debug_printf("readPage() Creating Reader for PageID=%u @%" PRId64 " lower=%s upper=%s\n", id, snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - result->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound); - result->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; }; + const BTreePage *pTreePage = (const BTreePage *)page->begin(); + ASSERT(pTreePage->formatVersion == BTreePage::FORMAT_VERSION); + + if(page->userData == nullptr) { + debug_printf("readPage() Creating Reader for page id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + page->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound); + page->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; }; } debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); @@ -3185,24 +3160,33 @@ private: // Nothing should attempt to read bytes in the page outside the BTreePage structure VALGRIND_MAKE_MEM_UNDEFINED(result->begin() + pTreePage->size(), result->size() - pTreePage->size()); - return result; + return page; + } + + void freeBtreePage(BTreePageID btPageID, Version v) { + // Free individual pages at v + for(LogicalPageID id : btPageID) { + m_pager->freePage(id, v); + } } // Returns list of (version, list of (lower_bound, list of children) ) // TODO: Probably should pass prev/next records by pointer in many places - ACTOR static Future commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, LogicalPageID root, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { + 
ACTOR static Future> commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, BTreePageID rootID, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { state std::string context; if(REDWOOD_DEBUG) { - context = format("CommitSubtree(root=%u): ", root); + context = format("CommitSubtree(root=%s): ", toString(rootID).c_str()); } - debug_printf("%s root=%d lower=%s upper=%s\n", context.c_str(), root, lowerBound->toString().c_str(), upperBound->toString().c_str()); - debug_printf("%s root=%d decodeLower=%s decodeUpper=%s\n", context.c_str(), root, decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str()); + state Standalone results; + + debug_printf("%s lower=%s upper=%s\n", context.c_str(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str()); self->counts.commitToPageStart++; // If a boundary changed, the page must be rewritten regardless of KV mutations state bool boundaryChanged = (lowerBound != decodeLowerBound) || (upperBound != decodeUpperBound); - debug_printf("%s id=%u boundaryChanged=%d\n", context.c_str(), root, boundaryChanged); + debug_printf("%s boundaryChanged=%d\n", context.c_str(), boundaryChanged); // Find the slice of the mutation buffer that is relevant to this subtree // TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key while iterating @@ -3218,16 +3202,15 @@ private: // If the key is being mutated, them remove this subtree. 
if(iMutationBoundary == iMutationBoundaryEnd) { if(!iMutationBoundary->second.startKeyMutations.empty()) { - VersionedChildrenT c; - debug_printf("%s id=%u lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; + debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } // If there are no forced boundary changes then this subtree is unchanged. if(!boundaryChanged) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s id=%d page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), root, lowerBound->key.toString().c_str(), toString(c).c_str()); - return c; + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), lowerBound->key.toString().c_str(), toString(results).c_str()); + return results; } } @@ -3241,29 +3224,28 @@ private: iMutationBoundary->first < lowerBound->key) ) ) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s no changes because sole mutation range was not cleared, returning %s\n", context.c_str(), toString(c).c_str()); - return c; + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s no changes because sole mutation range was not cleared, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } self->counts.commitToPage++; - state Reference rawPage = wait(readPage(snapshot, root, decodeLowerBound, decodeUpperBound)); + state Reference rawPage = wait(readPage(snapshot, rootID, decodeLowerBound, 
decodeUpperBound)); state BTreePage *page = (BTreePage *) rawPage->begin(); - debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, root, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); + debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); state BTreePage::BinaryTree::Cursor cursor = getReader(rawPage)->getCursor(); cursor.moveFirst(); - state std::vector pages; - state std::vector newPageIDs; - state VersionedChildrenT results; +// state Standalone> internalRecords; state Version writeVersion; + state bool isRoot = (rootID == self->m_header.root.get()); // Leaf Page if(page->flags & BTreePage::IS_LEAF) { - std::vector merged; + state Standalone> merged; - debug_printf("%s id=%u MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str(), root); + debug_printf("%s MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str()); if(REDWOOD_DEBUG) { self->printMutationBuffer(iMutationBoundary, iMutationBoundaryEnd); } @@ -3301,7 +3283,7 @@ private: while(cursor.valid() && cursor.get().key == iMutationBoundary->first) { // If not in single version mode or there were no changes to the key if(!self->singleVersion || iMutationBoundary->second.noChanges()) { - merged.push_back(cursor.get()); + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), merged.back().toString().c_str()); } else { @@ -3320,7 +3302,7 @@ private: if(iMutations->first < minVersion || minVersion == invalidVersion) minVersion = iMutations->first; ++changes; - merged.push_back(m.toRecord(iMutationBoundary->first, iMutations->first)); + merged.push_back(merged.arena(), m.toRecord(iMutationBoundary->first, iMutations->first)); debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); } else { @@ -3333,7 +3315,7 @@ private: 
while(bytesLeft > 0) { int partSize = std::min(bytesLeft, self->m_maxPartSize); // Don't copy the value chunk because this page will stay in memory until after we've built new version(s) of it - merged.push_back(whole.split(start, partSize)); + merged.push_back(merged.arena(), whole.split(start, partSize)); bytesLeft -= partSize; start += partSize; debug_printf("%s Added split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), merged.back().toString().c_str(), bytesLeft); @@ -3355,7 +3337,7 @@ private: bool remove = self->singleVersion && clearRangeVersion.present(); if(!remove) { - merged.push_back(cursor.get()); + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, middle]\n", context.c_str(), merged.back().toString().c_str()); } else { @@ -3379,7 +3361,7 @@ private: if(clearVersion < minVersion || minVersion == invalidVersion) minVersion = clearVersion; ++changes; - merged.push_back(RedwoodRecordRef(cursor.get().key, clearVersion)); + merged.push_back(merged.arena(), RedwoodRecordRef(cursor.get().key, clearVersion)); debug_printf("%s Added %s [existing, middle clear]\n", context.c_str(), merged.back().toString().c_str()); } cursor = nextCursor; @@ -3392,7 +3374,7 @@ private: // Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range. while(cursor.valid()) { - merged.push_back(cursor.get()); + merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str()); cursor.moveNext(); } @@ -3402,71 +3384,32 @@ private: // No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records. // But if a boundary was changed then we must rewrite the page anyway. 
if(!boundaryChanged && minVersion == invalidVersion) { - VersionedChildrenT c({ {0, {*decodeLowerBound}, *decodeUpperBound} }); - debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(c).c_str()); + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(results).c_str()); ASSERT(changes == 0); - return c; + return results; } // TODO: Make version and key splits based on contents of merged list, if keeping history // If everything in the page was deleted then this page should be deleted as of the new version // Note that if a single range clear covered the entire page then we should not get this far - if(merged.empty() && root != 0) { - // TODO: For multi version mode only delete this page as of the new version - VersionedChildrenT c({}); - debug_printf("%s id=%u All leaf page contents were cleared, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; + if(merged.empty() && !isRoot) { + self->freeBtreePage(rootID, writeVersion); + debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } - std::vector newPages = buildPages(true, *lowerBound, *upperBound, merged, BTreePage::IS_LEAF, page->height, self->m_pager); - pages = std::move(newPages); - - if(!self->singleVersion) { - ASSERT(false); -// // If there isn't still just a single page of data then this page became too large and was split. 
-// // The new split pages will be valid as of minVersion, but the old page remains valid at the old version -// if(pages.size() != 1) { -// results.push_back( {0, {*decodeLowerBound}, ??} ); -// debug_printf("%s Added versioned child set #1: %s\n", context.c_str(), toString(results.back()).c_str()); -// } -// else { -// // The page was updated but not size-split or version-split so the last page version's data -// // can be replaced with the new page contents -// if(pages.size() == 1) -// minVersion = 0; -// } - } - - // Write page(s), get new page IDs writeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : minVersion; - std::vector pageIDs = wait(self->writePages(self, pages, writeVersion, root, page, upperBound, THIS)); - newPageIDs = std::move(pageIDs); - - // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_header.root && pages.size() > 1) { - debug_printf("%s Building new root\n", context.c_str()); - wait(self->buildNewRoot(self, writeVersion, &pages, &newPageIDs, page)); - } - - results.push_back({writeVersion, {}, *upperBound}); - for(int i=0; i> entries = wait(writePages(self, true, lowerBound, upperBound, merged, BTreePage::IS_LEAF, page->height, writeVersion, rootID)); + results.arena().dependsOn(entries.arena()); + results.push_back(results.arena(), VersionAndChildrenRef(writeVersion, entries, *upperBound)); debug_printf("%s Merge complete, returning %s\n", context.c_str(), toString(results).c_str()); - - debug_printf("%s DONE.\n", context.c_str()); return results; } else { // Internal Page - - // TODO: Combine these into one vector and/or do something more elegant - state std::vector> futureChildren; + state std::vector>> futureChildren; bool first = true; while(cursor.valid()) { @@ -3488,8 +3431,8 @@ private: const RedwoodRecordRef &decodeChildLowerBound = cursor.get(); - LogicalPageID pageID = cursor.get().getPageID(); - ASSERT(pageID != 0); 
+ BTreePageID pageID = cursor.get().getChildPage(); + ASSERT(!pageID.empty()); const RedwoodRecordRef &decodeChildUpperBound = cursor.moveNext() ? cursor.get() : *decodeUpperBound; @@ -3500,8 +3443,8 @@ private: const RedwoodRecordRef &childUpperBound = cursor.valid() ? cursor.get() : *upperBound; - debug_printf("%s recursing to PageID=%u lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", - context.c_str(), pageID, childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); + debug_printf("%s recursing to PageID=%s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", + context.c_str(), toString(pageID).c_str(), childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); /* // TODO: If lower bound and upper bound have the same key, do something intelligent if possible @@ -3544,19 +3487,20 @@ private: } if(REDWOOD_DEBUG) { - debug_printf("%s Subtree update results for root PageID=%u\n", context.c_str(), root); + debug_printf("%s Subtree update results\n", context.c_str()); for(int i = 0; i < futureChildren.size(); ++i) { debug_printf("%s subtree result %s\n", context.c_str(), toString(futureChildren[i].get()).c_str()); } } - // TODO: Handle multi-versioned results + // TODO: Either handle multi-versioned results or change commitSubtree interface to return a single child set. 
ASSERT(self->singleVersion); cursor.moveFirst(); + // All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound InternalPageBuilder pageBuilder(cursor); for(int i = 0; i < futureChildren.size(); ++i) { - const VersionedChildrenT &versionedChildren = futureChildren[i].get(); + VersionedChildrenT versionedChildren = futureChildren[i].get(); ASSERT(versionedChildren.size() <= 1); if(!versionedChildren.empty()) { @@ -3570,64 +3514,29 @@ private: if(pageBuilder.modified) { // If the page now has no children if(pageBuilder.childPageCount == 0) { - // If we are the root, write a new empty btree - if(root == 0) { - Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, BTreePage::IS_LEAF); - RedwoodRecordRef rootEntry = dbBegin.withPageID(0); - self->writePage(0, page, self->getLastCommittedVersion() + 1, &dbBegin, &dbEnd); - VersionedChildrenT c({ {0, {dbBegin}, dbEnd } }); - debug_printf("%s id=%u All root page children were deleted, rewrote root as leaf, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; - } - else { - VersionedChildrenT c({}); - debug_printf("%s id=%u All internal page children were deleted #1 so deleting this page too, returning %s\n", context.c_str(), root, toString(c).c_str()); - return c; - } + self->freeBtreePage(rootID, writeVersion); + debug_printf("%s All internal page children were deleted #1 so deleting this page too, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } else { - debug_printf("%s Internal PageID=%u modified, creating replacements.\n", context.c_str(), root); + debug_printf("%s Internal page modified, creating replacements.\n", context.c_str()); debug_printf("%s newChildren=%s lastUpperBound=%s upperBound=%s\n", context.c_str(), toString(pageBuilder.entries).c_str(), pageBuilder.lastUpperBound.toString().c_str(), upperBound->toString().c_str()); ASSERT(pageBuilder.lastUpperBound == *upperBound); - // TODO: Don't 
do this! - std::vector entries; - for(auto &o : pageBuilder.entries) { - entries.push_back(o); - } - - std::vector newPages = buildPages(false, *lowerBound, *upperBound, entries, 0, page->height, self->m_pager); - pages = std::move(newPages); - writeVersion = self->getLastCommittedVersion() + 1; - std::vector pageIDs = wait(writePages(self, pages, writeVersion, root, page, upperBound, THIS)); - newPageIDs = std::move(pageIDs); + Standalone> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, false, lowerBound, upperBound, pageBuilder.entries, 0, page->height, writeVersion, rootID))); - // If this commitSubtree() is operating on the root, write new levels if needed until until we're returning a single page - if(root == self->m_header.root) { - wait(self->buildNewRoot(self, writeVersion, &pages, &newPageIDs, page)); - } - - VersionedChildrenT vc(1); - vc.resize(1); - VersionedChildPageSet &c = vc.front(); - c.version = writeVersion; - c.upperBound = *upperBound; - - for(int i=0; i((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s Page has no changes, returning %s\n", context.c_str(), toString(results).c_str()); + return results; } } } @@ -3653,19 +3562,46 @@ private: debug_printf("%s: Beginning commit of version %" PRId64 "\n", self->m_name.c_str(), writeVersion); // Get the latest version from the pager, which is what we will read at - Version latestVersion = wait(self->m_pager->getLatestVersion()); + state Version latestVersion = wait(self->m_pager->getLatestVersion()); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); if(REDWOOD_DEBUG) { self->printMutationBuffer(mutations); } - state RedwoodRecordRef lowerBound = dbBegin.withPageID(self->m_header.root); - VersionedChildrenT newRoot = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), self->m_header.root, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); - debug_printf("CommitSubtree(root) returned 
%s\n", toString(newRoot).c_str()); - ASSERT(newRoot.size() == 1); + // TODO: Support root page as a BTreePageID in the header instead of just a LogicalPageID + state Standalone rootPageID = self->m_header.root.get(); + state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); + Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), toString(versionedRoots).c_str()); + + // CommitSubtree on the root can only return 1 child at most because the pager interface only supports writing + // one meta record (which contains the root page) per commit. + ASSERT(versionedRoots.size() <= 1); + + // If the old root was deleted, write a new empty tree root node and free the old roots + if(versionedRoots.empty()) { + debug_printf("Writing new empty root.\n"); + LogicalPageID newRootID = wait(self->m_pager->newPageID()); + Reference page = self->m_pager->newPageBuffer(); + makeEmptyPage(page, BTreePage::IS_LEAF); + self->m_pager->updatePage(newRootID, page); + rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1); + } + else { + Standalone> newRootLevel(versionedRoots.front().children, versionedRoots.arena()); + if(newRootLevel.size() == 1) { + rootPageID = newRootLevel.front().getChildPage(); + } + else { + // If the new root level's size is not 1 then build new root level(s) + Standalone> newRootPage = wait(buildNewRoot(self, latestVersion, newRootLevel, self->m_header.height)); + rootPageID = newRootPage.front().getChildPage(); + } + } + + self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); - self->m_header.root = newRoot.front().children.front().getPageID(); self->m_pager->setVersion(writeVersion); wait(store(self->m_header.lazyDeleteQueue, self->m_lazyDeleteQueue.flush())); @@ -3682,7 +3618,7 @@ private: 
self->m_mutationBuffers.erase(self->m_mutationBuffers.begin()); self->m_lastCommittedVersion = writeVersion; - ++self->counts.commits; + ++counts.commits; committed.send(Void()); return Void(); @@ -3697,11 +3633,13 @@ private: // PageCursors can be shared by many InternalCursors, making InternalCursor copying low overhead struct PageCursor : ReferenceCounted, FastAllocated { Reference parent; - LogicalPageID pageID; // Only needed for debugging purposes + BTreePageID pageID; // Only needed for debugging purposes Reference page; BTreePage::BinaryTree::Cursor cursor; - PageCursor(LogicalPageID id, Reference page, Reference parent = {}) + // id will normally reference memory owned by the parent, which is okay because a reference to the parent + // will be held in the cursor + PageCursor(BTreePageID id, Reference page, Reference parent = {}) : pageID(id), page(page), parent(parent), cursor(getReader().getCursor()) { } @@ -3729,7 +3667,7 @@ private: BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); const RedwoodRecordRef &rec = cursor.get(); - LogicalPageID id = rec.getPageID(); + BTreePageID id = rec.getChildPage(); Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); return map(child, [=](Reference page) { return Reference(new PageCursor(id, page, Reference::addRef(this))); @@ -3737,11 +3675,11 @@ private: } std::string toString() const { - return format("PageID=%u, %s", pageID, cursor.valid() ? cursor.get().toString().c_str() : ""); + return format("PageID=%s, %s", ::toString(pageID).c_str(), cursor.valid() ? 
cursor.get().toString().c_str() : ""); } }; - LogicalPageID rootPageID; + Standalone rootPageID; Reference pager; Reference pageCursor; @@ -3749,7 +3687,7 @@ private: InternalCursor() { } - InternalCursor(Reference pager, LogicalPageID root) + InternalCursor(Reference pager, BTreePageID root) : pager(pager), rootPageID(root) { } @@ -3970,7 +3908,7 @@ private: // KeyValueRefs returned become invalid once the cursor is moved class Cursor : public IStoreCursor, public ReferenceCounted, public FastAllocated, NonCopyable { public: - Cursor(Reference pageSource, LogicalPageID root, Version recordVersion) + Cursor(Reference pageSource, BTreePageID root, Version recordVersion) : m_version(recordVersion), m_cur1(pageSource, root), m_cur2(m_cur1) @@ -4823,18 +4761,16 @@ TEST_CASE("!/redwood/correctness/unit/RedwoodRecordRef") { // Test pageID stuff. { - LogicalPageID id = 1; + LogicalPageID ids[] = {1, 5}; + BTreePageID id(ids, 2); RedwoodRecordRef r; - r.setPageID(id); - ASSERT(r.getPageID() == id); - RedwoodRecordRef s; - s = r; - ASSERT(s.getPageID() == id); - RedwoodRecordRef t(r); - ASSERT(t.getPageID() == id); - r.setPageID(id + 1); - ASSERT(s.getPageID() == id); - ASSERT(t.getPageID() == id); + r.setChildPage(id); + ASSERT(r.getChildPage() == id); + ASSERT(r.getChildPage().begin() == id.begin()); + + Standalone r2 = r; + ASSERT(r2.getChildPage() == id); + ASSERT(r2.getChildPage().begin() != id.begin()); } // Testing common prefix calculation for integer fields using the member function that calculates this directly @@ -5472,9 +5408,15 @@ TEST_CASE("!/redwood/correctness/pager/cow") { pager->updatePage(id, p); pager->setMetaKey(LiteralStringRef("asdfasdf")); wait(pager->commit()); - Reference p2 = wait(pager->readPage(id)); + Reference p2 = wait(pager->readPage(id, true)); printf("%s\n", StringRef(p2->begin(), p2->size()).toHexString().c_str()); + // TODO: Verify reads, do more writes and reads to make this a real pager validator + + Future onClosed = 
pager->onClosed(); + pager->close(); + wait(onClosed); + return Void(); } From 848a344aa72bd7d0fd57cae797c506caf8933509 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 27 Sep 2019 22:56:33 -0700 Subject: [PATCH 026/184] DeltaTree building now passes the prev/next common prefix length, which is effectively a subtree shared prefix, to recursive calls, which enables each new prev/next common prefix comparison to start from the position at which the previous call on the stack left off. --- fdbserver/DeltaTree.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index cd6b021e6c..ce584f76f2 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -342,7 +342,7 @@ public: // The boundary leading to the new page acts as the last time we branched right if(begin != end) { - nodeBytes = build(root(), begin, end, prev, next); + nodeBytes = build(root(), begin, end, prev, next, prev->getCommonPrefixLen(*next, 0)); } else { nodeBytes = 0; @@ -351,7 +351,7 @@ public: } private: - static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next) { + static OffsetT build(Node &root, const T *begin, const T *end, const T *prev, const T *next, int subtreeCommon) { //printf("build: %s to %s\n", begin->toString().c_str(), (end - 1)->toString().c_str()); //printf("build: root at %p sizeof(Node) %d delta at %p \n", &root, sizeof(Node), &root.delta()); ASSERT(end != begin); @@ -361,12 +361,8 @@ private: int mid = perfectSubtreeSplitPointCached(count); const T &item = begin[mid]; - // Get the common prefix length between next and prev - // Since mid is between them, we can skip that length to determine the common prefix length - // between mid and prev and between mid and next. 
- int nextPrevCommon = prev->getCommonPrefixLen(*next, 0); - int commonWithPrev = item.getCommonPrefixLen(*prev, nextPrevCommon); - int commonWithNext = item.getCommonPrefixLen(*next, nextPrevCommon); + int commonWithPrev = item.getCommonPrefixLen(*prev, subtreeCommon); + int commonWithNext = item.getCommonPrefixLen(*next, subtreeCommon); bool prefixSourcePrev; int commonPrefix; @@ -391,7 +387,7 @@ private: // Serialize left child if(count > 1) { - wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item); + wptr += build(*(Node *)wptr, begin, begin + mid, prev, &item, commonWithPrev); root.leftChildOffset = deltaSize; } else { @@ -401,7 +397,7 @@ private: // Serialize right child if(count > 2) { root.rightChildOffset = wptr - (uint8_t *)&root.delta(); - wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next); + wptr += build(*(Node *)wptr, begin + mid + 1, end, &item, next, commonWithNext); } else { root.rightChildOffset = 0; From 0a3b7ff909d2c332e547c84020468a72ef524162 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 28 Sep 2019 00:26:57 -0700 Subject: [PATCH 027/184] Cleanup of old or temporary code. 
--- fdbserver/VersionedBTree.actor.cpp | 39 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index f3c577df65..94d7e4c132 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1098,25 +1098,28 @@ public: }; ACTOR static Future commit_impl(COWPager *self) { - state int addFront = 10 * deterministicRandom()->randomInt(0, 10); - state int addBack = 10 * deterministicRandom()->randomInt(0, 10); - state int remove = 10 * deterministicRandom()->randomInt(0, 20); - state int i; + // TODO: Remove this once the free list is in normal use + if(g_network->isSimulated()) { + state int addFront = 10 * deterministicRandom()->randomInt(0, 10); + state int addBack = 10 * deterministicRandom()->randomInt(0, 10); + state int remove = 10 * deterministicRandom()->randomInt(0, 20); + state int i; - for(i = 0; i < addBack; ++i) { - LogicalPageID id = wait(self->newPageID()); - self->freeList.pushBack(id); - } + for(i = 0; i < addBack; ++i) { + LogicalPageID id = wait(self->newPageID()); + self->freeList.pushBack(id); + } - for(i = 0; i < addFront; ++i) { - LogicalPageID id = wait(self->newPageID()); - self->freeList.pushFront(id); - } + for(i = 0; i < addFront; ++i) { + LogicalPageID id = wait(self->newPageID()); + self->freeList.pushFront(id); + } - for(i = 0; i < remove; ++i) { - Optional id = wait(self->freeList.pop()); - if(!id.present()) { - break; + for(i = 0; i < remove; ++i) { + Optional id = wait(self->freeList.pop()); + if(!id.present()) { + break; + } } } @@ -1676,9 +1679,6 @@ struct RedwoodRecordRef { uint32_t start; } chunk; - // If the value is a single page ID it will be stored here - uint8_t bigEndianPageIDSpace[sizeof(LogicalPageID)]; - int expectedSize() const { return key.expectedSize() + value.expectedSize(); } @@ -3060,7 +3060,6 @@ private: } } - //debug_printf("buildPages: returning pages.size %lu, kvpairs 
%lu\n", pages.size(), kvPairs.size()); return records; } From 2854087118282c15c3621f93a7e8c27856978814 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 28 Sep 2019 13:26:01 -0700 Subject: [PATCH 028/184] Implemented COWPager non-caching page reads. --- fdbserver/VersionedBTree.actor.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 94d7e4c132..b66e4e3d45 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -683,6 +683,16 @@ public: ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit) { } + // Get the object for i if it exists, else return nullptr. + // If the object exists, its eviction order will NOT change as this is not a cache hit. + ObjectType * getIfExists(const IndexType &index) { + auto i = cache.find(index); + if(i != cache.end()) { + return &i->second.item; + } + return nullptr; + } + // Get the object for i or create a new one. // After a get(), the object for i is the last in evictionOrder. ObjectType & get(const IndexType &index) { @@ -1068,9 +1078,15 @@ public: // Reads the most recent version of pageID either committed or written using updatePage() Future> readPage(LogicalPageID pageID, bool cacheable) override { + // Use cached page if present, without triggering a cache hit. 
+ // Otherwise, read the page and return it but don't add it to the cache if(!cacheable) { - // TODO: use cached page if present, otherwise read the page and return it but don't add it to the cache - ASSERT(false); + PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); + if(pCacheEntry != nullptr) { + return pCacheEntry->page; + } + + return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } PageCacheEntry &cacheEntry = pageCache.get(pageID); From 24e03a55ad8c35fb518c0a768682e071bbebd692 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Sat, 28 Sep 2019 13:27:00 -0700 Subject: [PATCH 029/184] Some code cleanup and updated TODOs. --- fdbserver/IVersionedStore.h | 3 --- fdbserver/VersionedBTree.actor.cpp | 17 +++-------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index dd7b0f4bea..d991073b2d 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -37,11 +37,8 @@ public: virtual bool isValid() = 0; virtual KeyRef getKey() = 0; - //virtual StringRef getCompressedKey() = 0; virtual ValueRef getValue() = 0; - virtual void invalidateReturnedStrings() = 0; - virtual void addref() = 0; virtual void delref() = 0; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b66e4e3d45..4bdb4e13ab 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1434,7 +1434,6 @@ public: Key metaKey; }; -// TODO: Add version parameter and search snapshots for result Reference COWPager::getReadSnapshot(Version v) { ASSERT(!snapshots.empty()); @@ -1691,7 +1690,7 @@ struct RedwoodRecordRef { Version version; struct { uint32_t total; - // TODO: Change start to chunk number. + // TODO: Change start to chunk number? 
uint32_t start; } chunk; @@ -2528,8 +2527,7 @@ public: // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same // write version, OR it may represent a snapshot as of the call to readAtVersion(). virtual Reference readAtVersion(Version v) { - // TODO: Use the buffer to return uncommitted data - // For now, only committed versions can be read. + // Only committed versions can be read. Version recordVersion = singleVersion ? 0 : v; ASSERT(v <= m_lastCommittedVersion); if(singleVersion) { @@ -2881,7 +2879,6 @@ private: } // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) - // TODO: Maybe refactor this as an accumulator you add sorted keys to which precomputes adjacent common prefixes and makes pages. ACTOR static Future>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, uint8_t newFlags, int height, Version v, BTreePageID previousID) { ASSERT(entries.size() > 0); state Standalone> records; @@ -3185,8 +3182,7 @@ private: } } - // Returns list of (version, list of (lower_bound, list of children) ) - // TODO: Probably should pass prev/next records by pointer in many places + // Returns list of (version, internal page records, required upper bound) ACTOR static Future> commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, BTreePageID rootID, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { state std::string context; if(REDWOOD_DEBUG) { @@ -3584,7 +3580,6 @@ private: self->printMutationBuffer(mutations); } - // TODO: Support root page as a BTreePageID in the header instead of just a LogicalPageID state Standalone rootPageID = self->m_header.root.get(); state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); 
Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); @@ -3964,16 +3959,10 @@ private: return m_kv.get().key; } - //virtual StringRef getCompressedKey() = 0; virtual ValueRef getValue() { return m_kv.get().value; } - // TODO: Either remove this method or change the contract so that key and value strings returned are still valid after the cursor is - // moved and allocate them in some arena that this method resets. - virtual void invalidateReturnedStrings() { - } - std::string toString() const { std::string r; r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); From 30c56536bd3b6a438880042e44fdf1e6342d6838 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 1 Oct 2019 02:06:00 -0700 Subject: [PATCH 030/184] Refactored FIFOQueue to support fixed or variable-sized types. Bug fixes in page deletion and lazy delete queuing. --- fdbserver/VersionedBTree.actor.cpp | 257 +++++++++++++++++++---------- 1 file changed, 169 insertions(+), 88 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 4bdb4e13ab..3f2bed1ab6 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -42,6 +42,8 @@ #include // Some convenience functions for debugging to stringify various structures +// Classes can add compatibility by either specializing toString or implementing +// std::string toString() const; template std::string toString(const T &o) { return o.toString(); @@ -86,9 +88,10 @@ std::string toString(const VectorRef &v) { } // A FIFO queue of T stored as a linked list of pages. -// Operations are popFront(), pushBack(), and pushFront(), and flush(). +// Operations are pop(), pushBack(), and pushFront(), and flush(). // Flush() will ensure all queue pages are written to the pager. -// popFront() will only return records that have been flushed. 
+// pop() will only return records that have been flushed, and pops +// from the front of the queue. // // Each page contains some number of T items and a link to the next page. // When the queue is flushed, the last page in the chain is ended and linked to a newly allocated @@ -101,21 +104,65 @@ std::string toString(const VectorRef &v) { // // The write pattern is designed such that no written/updated yet not fsync'd page is ever // expected to be valid. -template -class FIFOQueue { - static_assert(std::is_trivially_copyable::value); +// +// Requirements on T +// - must be trivially copyable +// OR have a specialization for FIFOQueueCodec +// OR have the following methods +// // Deserialize from src into *this, return number of bytes from src consumed +// int readFromBytes(const uint8_t *src); +// // Return the size of *this serialized +// int bytesNeeded() const; +// // Serialize *this to dst, return number of bytes written to dst +// int writeToBytes(uint8_t *dst) const; +// - must be supported by toString(object) by either a toString specialization +// OR implement the toString method: +// std::string toString() const; +template +struct FIFOQueueCodec { + static T readFromBytes(const uint8_t *src, int &bytesRead) { + T x; + bytesRead = x.readFromBytes(src); + return x; + } + static int bytesNeeded(const T &x) { + return x.bytesNeeded(); + } + static int writeToBytes(uint8_t *dst, const T &x) { + return x.writeToBytes(dst); + } +}; + +template +struct FIFOQueueCodec::value>::type> { + static_assert(std::is_trivially_copyable::value); + static T readFromBytes(const uint8_t *src, int &bytesRead) { + bytesRead = sizeof(T); + return *(T *)src; + } + static int bytesNeeded(const T &x) { + return sizeof(T); + } + static int writeToBytes(uint8_t *dst, const T &x) { + *(T *)dst = x; + return sizeof(T); + } +}; + +template> +class FIFOQueue { public: #pragma pack(push, 1) struct QueueState { LogicalPageID headPageID = invalidLogicalPageID; LogicalPageID tailPageID = 
invalidLogicalPageID; - uint16_t headIndex; + uint16_t headOffset; // Note that there is no tail index because the tail page is always never-before-written and its index will start at 0 int64_t numPages; int64_t numEntries; std::string toString() const { - return format("head: %u:%d tail: %u numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headIndex, tailPageID, numPages, numEntries); + return format("head: page %u offset %d tail: page %u numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headOffset, tailPageID, numPages, numEntries); } }; #pragma pack(pop) @@ -123,7 +170,7 @@ public: struct Cursor { // These can change when loading transitions from not ready to ready LogicalPageID pageID; - int index; + int offset; Reference page; FIFOQueue *queue; @@ -144,7 +191,7 @@ public: Cursor & operator=(const Cursor &c) { ASSERT(c.notLoading()); pageID = c.pageID; - index = c.index; + offset = c.offset; page = c.page; queue = c.queue; endPageID = c.endPageID; @@ -170,16 +217,16 @@ public: queue = q; // Initially the page is invalid and the index is 0 initNewHeadPage(invalidLogicalPageID); - index = 0; + offset = 0; loading = Void(); } // Initializes a cursor that will read in the forward direction starting from pageID p, index i up to but not touching pageID end - void initRead(FIFOQueue *q, LogicalPageID p, int i, LogicalPageID end) { - debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u index=%d end page id=%u\n", q->name.c_str(), p, i, end); + void initRead(FIFOQueue *q, LogicalPageID p, int o, LogicalPageID end) { + debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u offset=%d end page id=%u\n", q->name.c_str(), p, o, end); queue = q; pageID = p; - index = i; + offset = o; endPageID = end; // If cursor is not pointed at the end page then start loading it. 
@@ -189,22 +236,22 @@ public: void initNewTailPage(LogicalPageID newPageID) { pageID = newPageID; - index = 0; + offset = 0; page = queue->pager->newPageBuffer(); setNext(0, 0); auto p = raw(); p->formatVersion = RawPage::FORMAT_VERSION; - p->endIndex = 0; + p->endOffset = 0; } void initNewHeadPage(LogicalPageID newPageID) { page = queue->pager->newPageBuffer(); - setNext(pageID, index); + setNext(pageID, offset); auto p = raw(); p->formatVersion = RawPage::FORMAT_VERSION; - p->endIndex = queue->itemsPerPage; pageID = newPageID; - index = queue->itemsPerPage; + offset = queue->dataBytesPerPage; + p->endOffset = offset; } Future onNotLoading() const { @@ -220,11 +267,10 @@ public: static constexpr int FORMAT_VERSION = 1; uint16_t formatVersion; LogicalPageID nextPageID; - uint16_t nextIndex; - uint16_t endIndex; - - inline T & at(int i) { - return ((T *)(this + 1))[i]; + uint16_t nextOffset; + uint16_t endOffset; + uint8_t * begin() { + return (uint8_t *)(this + 1); } }; #pragma pack(pop) @@ -233,18 +279,18 @@ public: return ((RawPage *)(page->begin())); } - void setNext(LogicalPageID pageID, int index) { + void setNext(LogicalPageID pageID, int offset) { RawPage *p = raw(); p->nextPageID = pageID; - p->nextIndex = index; + p->nextOffset = offset; } void setNext(const Cursor &cursor) { - setNext(cursor.pageID, cursor.index); + setNext(cursor.pageID, cursor.offset); } Future loadPage() { - debug_printf("FIFOQueue(%s): loading page id=%u index=%d\n", queue->name.c_str(), pageID, index); + debug_printf("FIFOQueue(%s): loading page id=%u offset=%d\n", queue->name.c_str(), pageID, offset); return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); @@ -288,12 +334,8 @@ public: return loading; } - bool operator== (const Cursor &rhs) { - return pageID == rhs.pageID && index == rhs.index; - } - bool empty() { - return raw()->endIndex == 0; + return raw()->endOffset == 0; } void writePage() { @@ 
-308,21 +350,19 @@ public: } Future writeTail(const T &item) { - // If the cursor is loaded already, write the item and move to the next slot - if(loading.isReady()) { - debug_printf("FIFOQueue(%s): writeTail to %u:%d\n", queue->name.c_str(), pageID, index); - auto p = raw(); - p->at(index) = item; - ++queue->numEntries; - ++index; - p->endIndex = index; - if(index == queue->itemsPerPage) { - newTailPage(); - } - return Void(); + ASSERT(loading.isReady()); + debug_printf("FIFOQueue(%s): writeTail(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); + auto p = raw(); + int bytesNeeded = Codec::bytesNeeded(item); + if(offset + bytesNeeded > queue->dataBytesPerPage) { + newTailPage(); + return waitThenWriteTail(this, item); } - - return waitThenWriteTail(this, item); + Codec::writeToBytes(p->begin() + offset, item); + ++queue->numEntries; + offset += bytesNeeded; + p->endOffset = offset; + return Void(); } ACTOR static Future waitThenWriteHead(Cursor *self, T item) { @@ -332,22 +372,18 @@ public: } Future writeHead(const T &item) { - // If the cursor is loaded already, write the item and move to the next slot - if(loading.isReady()) { - debug_printf("FIFOQueue(%s): writeHead to %u:%d\n", queue->name.c_str(), pageID, index); - if(index == 0) { - newHeadPage(); - } - else { - --index; - auto p = raw(); - p->at(index) = item; - ++queue->numEntries; - return Void(); - } + ASSERT(loading.isReady()); + debug_printf("FIFOQueue(%s): writeHead(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); + int bytesNeeded = Codec::bytesNeeded(item); + if(offset < bytesNeeded) { + newHeadPage(); + return waitThenWriteHead(this, item); } - - return waitThenWriteHead(this, item); + offset -= bytesNeeded; + auto p = raw(); + Codec::writeToBytes(p->begin() + offset, item); + ++queue->numEntries; + return Void(); } ACTOR static Future> waitThenMoveNext(Cursor *self, Optional upperBound) { @@ -375,22 +411,24 @@ public: // If loading is 
ready, read an item and move forward if(loading.isReady()) { auto p = raw(); - T result = p->at(index); + int bytesRead; + T result = Codec::readFromBytes(p->begin() + offset, bytesRead); if(upperBound.present() && upperBound.get() < result) { - debug_printf("FIFOQueue(%s) read cursor page id=%u index=%d endIndex=%d exceeds upper bound\n", queue->name.c_str(), pageID, index, p->endIndex); + debug_printf("FIFOQueue(%s) not popping %s from page id=%u offset=%d endOffset=%d - exceeds upper bound %s\n", + queue->name.c_str(), toString(result).c_str(), pageID, offset, p->endOffset, toString(upperBound.get()).c_str()); return Optional(); } - debug_printf("FIFOQueue(%s) read cursor pop from page id=%u index=%d endIndex=%d\n", queue->name.c_str(), pageID, index, p->endIndex); + debug_printf("FIFOQueue(%s) popped %s from page id=%u offset=%d endOffset=%d\n", queue->name.c_str(), toString(result).c_str(), pageID, offset, p->endOffset); --queue->numEntries; - ++index; + offset += bytesRead; // If this page is out of items, start reading the next one - if(index == p->endIndex) { + if(offset == p->endOffset) { LogicalPageID oldPageID = pageID; pageID = p->nextPageID; - index = p->nextIndex; + offset = p->nextOffset; --queue->numPages; debug_printf("FIFOQueue(%s) advancing to next page id=%u endPageID=%u\n", queue->name.c_str(), pageID, endPageID); loading = (pageID == endPageID) ? 
Future() : loadPage(); @@ -421,7 +459,7 @@ public: name = queueName; numPages = 1; numEntries = 0; - itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); + dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); tailWriter.initWriteTail(this, newPageID); headReader.initRead(this, newPageID, 0, newPageID); ASSERT(flush().isReady()); @@ -434,9 +472,9 @@ public: name = queueName; numPages = qs.numPages; numEntries = qs.numEntries; - itemsPerPage = (pager->getUsablePageSize() - sizeof(typename Cursor::RawPage)) / sizeof(T); + dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); tailWriter.initWriteTail(this, qs.tailPageID); - headReader.initRead(this, qs.headPageID, qs.headIndex, qs.tailPageID); + headReader.initRead(this, qs.headPageID, qs.headOffset, qs.tailPageID); ASSERT(flush().isReady()); } @@ -446,10 +484,10 @@ public: QueueState getState() const { // It only makes sense to save queue state when the tail cursor points to a new empty page - ASSERT(tailWriter.index == 0); + ASSERT(tailWriter.offset == 0); QueueState s; - s.headIndex = headReader.index; + s.headOffset = headReader.offset; s.headPageID = headReader.pageID; s.tailPageID = tailWriter.pageID; s.numEntries = numEntries; @@ -539,12 +577,12 @@ public: } void pushBack(const T &item) { - debug_printf("FIFOQueue(%s): pushBack\n", name.c_str()); + debug_printf("FIFOQueue(%s): pushBack(%s)\n", name.c_str(), toString(item).c_str()); pushBackQueue.send(item); } void pushFront(const T &item) { - debug_printf("FIFOQueue(%s): pushFront\n", name.c_str()); + debug_printf("FIFOQueue(%s): pushFront(%s)\n", name.c_str(), toString(item).c_str()); pushFrontQueue.send(item); } @@ -591,7 +629,7 @@ public: IPager2 *pager; int64_t numPages; int64_t numEntries; - int itemsPerPage; + int dataBytesPerPage; PromiseStream pushBackQueue; PromiseStream pushFrontQueue; @@ -772,6 +810,10 @@ public: bool operator<(const DelayedFreePage &rhs) 
const { return version < rhs.version; } + + std::string toString() const { + return format("{page id=%u @%" PRId64 "}", pageID, version); + } }; typedef FIFOQueue VersionedLogicalPageQueueT; @@ -2295,7 +2337,35 @@ public: struct LazyDeleteQueueEntry { Version version; - LogicalPageID pageID; + Standalone pageID; + + bool operator< (const LazyDeleteQueueEntry &rhs) { + return version < rhs.version; + } + + int readFromBytes(const uint8_t *src) { + version = *(Version *)src; + src += sizeof(Version); + int count = *src++; + pageID = BTreePageID((LogicalPageID *)src, count); + return bytesNeeded(); + } + + int bytesNeeded() const { + return sizeof(Version) + 1 + (pageID.size() * sizeof(LogicalPageID)); + } + + int writeToBytes(uint8_t *dst) const { + *(Version *)dst = version; + dst += sizeof(Version); + *dst++ = pageID.size(); + memcpy(dst, pageID.begin(), pageID.size() * sizeof(LogicalPageID)); + return bytesNeeded(); + } + + std::string toString() const { + return format("{page id=%s @%" PRId64 "}", ::toString(pageID).c_str(), version); + } }; typedef FIFOQueue LazyDeleteQueueT; @@ -3022,7 +3092,8 @@ private: state int p; state BTreePageID childPageID; - // If there's still just 1 page, and it's the same size as the original, then reuse original page id(s) + // If we are only writing 1 page and it has the same BTreePageID size as the original they try to reuse the + // LogicalPageIDs in previousID and try to update them atomically. if(end && records.empty() && previousID.size() == pages.size()) { for(p = 0; p < pages.size(); ++p) { LogicalPageID id = wait(self->m_pager->atomicUpdatePage(previousID[p], pages[p], v)); @@ -3030,11 +3101,12 @@ private: } } else { - // Can't reused the old page IDs, so free the old ones (once) as of version and allocate new ones. + // Either the original page is being split, or it's not but it has changed BTreePageID size. 
+ // Either way, there is no point in reusing any of the original page IDs because the parent + // must be rewritten anyway to count for the change in child count or child links. + // Free the old IDs, but only once (before the first output record is added). if(records.empty()) { - for(LogicalPageID id : previousID) { - self->m_pager->freePage(id, v); - } + self->freeBtreePage(previousID, v); } for(p = 0; p < pages.size(); ++p) { LogicalPageID id = wait(self->m_pager->newPageID()); @@ -3183,7 +3255,7 @@ private: } // Returns list of (version, internal page records, required upper bound) - ACTOR static Future> commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, BTreePageID rootID, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { + ACTOR static Future> commitSubtree(VersionedBTree *self, MutationBufferT *mutationBuffer, Reference snapshot, BTreePageID rootID, bool isLeaf, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, const RedwoodRecordRef *decodeLowerBound, const RedwoodRecordRef *decodeUpperBound) { state std::string context; if(REDWOOD_DEBUG) { context = format("CommitSubtree(root=%s): ", toString(rootID).c_str()); @@ -3213,6 +3285,13 @@ private: // If the key is being mutated, them remove this subtree. if(iMutationBoundary == iMutationBoundaryEnd) { if(!iMutationBoundary->second.startKeyMutations.empty()) { + Version firstKeyChangeVersion = self->singleVersion ? 
self->getLastCommittedVersion() + 1 : iMutationBoundary->second.startKeyMutations.begin()->first; + if(isLeaf) { + self->freeBtreePage(rootID, firstKeyChangeVersion); + } + else { + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{firstKeyChangeVersion, rootID}); + } debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); return results; } @@ -3248,12 +3327,12 @@ private: state BTreePage::BinaryTree::Cursor cursor = getReader(rawPage)->getCursor(); cursor.moveFirst(); -// state Standalone> internalRecords; state Version writeVersion; state bool isRoot = (rootID == self->m_header.root.get()); // Leaf Page if(page->flags & BTreePage::IS_LEAF) { + ASSERT(isLeaf); state Standalone> merged; debug_printf("%s MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str()); @@ -3420,6 +3499,7 @@ private: } else { // Internal Page + ASSERT(!isLeaf); state std::vector>> futureChildren; bool first = true; @@ -3487,7 +3567,8 @@ private: futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, &childLowerBound, &childUpperBound)); } */ - futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound)); + // If this page has height of 2 then its children are leaf nodes + futureChildren.push_back(self->commitSubtree(self, mutationBuffer, snapshot, pageID, page->height == 2, &childLowerBound, &childUpperBound, &decodeChildLowerBound, &decodeChildUpperBound)); } // Waiting one at a time makes debugging easier @@ -3506,6 +3587,7 @@ private: // TODO: Either handle multi-versioned results or change commitSubtree interface to return a single child set. 
ASSERT(self->singleVersion); + writeVersion = self->getLastCommittedVersion() + 1; cursor.moveFirst(); // All of the things added to pageBuilder will exist in the arenas inside futureChildren or will be upperBound InternalPageBuilder pageBuilder(cursor); @@ -3525,8 +3607,8 @@ private: if(pageBuilder.modified) { // If the page now has no children if(pageBuilder.childPageCount == 0) { - self->freeBtreePage(rootID, writeVersion); - debug_printf("%s All internal page children were deleted #1 so deleting this page too, returning %s\n", context.c_str(), toString(results).c_str()); + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{writeVersion, rootID}); + debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(results).c_str()); return results; } else { @@ -3535,7 +3617,6 @@ private: ASSERT(pageBuilder.lastUpperBound == *upperBound); - writeVersion = self->getLastCommittedVersion() + 1; Standalone> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, false, lowerBound, upperBound, pageBuilder.entries, 0, page->height, writeVersion, rootID))); results.arena().dependsOn(childEntries.arena()); @@ -3582,7 +3663,7 @@ private: state Standalone rootPageID = self->m_header.root.get(); state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); - Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); + Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); debug_printf("CommitSubtree(root %s) returned %s\n", toString(rootPageID).c_str(), toString(versionedRoots).c_str()); // CommitSubtree on the root can only return 1 child at most because the pager interface only supports writing From 656eacc9653aaab1d6d8457167324fbba1f4f48b Mon Sep 
17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 1 Oct 2019 14:31:48 -0700 Subject: [PATCH 031/184] Increase the default client shard location cache size by a factor of 2. --- fdbclient/Knobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbclient/Knobs.cpp b/fdbclient/Knobs.cpp index d9777a1f1e..18a8c9ac45 100644 --- a/fdbclient/Knobs.cpp +++ b/fdbclient/Knobs.cpp @@ -69,7 +69,7 @@ ClientKnobs::ClientKnobs(bool randomize) { init( GRV_BATCH_TIMEOUT, 0.005 ); if( randomize && BUGGIFY ) GRV_BATCH_TIMEOUT = 0.1; init( BROADCAST_BATCH_SIZE, 20 ); if( randomize && BUGGIFY ) BROADCAST_BATCH_SIZE = 1; - init( LOCATION_CACHE_EVICTION_SIZE, 300000 ); + init( LOCATION_CACHE_EVICTION_SIZE, 600000 ); init( LOCATION_CACHE_EVICTION_SIZE_SIM, 10 ); if( randomize && BUGGIFY ) LOCATION_CACHE_EVICTION_SIZE_SIM = 3; init( GET_RANGE_SHARD_LIMIT, 2 ); From 2f7c0bf43a0a5c61315af106cb3126f405ea067b Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Tue, 1 Oct 2019 14:35:04 -0700 Subject: [PATCH 032/184] Add release note --- documentation/sphinx/source/release-notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index e2858be844..b2d130bc67 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -13,13 +13,14 @@ Fixes Status ------ -* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). `(PR #2058) `_. +* Replaced ``cluster.database_locked`` status field with ``cluster.database_lock_state``, which contains two subfields: ``locked`` (boolean) and ``lock_uid`` (which contains the database lock uid if the database is locked). 
`(PR #2058) `_ Bindings -------- Other Changes ------------- +* Double the number of shard locations that the client will cache locally. `(PR #2198) `_ Earlier release notes --------------------- From fa357ef1ca3985c268b505e865824694427c1b62 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 2 Oct 2019 06:43:11 -0700 Subject: [PATCH 033/184] Bug fixes. COWPager's page cache was being initialized too late in recovery, after it had already been used. Cursor's KeyValueRef memory sometimes pointed to freed memory from an InternalCursor that had been moved. Added valgrind macros to avoid false positives. --- fdbserver/VersionedBTree.actor.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 3f2bed1ab6..2047ccd9e4 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -340,6 +340,8 @@ public: void writePage() { debug_printf("FIFOQueue(%s): write page id=%u\n", queue->name.c_str(), pageID); + VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); + VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); queue->pager->updatePage(pageID, page); } @@ -351,13 +353,13 @@ public: Future writeTail(const T &item) { ASSERT(loading.isReady()); - debug_printf("FIFOQueue(%s): writeTail(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); auto p = raw(); int bytesNeeded = Codec::bytesNeeded(item); if(offset + bytesNeeded > queue->dataBytesPerPage) { newTailPage(); return waitThenWriteTail(this, item); } + debug_printf("FIFOQueue(%s): writeTail(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); Codec::writeToBytes(p->begin() + offset, item); ++queue->numEntries; offset += bytesNeeded; @@ -373,7 +375,6 @@ public: Future writeHead(const T &item) { ASSERT(loading.isReady()); - debug_printf("FIFOQueue(%s): writeHead(%s) to %u:%d\n", 
queue->name.c_str(), toString(item).c_str(), pageID, offset); int bytesNeeded = Codec::bytesNeeded(item); if(offset < bytesNeeded) { newHeadPage(); @@ -381,6 +382,7 @@ public: } offset -= bytesNeeded; auto p = raw(); + debug_printf("FIFOQueue(%s): writeHead(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); Codec::writeToBytes(p->begin() + offset, item); ++queue->numEntries; return Void(); @@ -771,6 +773,11 @@ public: cache.clear(); } + int count() const { + ASSERT(evictionOrder.size() == cache.size()); + return evictionOrder.size(); + } + private: struct Entry : public boost::intrusive::list_base_hook<> { IndexType index; @@ -778,10 +785,10 @@ private: }; int sizeLimit; - boost::intrusive::list evictionOrder; // TODO: Use boost intrusive unordered set instead, with a comparator that only considers entry.index std::unordered_map cache; + boost::intrusive::list evictionOrder; }; ACTOR template Future forwardError(Future f, Promise target) { @@ -837,6 +844,8 @@ public: if(pHeader != nullptr) { pHeader->pageSize = logicalPageSize; } + ASSERT(pageCache.count() == 0); + pageCache = PageCacheT(pageCacheBytes / physicalPageSize); } void updateCommittedHeader() { @@ -972,8 +981,6 @@ public: wait(self->commit()); } - self->pageCache = PageCacheT(self->pageCacheBytes / self->physicalPageSize); - debug_printf("COWPager(%s) recovered. 
committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); return Void(); } @@ -3059,7 +3066,7 @@ private: ASSERT(blockCount > 1); int size = blockSize * blockCount; btPage = (BTreePage *)new uint8_t[size]; - VALGRIND_MAKE_MEM_DEFINED(btPageMem, size); + VALGRIND_MAKE_MEM_DEFINED(btPage, size); } btPage->formatVersion = BTreePage::FORMAT_VERSION; @@ -3155,7 +3162,7 @@ private: while(records.size() > 1) { self->m_header.height = ++height; Standalone> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, 0, height, version, BTreePageID())); - debug_printf_always("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); + debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); records = newRecords; } @@ -3242,7 +3249,7 @@ private: debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); // Nothing should attempt to read bytes in the page outside the BTreePage structure - VALGRIND_MAKE_MEM_UNDEFINED(result->begin() + pTreePage->size(), result->size() - pTreePage->size()); + VALGRIND_MAKE_MEM_UNDEFINED(page->begin() + pTreePage->size(), page->size() - pTreePage->size()); return page; } @@ -4195,6 +4202,7 @@ private: self->m_arena = Arena(); const RedwoodRecordRef &rec = self->m_cur1.get(); + self->m_kv.reset(); debug_printf("readFullKVPair: Starting at %s\n", self->toString().c_str()); // Unsplit value, cur1 will hold the key and value memory From 69fe02933da6511f595568688e08f684f5613861 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 9 Oct 2019 12:59:01 -0700 Subject: [PATCH 034/184] Replace /flow/delayOrdering with /flow/buggifiedDelay Seems that we don't want the property that delays become ready in order to hold, so make sure it doesn't hold 
in the simulator. --- fdbrpc/FlowTests.actor.cpp | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/fdbrpc/FlowTests.actor.cpp b/fdbrpc/FlowTests.actor.cpp index 7317c81ff0..2939f96b6f 100644 --- a/fdbrpc/FlowTests.actor.cpp +++ b/fdbrpc/FlowTests.actor.cpp @@ -50,24 +50,27 @@ TEST_CASE("/flow/actorcompiler/lineNumbers") { return Void(); } -TEST_CASE("/flow/delayOrdering") { - state double x = deterministicRandom()->random01(); - state double y = deterministicRandom()->random01(); - if (BUGGIFY) { - y = x; +TEST_CASE("/flow/buggifiedDelay") { + if (FLOW_KNOBS->MAX_BUGGIFIED_DELAY == 0) { + return Void(); + } + loop { + state double x = deterministicRandom()->random01(); + state int last = 0; + state Future f1 = map(delay(x), [last = &last](const Void&) { + *last = 1; + return Void(); + }); + state Future f2 = map(delay(x), [last = &last](const Void&) { + *last = 2; + return Void(); + }); + wait(f1 && f2); + if (last == 1) { + TEST(true); // Delays can become ready out of order + return Void(); + } } - state int last = 0; - state Future f1 = map(delay(x), [last = &last](const Void&) { - *last = 1; - return Void(); - }); - state Future f2 = map(delay(y), [last = &last](const Void&) { - *last = 2; - return Void(); - }); - wait(f1 && f2); - ASSERT((x <= y) == (last == 2)); - return Void(); } template From 0489f81c109853c9569009d504008ffa0e79f8f9 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 10 Oct 2019 11:49:07 -0700 Subject: [PATCH 035/184] Initial commit to modify machine attrition to work outside simulation --- .../workloads/MachineAttrition.actor.cpp | 92 +++++++++++++++++-- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 9fd9245971..243c28c16d 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -24,6 +24,7 @@ #include 
"fdbserver/workloads/workloads.actor.h" #include "fdbrpc/simulator.h" #include "fdbclient/ManagementAPI.actor.h" +#include "ClusterRecruitmentInterface.h" #include "flow/actorcompiler.h" // This must be the last #include. static std::set const& normalAttritionErrors() { @@ -59,7 +60,7 @@ ACTOR Future ignoreSSFailuresForDuration(Database cx, double duration) { struct MachineAttritionWorkload : TestWorkload { bool enabled; int machinesToKill, machinesToLeave; - double testDuration; + double testDuration, suspendDuration; bool reboot; bool killDc; bool killSelf; @@ -78,6 +79,7 @@ struct MachineAttritionWorkload : TestWorkload { machinesToKill = getOption( options, LiteralStringRef("machinesToKill"), 2 ); machinesToLeave = getOption( options, LiteralStringRef("machinesToLeave"), 1 ); testDuration = getOption( options, LiteralStringRef("testDuration"), 10.0 ); + suspendDuration = getOption( options, LiteralStringRef("suspendDuration"), 1.0 ); reboot = getOption( options, LiteralStringRef("reboot"), false ); killDc = getOption( options, LiteralStringRef("killDc"), deterministicRandom()->random01() < 0.25 ); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); @@ -124,6 +126,12 @@ struct MachineAttritionWorkload : TestWorkload { reportErrorsExcept( machineKillWorker( this, meanDelay, cx ), "machineKillWorkerError", UID(), &normalAttritionErrors()), testDuration, Void() ); } + if (!clientId && !g_network->isSimulated()) { + double meanDelay = testDuration / machinesToKill; + return timeout(reportErrorsExcept(noSimMachineKillWorker(this, meanDelay, cx), + "noSimMachineKillWorkerError", UID(), &normalAttritionErrors()), + testDuration, Void()); + } if(killSelf) throw please_reboot(); return Void(); @@ -132,17 +140,84 @@ struct MachineAttritionWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } - struct UIDPredicate { - UIDPredicate(StringRef uid ) : uid( uid ) {} - bool operator() ( WorkerInterface rhs ) { return rhs.locality.zoneId() != uid; 
} - private: - StringRef uid; - }; + ACTOR static Future noSimMachineKillWorker(MachineAttritionWorkload *self, double meanDelay, Database cx) { + ASSERT(!g_network->isSimulated()); + state int killedMachines = 0; + state double delayBeforeKill = deterministicRandom()->random01() * meanDelay; + state std::vector workers = + wait(self->dbInfo->get().clusterInterface.getWorkers.getReply(GetWorkersRequest())); + deterministicRandom()->randomShuffle(workers); + // Can reuse reboot request to send to each interface since no reply promise needed + state RebootRequest rbReq; + if (self->reboot) { + rbReq.waitForDuration = self->suspendDuration; + } else { + rbReq.waitForDuration = std::numeric_limits::max(); + } + if (self->killDc) { + wait(delay(delayBeforeKill)); + // Pick a dcId to kill + while (workers.back().processClass == ProcessClass::ClassType::TesterClass) { + deterministicRandom()->randomShuffle(workers); + } + Optional> killDcId = workers.back().interf.locality.dcId(); + TraceEvent("Assassination").detail("TargetDataCenter", killDcId); + for (const auto& worker : workers) { + // kill all matching dcId workers, except testers + if (worker.interf.locality.dcId() == killDcId && + worker.processClass == ProcessClass::ClassType::TesterClass) { + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else { + while (killedMachines < self->machinesToKill && workers.size() > self->machinesToLeave) { + TraceEvent("WorkerKillBegin") + .detail("KilledMachines", killedMachines) + .detail("MachinesToKill", self->machinesToKill) + .detail("MachinesToLeave", self->machinesToLeave) + .detail("Machines", workers.size()); + wait(delay(delayBeforeKill)); + TraceEvent("WorkerKillAfterDelay").detail("Delay", delayBeforeKill); + if (self->waitForVersion) { + state Transaction tr(cx); + loop { + try { + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); + wait(success(tr.getReadVersion())); + break; + } catch 
(Error& e) { + wait(tr.onError(e)); + } + } + } + // Pick a machine to kill, ignoring testers + state WorkerDetails targetMachine; + while (workers.back().processClass == ProcessClass::ClassType::TesterClass) { + deterministicRandom()->randomShuffle(workers); + } + targetMachine = workers.back(); + TraceEvent("Assassination") + .detail("TargetMachine", targetMachine.interf.locality.toString()) + .detail("ZoneId", targetMachine.interf.locality.zoneId()) + .detail("KilledMachines", killedMachines) + .detail("MachinesToKill", self->machinesToKill) + .detail("MachinesToLeave", self->machinesToLeave) + .detail("Machines", self->machines.size()); + targetMachine.interf.clientInterface.reboot.send(rbReq); + killedMachines++; + workers.pop_back(); + wait(delay(meanDelay - delayBeforeKill)); + delayBeforeKill = deterministicRandom()->random01() * meanDelay; + TraceEvent("WorkerKillAfterMeanDelay").detail("DelayBeforeKill", delayBeforeKill); + } + } + return Void(); + } ACTOR static Future machineKillWorker( MachineAttritionWorkload *self, double meanDelay, Database cx ) { state int killedMachines = 0; state double delayBeforeKill = deterministicRandom()->random01() * meanDelay; - state std::set killedUIDs; ASSERT( g_network->isSimulated() ); @@ -196,7 +271,6 @@ struct MachineAttritionWorkload : TestWorkload { TEST(true); //Marked a zone for maintenance before killing it bool _ = wait(setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)); - // } } else if (BUGGIFY_WITH_PROB(0.005)) { TEST(true); // Disable DD for all storage server failures self->ignoreSSFailures = From 373ac3026ffabe3479a7c7c462b98ce1b37698f0 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 14 Oct 2019 15:03:04 -0700 Subject: [PATCH 036/184] update check for dcId --- fdbserver/workloads/MachineAttrition.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp 
b/fdbserver/workloads/MachineAttrition.actor.cpp index 243c28c16d..685fe181f1 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -164,8 +164,8 @@ struct MachineAttritionWorkload : TestWorkload { TraceEvent("Assassination").detail("TargetDataCenter", killDcId); for (const auto& worker : workers) { // kill all matching dcId workers, except testers - if (worker.interf.locality.dcId() == killDcId && - worker.processClass == ProcessClass::ClassType::TesterClass) { + if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId && + worker.processClass != ProcessClass::ClassType::TesterClass) { worker.interf.clientInterface.reboot.send(rbReq); } } From c3e2bde987a235aee02d668af910dc454c68d0c0 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 15 Oct 2019 03:10:50 -0700 Subject: [PATCH 037/184] Deferred subtree clears and expiring/reusing old pages is complete. Many bug fixes involving scheduled page freeing, page list queue flushing, and expiring old snapshots (this was mostly written but not used yet). Rewrote most of FIFOQueue (again) to more cleanly handle queue cyclical dependencies caused by having queues that use a pager which in tern uses the same queues for managing page freeing and allocation. Many debug output improvements, including making BTreePageIDs and LogicalPageIDs stringify the same way everywhere to make following a PageID easier. 
--- fdbserver/IPager.h | 2 +- fdbserver/VersionedBTree.actor.cpp | 1090 ++++++++++++++-------------- 2 files changed, 560 insertions(+), 532 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index e2805770f9..508c90cf9b 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -205,7 +205,7 @@ public: virtual void setMetaKey(KeyRef metaKey) = 0; // Sets the next commit version - virtual void setVersion(Version v) = 0; + virtual void setCommitVersion(Version v) = 0; virtual StorageBytes getStorageBytes() = 0; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 2047ccd9e4..e37c44f436 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -50,7 +50,10 @@ std::string toString(const T &o) { } std::string toString(LogicalPageID id) { - return format("%" PRId64, id); + if(id == invalidLogicalPageID) { + return "LogicalPageID{invalid}"; + } + return format("LogicalPageID{%" PRId64 "}", id); } template @@ -87,23 +90,35 @@ std::string toString(const VectorRef &v) { return toString(v.begin(), v.end()); } +template +std::string toString(const Optional &o) { + if(o.present()) { + return toString(o.get()); + } + return ""; +} + // A FIFO queue of T stored as a linked list of pages. -// Operations are pop(), pushBack(), and pushFront(), and flush(). -// Flush() will ensure all queue pages are written to the pager. +// Main operations are pop(), pushBack(), pushFront(), and flush(). +// +// flush() will ensure all queue pages are written to the pager and move the unflushed +// pushFront()'d records onto the front of the queue, in FIFO order. +// // pop() will only return records that have been flushed, and pops // from the front of the queue. // -// Each page contains some number of T items and a link to the next page. +// Each page contains some number of T items and a link to the next page and starting position on that page. 
// When the queue is flushed, the last page in the chain is ended and linked to a newly allocated // but not-yet-written-to pageID, which future writes after the flush will write to. // Items pushed onto the front of the queue are written to a separate linked list until flushed, // at which point that list becomes the new front of the queue. // -// Committing changes to a queue involves flushing the queue, calling fsync, and then -// writing the QueueState which flush() returns somewhere and making it durable. -// -// The write pattern is designed such that no written/updated yet not fsync'd page is ever -// expected to be valid. +// The write pattern is designed such that no page is ever expected to be valid after +// being written to or updated but not fsync'd. This is why a new unused page is added +// to the queue, linked to by the last data page, before commit. The new page can't be +// added and filled with data as part of the next commit because that would mean modifying +// the previous tail page to update its next link, which risks corrupting it and losing +// data that was not yet popped if that write is never fsync'd. 
// // Requirements on T // - must be trivially copyable @@ -115,10 +130,7 @@ std::string toString(const VectorRef &v) { // int bytesNeeded() const; // // Serialize *this to dst, return number of bytes written to dst // int writeToBytes(uint8_t *dst) const; -// - must be supported by toString(object) by either a toString specialization -// OR implement the toString method: -// std::string toString() const; - +// - must be supported by toString(object) (see above) template struct FIFOQueueCodec { static T readFromBytes(const uint8_t *src, int &bytesRead) { @@ -155,6 +167,9 @@ class FIFOQueue { public: #pragma pack(push, 1) struct QueueState { + bool operator==(const QueueState &rhs) const { + return memcmp(this, &rhs, sizeof(QueueState)) == 0; + } LogicalPageID headPageID = invalidLogicalPageID; LogicalPageID tailPageID = invalidLogicalPageID; uint16_t headOffset; @@ -162,104 +177,83 @@ public: int64_t numPages; int64_t numEntries; std::string toString() const { - return format("head: page %u offset %d tail: page %u numPages: %" PRId64 " numEntries: %" PRId64 "\n", headPageID, (int)headOffset, tailPageID, numPages, numEntries); + return format("{head: %s:%d tail: %s numPages: %" PRId64 " numEntries: %" PRId64 "}", ::toString(headPageID).c_str(), (int)headOffset, ::toString(tailPageID).c_str(), numPages, numEntries); } }; #pragma pack(pop) struct Cursor { - // These can change when loading transitions from not ready to ready + enum Mode { + NONE, + READ, + WRITE + }; + + // The current page being read or written to LogicalPageID pageID; + + // The first page ID to be written to the pager, if this cursor has written anything + LogicalPageID firstPageIDWritten; + + // Offset after RawPage header to next read from or write to int offset; - Reference page; - FIFOQueue *queue; - Future loading; - - // Cursor will not read this page or anything beyond it. 
+ // A read cursor will not read this page (or beyond) LogicalPageID endPageID; - Cursor() : queue(nullptr), pageID(invalidLogicalPageID), endPageID(invalidLogicalPageID) { + Reference page; + FIFOQueue *queue; + Future operation; + Mode mode; + + uint32_t debug_id; + + Cursor() : mode(NONE) { + debug_id = deterministicRandom()->randomUInt32(); } - Cursor(const Cursor &c) = delete; + // Initialize a cursor. Since cursors can have async operations pending they can't be copied cleanly. + void init(FIFOQueue *q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { + if(operation.isValid()) { + operation.cancel(); + } + queue = q; + mode = m; + firstPageIDWritten = invalidLogicalPageID; + offset = readOffset; + endPageID = endPage; + page.clear(); + + if(mode == READ) { + // If cursor is not pointed at the end page then start loading it. + // The end page will not have been written to disk yet. + pageID = initialPageID; + operation = (pageID == endPageID) ? 
Void() : loadPage(); + } + else { + pageID = invalidLogicalPageID; + ASSERT(mode == WRITE || (initialPageID == invalidLogicalPageID && readOffset == 0 && endPage == invalidLogicalPageID)); + operation = Void(); + } + + debug_printf("FIFOQueue::Cursor initialized: %s\n", toString().c_str()); + + if(mode == WRITE && initialPageID != invalidLogicalPageID) { + newPage(initialPageID); + } + } + + Cursor(const Cursor &other) = delete; ~Cursor() { - loading.cancel(); + operation.cancel(); } - Cursor & operator=(const Cursor &c) { - ASSERT(c.notLoading()); - pageID = c.pageID; - offset = c.offset; - page = c.page; - queue = c.queue; - endPageID = c.endPageID; - loading = Void(); - return *this; - } - - void setEnd(Cursor &end) { - endPageID = end.pageID; - } - - // Initializes a cursor that will write to new pages in the forward direction starting from newPageID - void initWriteTail(FIFOQueue *q, LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): New writeTail queue cursor at page id=%u\n", q->name.c_str(), newPageID); - queue = q; - initNewTailPage(newPageID); - loading = Void(); - } - - // Initializes a cursor that will write to new pages in the reverse direction, allocating pages as needed. - void initWriteHead(FIFOQueue *q) { - debug_printf("FIFOQueue(%s): New writeHead queue cursor\n", q->name.c_str()); - queue = q; - // Initially the page is invalid and the index is 0 - initNewHeadPage(invalidLogicalPageID); - offset = 0; - loading = Void(); - } - - // Initializes a cursor that will read in the forward direction starting from pageID p, index i up to but not touching pageID end - void initRead(FIFOQueue *q, LogicalPageID p, int o, LogicalPageID end) { - debug_printf("FIFOQueue(%s): New read queue cursor at page id=%u offset=%d end page id=%u\n", q->name.c_str(), p, o, end); - queue = q; - pageID = p; - offset = o; - endPageID = end; - - // If cursor is not pointed at the end page then start loading it. - // The end page will not have been written to disk yet. 
- loading = (p == endPageID) ? Future() : loadPage(); - } - - void initNewTailPage(LogicalPageID newPageID) { - pageID = newPageID; - offset = 0; - page = queue->pager->newPageBuffer(); - setNext(0, 0); - auto p = raw(); - p->formatVersion = RawPage::FORMAT_VERSION; - p->endOffset = 0; - } - - void initNewHeadPage(LogicalPageID newPageID) { - page = queue->pager->newPageBuffer(); - setNext(pageID, offset); - auto p = raw(); - p->formatVersion = RawPage::FORMAT_VERSION; - pageID = newPageID; - offset = queue->dataBytesPerPage; - p->endOffset = offset; - } - - Future onNotLoading() const { - return loading.isValid() ? loading : Void(); - } - - bool notLoading() const { - return !loading.isValid() || loading.isReady(); + std::string toString() const { + if(mode == NONE) { + return format("{cursor=%x queue=n/a}", debug_id); + } + return format("{cursor=%x queue=%s mode=%d pos=%s:%d endOffset=%d endPage=%s}", debug_id, queue ? queue->name.c_str() : "null", mode, ::toString(pageID).c_str(), offset, page ? 
raw()->endOffset : -1, ::toString(endPageID).c_str()); } #pragma pack(push, 1) @@ -275,22 +269,29 @@ public: }; #pragma pack(pop) + Future notBusy() { + return operation; + } + + // Returns true if any items have been written to the last page + bool pendingWrites() const { + return mode == WRITE && offset != 0; + } + RawPage * raw() const { return ((RawPage *)(page->begin())); } void setNext(LogicalPageID pageID, int offset) { + ASSERT(mode == WRITE); RawPage *p = raw(); p->nextPageID = pageID; p->nextOffset = offset; } - void setNext(const Cursor &cursor) { - setNext(cursor.pageID, cursor.offset); - } - Future loadPage() { - debug_printf("FIFOQueue(%s): loading page id=%u offset=%d\n", queue->name.c_str(), pageID, offset); + ASSERT(mode == READ); + debug_printf("FIFOQueue::Cursor loading %s\n", toString().c_str()); return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); @@ -298,152 +299,141 @@ public: }); } - // Allocate a new next page for the cursor's old page to link to, write the old page, then point the cursor at the new page. - Future newTailPage() { - ASSERT(page); - ASSERT(loading.isReady()); - - loading = map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): new tail page id=%u\n", queue->name.c_str(), newPageID); - setNext(newPageID, 0); - writePage(); - ++queue->numPages; - initNewTailPage(newPageID); - return Void(); - }); - - return loading; - } - - // Allocate a new previous page which links to the cursor's old page, write the old page if first is false, and then point the cursor at the new page. 
- Future newHeadPage() { - ASSERT(page); - ASSERT(loading.isReady()); - - loading = map(queue->pager->newPageID(), [=](LogicalPageID newPageID) { - debug_printf("FIFOQueue(%s): new head page id=%u\n", queue->name.c_str(), newPageID); - // Write the page if it has a valid ID and a valid nextPageID - if(pageID != invalidLogicalPageID && raw()->nextPageID != invalidLogicalPageID) { - writePage(); - } - initNewHeadPage(newPageID); - ++queue->numPages; - return Void(); - }); - - return loading; - } - - bool empty() { - return raw()->endOffset == 0; - } - void writePage() { - debug_printf("FIFOQueue(%s): write page id=%u\n", queue->name.c_str(), pageID); + ASSERT(mode == WRITE); + debug_printf("FIFOQueue(%s) writing page %s\n", queue->name.c_str(), toString().c_str()); VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); queue->pager->updatePage(pageID, page); - } - - ACTOR static Future waitThenWriteTail(Cursor *self, T item) { - wait(self->loading); - wait(self->writeTail(item)); - return Void(); - } - - Future writeTail(const T &item) { - ASSERT(loading.isReady()); - auto p = raw(); - int bytesNeeded = Codec::bytesNeeded(item); - if(offset + bytesNeeded > queue->dataBytesPerPage) { - newTailPage(); - return waitThenWriteTail(this, item); + if(firstPageIDWritten == invalidLogicalPageID) { + firstPageIDWritten = pageID; } - debug_printf("FIFOQueue(%s): writeTail(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); - Codec::writeToBytes(p->begin() + offset, item); - ++queue->numEntries; - offset += bytesNeeded; - p->endOffset = offset; - return Void(); } - ACTOR static Future waitThenWriteHead(Cursor *self, T item) { - wait(self->loading); - wait(self->writeHead(item)); - return Void(); - } - - Future writeHead(const T &item) { - ASSERT(loading.isReady()); - int bytesNeeded = Codec::bytesNeeded(item); - if(offset < bytesNeeded) { - newHeadPage(); - 
return waitThenWriteHead(this, item); + ACTOR static Future newPage_impl(Cursor *self, Future previous, LogicalPageID newPageID, int newOffset, bool initializeNewPage) { + ASSERT(self->mode == WRITE); + wait(previous); + debug_printf("FIFOQueue::Cursor Adding page %s init=%d %s\n", ::toString(newPageID).c_str(), initializeNewPage, self->toString().c_str()); + ASSERT(self->mode == WRITE); + if(newPageID == invalidLogicalPageID) { + debug_printf("FIFOQueue::Cursor Allocating new page %s\n", self->toString().c_str()); + wait(store(newPageID, self->queue->pager->newPageID())); } - offset -= bytesNeeded; - auto p = raw(); - debug_printf("FIFOQueue(%s): writeHead(%s) to %u:%d\n", queue->name.c_str(), toString(item).c_str(), pageID, offset); - Codec::writeToBytes(p->begin() + offset, item); - ++queue->numEntries; + debug_printf("FIFOQueue::Cursor Adding page %s init=%d %s\n", ::toString(newPageID).c_str(), initializeNewPage, self->toString().c_str()); + + // Update existing page and write, if it exists + if(self->page) { + self->setNext(newPageID, newOffset); + debug_printf("FIFOQueue::Cursor Linked new page, writing %s\n", self->toString().c_str()); + self->writePage(); + } + + self->pageID = newPageID; + self->offset = newOffset; + + if(initializeNewPage) { + self->page = self->queue->pager->newPageBuffer(); + self->setNext(0, 0); + auto p = self->raw(); + p->formatVersion = RawPage::FORMAT_VERSION; + p->endOffset = 0; + ++self->queue->numPages; + } + + debug_printf("FIFOQueue::Cursor Added page %s\n", self->toString().c_str()); return Void(); } - ACTOR static Future> waitThenMoveNext(Cursor *self, Optional upperBound) { - wait(self->loading); - Optional result = wait(self->moveNext(upperBound)); + // Link the current page to newPageID:newOffset and then write it to the pager. + // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized + // as a new tail page. 
+ void newPage(LogicalPageID newPageID = invalidLogicalPageID, int newOffset = 0, bool initializeNewPage = true) { + operation = newPage_impl(this, operation, newPageID, newOffset, initializeNewPage); + } + + // Write item to the next position in the current page or, if it won't fit, add a new page and write it there. + ACTOR static Future write_impl(Cursor *self, Future previous, T item) { + ASSERT(self->mode == WRITE); + wait(previous); + state int bytesNeeded = Codec::bytesNeeded(item); + if(self->offset + bytesNeeded > self->queue->dataBytesPerPage) { + debug_printf("FIFOQueue::Cursor write(%s) page is full, adding new page %s\n", ::toString(item).c_str(), self->toString().c_str()); + wait(newPage_impl(self, Void(), invalidLogicalPageID, 0, true)); + wait(yield()); + } + debug_printf("FIFOQueue::Cursor write(%s) %s\n", ::toString(item).c_str(), self->toString().c_str()); + auto p = self->raw(); + Codec::writeToBytes(p->begin() + self->offset, item); + ++self->queue->numEntries; + self->offset += bytesNeeded; + p->endOffset = self->offset; + debug_printf("FIFOQueue::Cursor write(%s) finished, %s\n", ::toString(item).c_str(), self->toString().c_str()); + return Void(); + } + + void write(const T &item) { + operation = write_impl(this, operation, item); + } + + // Read the next item at the cursor, moving to a new page first if the current page is exhausted + ACTOR static Future> readNext_impl(Cursor *self, Future previous, Optional upperBound) { + ASSERT(self->mode == READ); + wait(previous); + + debug_printf("FIFOQueue::Cursor readNext begin %s\n", self->toString().c_str()); + if(self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { + debug_printf("FIFOQueue::Cursor readNext returning nothing %s\n", self->toString().c_str()); + return Optional(); + } + + // We now know we are pointing to PageID and it should be read and used, but it may not be loaded yet. 
+ if(!self->page) { + wait(self->loadPage()); + wait(yield()); + } + + debug_printf("FIFOQueue::Cursor readNext reading at current position %s\n", self->toString().c_str()); + auto p = self->raw(); + ASSERT(self->offset < p->endOffset); + int bytesRead; + T result = Codec::readFromBytes(p->begin() + self->offset, bytesRead); + + if(upperBound.present() && upperBound.get() < result) { + debug_printf("FIFOQueue(%s) not popping %s, exceeds upper bound %s %s\n", + self->queue->name.c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str(), self->toString().c_str()); + return Optional(); + } + + --self->queue->numEntries; + self->offset += bytesRead; + debug_printf("FIFOQueue::Cursor popped %s, %s\n", ::toString(result).c_str(), self->toString().c_str()); + ASSERT(self->offset <= p->endOffset); + + if(self->offset == p->endOffset) { + debug_printf("FIFOQueue::Cursor Page exhausted, %s\n", self->toString().c_str()); + --self->queue->numPages; + LogicalPageID oldPageID = self->pageID; + self->pageID = p->nextPageID; + self->offset = p->nextOffset; + self->page.clear(); + debug_printf("FIFOQueue::Cursor Page exhausted, moved to new page, %s\n", self->toString().c_str()); + + // Freeing the old page must happen after advancing the cursor and clearing the page reference because + // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this + // very same queue. + self->queue->pager->freePage(oldPageID, 0); + } + return result; } - // Read and moved past the next item if it is < upperBound - Future> moveNext(const Optional &upperBound = {}) { - // If loading is not valid then either the cursor is not initialized. - // It may have at one time pointed to a page not yet committed. 
- if(!loading.isValid()) { - // If the pageID isn't the endPageID then start loading the page - if(pageID != endPageID) { - debug_printf("FIFOQueue(%s) starting load of page id=%u which is no longer the end page id=%u\n", queue->name.c_str(), pageID, endPageID); - loading = loadPage(); - } - else { - // Otherwise we can't read anymore so return nothing - return Optional(); - } + Future> readNext(const Optional &upperBound = {}) { + if(mode == NONE) { + return Optional(); } - - // If loading is ready, read an item and move forward - if(loading.isReady()) { - auto p = raw(); - int bytesRead; - T result = Codec::readFromBytes(p->begin() + offset, bytesRead); - - if(upperBound.present() && upperBound.get() < result) { - debug_printf("FIFOQueue(%s) not popping %s from page id=%u offset=%d endOffset=%d - exceeds upper bound %s\n", - queue->name.c_str(), toString(result).c_str(), pageID, offset, p->endOffset, toString(upperBound.get()).c_str()); - return Optional(); - } - - debug_printf("FIFOQueue(%s) popped %s from page id=%u offset=%d endOffset=%d\n", queue->name.c_str(), toString(result).c_str(), pageID, offset, p->endOffset); - --queue->numEntries; - offset += bytesRead; - - // If this page is out of items, start reading the next one - if(offset == p->endOffset) { - LogicalPageID oldPageID = pageID; - pageID = p->nextPageID; - offset = p->nextOffset; - --queue->numPages; - debug_printf("FIFOQueue(%s) advancing to next page id=%u endPageID=%u\n", queue->name.c_str(), pageID, endPageID); - loading = (pageID == endPageID) ? Future() : loadPage(); - - // freePage() must be called after setting the loading future because freePage() might pop from this - // queue recursively if the pager's free list is being stored in this queue. 
- queue->pager->freePage(oldPageID, 0); - } - - return Optional(result); - } - - return waitThenMoveNext(this, upperBound); + Future> read = readNext_impl(this, operation, upperBound); + operation = success(read); + return read; } }; @@ -451,43 +441,48 @@ public: FIFOQueue() : pager(nullptr) { } + ~FIFOQueue() { + newTailPage.cancel(); + } + FIFOQueue(const FIFOQueue &other) = delete; void operator=(const FIFOQueue &rhs) = delete; // Create a new queue at newPageID void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { - debug_printf("FIFOQueue(%s): create from page id %u\n", queueName.c_str(), newPageID); + debug_printf("FIFOQueue(%s) create from page id %u\n", queueName.c_str(), newPageID); pager = p; name = queueName; numPages = 1; numEntries = 0; dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); - tailWriter.initWriteTail(this, newPageID); - headReader.initRead(this, newPageID, 0, newPageID); - ASSERT(flush().isReady()); + headReader.init(this, Cursor::READ, newPageID, 0, newPageID); + tailWriter.init(this, Cursor::WRITE, newPageID); + headWriter.init(this, Cursor::WRITE); + newTailPage = invalidLogicalPageID; + debug_printf("FIFOQueue(%s) created\n", queueName.c_str()); } // Load an existing queue from its queue state void recover(IPager2 *p, const QueueState &qs, std::string queueName) { - debug_printf("FIFOQueue(%s): recover from queue state %s\n", queueName.c_str(), qs.toString().c_str()); + debug_printf("FIFOQueue(%s) recover from queue state %s\n", queueName.c_str(), qs.toString().c_str()); pager = p; name = queueName; numPages = qs.numPages; numEntries = qs.numEntries; dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); - tailWriter.initWriteTail(this, qs.tailPageID); - headReader.initRead(this, qs.headPageID, qs.headOffset, qs.tailPageID); - ASSERT(flush().isReady()); + headReader.init(this, Cursor::READ, qs.headPageID, qs.headOffset, qs.tailPageID); + tailWriter.init(this, 
Cursor::WRITE, qs.tailPageID); + headWriter.init(this, Cursor::WRITE); + newTailPage = invalidLogicalPageID; + debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str()); } Future> pop(Optional upperBound = {}) { - return headReader.moveNext(upperBound); + return headReader.readNext(upperBound); } QueueState getState() const { - // It only makes sense to save queue state when the tail cursor points to a new empty page - ASSERT(tailWriter.offset == 0); - QueueState s; s.headOffset = headReader.offset; s.headPageID = headReader.pageID; @@ -495,136 +490,116 @@ public: s.numEntries = numEntries; s.numPages = numPages; - debug_printf("FIFOQueue(%s): getState(): %s\n", name.c_str(), s.toString().c_str()); + debug_printf("FIFOQueue(%s) getState(): %s\n", name.c_str(), s.toString().c_str()); return s; } - ACTOR static Future pushBackActor(FIFOQueue *self, FutureStream input) { - try { - loop { - state T item = waitNext(input); - wait(self->tailWriter.writeTail(item)); - } - } - catch(Error &e) { - if(e.code() != error_code_end_of_stream) { - throw; - } - } - - // Wait for the head cursor to be done loading because it might free a page, which would add to the - // free list queue, which might be this queue. - wait(self->headReader.onNotLoading()); - - // Wait for the final write to the queue to be finished, it may be waiting for a new pageID after - // filling a page to capacity. 
- wait(self->tailWriter.onNotLoading()); - - // If tail page is not empty, link it to a new unwritten/empty page - if(!self->tailWriter.empty()) { - wait(self->tailWriter.newTailPage()); - } - - // We should not reach here until the pushFrontActor has already finished - ASSERT(self->pushFrontFuture.isReady()); - ASSERT(self->headWriterFront.notLoading()); - ASSERT(self->headWriterBack.notLoading()); - - // If any new pages were pushed on the front of the queue, link the tail page of the new front pages - // to the current head and write the page, then update head to point to the head of the new front pages. - if(self->headWriterBack.pageID != invalidLogicalPageID) { - self->headWriterBack.setNext(self->headReader); - self->headWriterBack.writePage(); - self->headReader = self->headWriterFront; - } - - // After queue is flushed, head may read everything written so far (which will have been committed) - self->headReader.setEnd(self->tailWriter); - - return self->getState(); - } - - // Create pages to prepend to the front of the queue. - ACTOR static Future pushFrontActor(FIFOQueue *self, FutureStream input) { - self->headWriterFront.initWriteHead(self); - self->headWriterBack.initWriteHead(self); - - state bool first = true; - - try { - loop { - state T item = waitNext(input); - wait(self->headWriterFront.writeHead(item)); - if(first) { - self->headWriterBack = self->headWriterFront; - first = false; - } - } - } - catch(Error &e) { - if(e.code() != error_code_end_of_stream) { - throw; - } - } - - // If any items were written, then at least one page was written. 
- if(!first) { - // If the head is on a different page than the tail then write the head page - if(self->headWriterFront.pageID != self->headWriterBack.pageID) { - self->headWriterFront.writePage(); - } - } - - return Void(); - } - void pushBack(const T &item) { - debug_printf("FIFOQueue(%s): pushBack(%s)\n", name.c_str(), toString(item).c_str()); - pushBackQueue.send(item); + debug_printf("FIFOQueue(%s) pushBack(%s)\n", name.c_str(), toString(item).c_str()); + tailWriter.write(item); } void pushFront(const T &item) { - debug_printf("FIFOQueue(%s): pushFront(%s)\n", name.c_str(), toString(item).c_str()); - pushFrontQueue.send(item); + debug_printf("FIFOQueue(%s) pushFront(%s)\n", name.c_str(), toString(item).c_str()); + headWriter.write(item); } - // Flush changes to the pager and return the resulting queue state. - ACTOR static Future flush_impl(FIFOQueue *self) { - debug_printf("FIFOQueue(%s): flush\n", self->name.c_str()); - - // Signal head writer to flush and wait for it - // This must be done first in case this queue is the freelist itself, since - // flushing the head writer might require getting a new pageID. 
- if(self->pushFrontFuture.isValid()) { - debug_printf("FIFOQueue(%s): headWriter valid\n", self->name.c_str()); - self->pushFrontQueue.sendError(end_of_stream()); - wait(self->pushFrontFuture); - } - - state QueueState qstate; - - // Signal tail writer to flush and wait for it - if(self->pushBackFuture.isValid()) { - debug_printf("FIFOQueue(%s): tailWriter valid\n", self->name.c_str()); - self->pushBackQueue.sendError(end_of_stream()); - wait(store(qstate, self->pushBackFuture)); - } - else { - qstate = self->getState(); - } - - // Start new tail writer - self->pushBackQueue = PromiseStream(); - self->pushBackFuture = pushBackActor(self, self->pushBackQueue.getFuture()); - - // Start new head writer - self->pushFrontQueue = PromiseStream(); - self->pushFrontFuture = pushFrontActor(self, self->pushFrontQueue.getFuture()); - - return qstate; + // Wait until the most recently started operations on each cursor as of now are ready + Future notBusy() { + return headWriter.notBusy() && headReader.notBusy() && tailWriter.notBusy() && ready(newTailPage); } - Future flush() { + // Returns true if any most recently started operations on any cursors are not ready + bool busy() { + return !headWriter.notBusy().isReady() || !headReader.notBusy().isReady() || !tailWriter.notBusy().isReady() || !newTailPage.isReady(); + } + + // preFlush() prepares this queue to be flushed to disk, but doesn't actually do it so the queue can still + // be pushed and popped after this operation. It returns whether or not any operations were pending or + // started during execution. + // + // If one or more queues are used by their pager in newPageID() or freePage() operations, then preFlush() + // must be called on each of them inside a loop that runs until each of the preFlush() calls have returned + // false. 
+	//
+	// The reason for all this is that:
+	// - queue pop() can call pager->freePage() which can call push() on the same or another queue
+	// - queue push() can call pager->newPageID() which can call pop() on the same or another queue
+	// This creates a circular dependency with 1 or more queues when those queues are used by the pager
+	// to manage free page IDs.
+	ACTOR static Future<bool> preFlush_impl(FIFOQueue *self) {
+		debug_printf("FIFOQueue(%s) preFlush begin\n", self->name.c_str());
+		wait(self->notBusy());
+
+		// Completion of the pending operations as of the start of notBusy() could have begun new operations,
+		// so see if any work is pending now.
+		bool workPending = self->busy();
+
+		if(!workPending) {
+			// A newly created or flushed queue starts out in a state where its tail page to be written to is empty.
+			// After pushBack() is called, this is no longer the case and never will be again until the queue is flushed.
+			// Before the non-empty tail page is written it must be linked to a new empty page for use after the next
+			// flush. (This is explained more at the top of FIFOQueue but it is because queue pages can only be written
+			// once because once they contain durable data a second write to link to a new page could corrupt the existing
+			// data if the subsequent commit never succeeds.)
+			if(self->newTailPage.isReady() && self->newTailPage.get() == invalidLogicalPageID && self->tailWriter.pendingWrites()) {
+				self->newTailPage = self->pager->newPageID();
+				workPending = true;
+			}
+		}
+
+		debug_printf("FIFOQueue(%s) preFlush returning %d\n", self->name.c_str(), workPending);
+		return workPending;
+	}
+
+	Future<bool> preFlush() {
+		return preFlush_impl(this);
+	}
+
+	void finishFlush() {
+		debug_printf("FIFOQueue(%s) finishFlush start\n", name.c_str());
+		ASSERT(!busy());
+
+		// If a new tail page was allocated, link the last page of the tail writer to it. 
+ if(newTailPage.get() != invalidLogicalPageID) { + tailWriter.newPage(newTailPage.get(), 0, false); + + // newPage() should be ready immediately since a pageID is being explicitly passed. + ASSERT(tailWriter.notBusy().isReady()); + + newTailPage = invalidLogicalPageID; + } + + // If the headWriter wrote anything, link its tail page to the headReader position and point the headReader + // to the start of the headWriter + if(headWriter.pendingWrites()) { + headWriter.newPage(headReader.pageID, headReader.offset, false); + headReader.pageID = headWriter.firstPageIDWritten; + headReader.offset = 0; + } + + // Update headReader's end page to the new tail page + headReader.endPageID = tailWriter.pageID; + + // Reset the write cursors + tailWriter.init(this, Cursor::WRITE, tailWriter.pageID); + headWriter.init(this, Cursor::WRITE); + + debug_printf("FIFOQueue(%s) finishFlush end\n", name.c_str()); + } + + ACTOR static Future flush_impl(FIFOQueue *self) { + loop { + bool notDone = wait(self->preFlush()); + if(!notDone) { + break; + } + } + self->finishFlush(); + return Void(); + } + + Future flush() { return flush_impl(this); } @@ -633,21 +608,11 @@ public: int64_t numEntries; int dataBytesPerPage; - PromiseStream pushBackQueue; - PromiseStream pushFrontQueue; - Future pushBackFuture; - Future pushFrontFuture; - - // Head points to the next location to pop(). - // pop() will only return committed records. 
Cursor headReader; - // Tail points to the next location to pushBack() to Cursor tailWriter; + Cursor headWriter; - // These cursors point to the front and back of the queue block - // chain being created for items sent to pushFront() - Cursor headWriterFront; - Cursor headWriterBack; + Future newTailPage; // For debugging std::string name; @@ -819,7 +784,7 @@ public: } std::string toString() const { - return format("{page id=%u @%" PRId64 "}", pageID, version); + return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); } }; @@ -827,7 +792,9 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default - COWPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { + COWPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) + : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) + { if(pageCacheBytes == 0) { pageCacheBytes = g_network->isSimulated() ? (BUGGIFY ? 
FLOW_KNOBS->BUGGIFY_SIM_PAGE_CACHE_4K : FLOW_KNOBS->SIM_PAGE_CACHE_4K) : FLOW_KNOBS->PAGE_CACHE_4K; } @@ -954,6 +921,7 @@ public: // Write new header using desiredPageSize self->pHeader->formatVersion = Header::FORMAT_VERSION; self->pHeader->committedVersion = 1; + self->pHeader->oldestVersion = 1; // No meta key until a user sets one and commits self->pHeader->setMetaKey(Key()); @@ -963,8 +931,8 @@ public: self->pHeader->pageCount = 2; // Create a new free list - self->freeList.create(self, self->newPageID().get(), "FreeListNew"); - self->delayedFreeList.create(self, self->newPageID().get(), "delayedFreeListtNew"); + self->freeList.create(self, self->newPageID().get(), "FreeList"); + self->delayedFreeList.create(self, self->newPageID().get(), "delayedFreeList"); // The first commit() below will flush the queues and update the queue states in the header, // but since the queues will not be used between now and then their states will not change. @@ -996,42 +964,40 @@ public: } // Get a new, previously available page ID. The page will be considered in-use after the next commit - // regardless of whether or not it was written to. 
- Future newPageID() override { - Future> nextPageID = freeList.pop(); - if(nextPageID.isReady()) { - if(nextPageID.get().present()) { - debug_printf("COWPager(%s) new page id=%u from ready freelist\n", filename.c_str(), nextPageID.get().get()); - return nextPageID.get().get(); - } - LogicalPageID id = pHeader->pageCount; - ++pHeader->pageCount; - debug_printf("COWPager(%s) new page id=%u at end of file\n", filename.c_str(), id); - return id; + // regardless of whether or not it was written to, until it is returned to the pager via freePage() + ACTOR static Future newPageID_impl(COWPager *self) { + // First try the free list + Optional freePageID = wait(self->freeList.pop()); + if(freePageID.present()) { + debug_printf("COWPager(%s) newPageID() returned %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); + return freePageID.get(); } - Future f = map(nextPageID, [=](Optional nextPageID) { - if(nextPageID.present()) { - debug_printf("COWPager(%s) new page id=%u from freelist after wait\n", filename.c_str(), nextPageID.get()); - return nextPageID.get(); - } - LogicalPageID id = pHeader->pageCount; - ++pHeader->pageCount; - debug_printf("COWPager(%s) new page id=%u at end of file\n", filename.c_str(), id); - return id; - }); + Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->pLastCommittedHeader->oldestVersion, 0})); + if(delayedFreePageID.present()) { + debug_printf("COWPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); + return delayedFreePageID.get().pageID; + } - return forwardError(f, errorPromise); + // Lastly, grow the pager file by a page and return it. 
+ LogicalPageID id = self->pHeader->pageCount; + ++self->pHeader->pageCount; + debug_printf("COWPager(%s) new page, %s at end of file\n", self->filename.c_str(), toString(id).c_str()); + return id; }; + Future newPageID() override { + return forwardError(newPageID_impl(this), errorPromise); + } + Future writeHeaderPage(PhysicalPageID pageID, Reference page) { - debug_printf("COWPager(%s) header op=write id=%u\n", filename.c_str(), pageID); + debug_printf("COWPager(%s) header op=write %s\n", filename.c_str(), toString(pageID).c_str()); ((Page *)page.getPtr())->updateChecksum(pageID); return holdWhile(page, pageFile->write(page->begin(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); } Future writePhysicalPage(PhysicalPageID pageID, Reference page) { - debug_printf("COWPager(%s) op=write id=%u\n", filename.c_str(), pageID); + debug_printf("COWPager(%s) op=write %s\n", filename.c_str(), toString(pageID).c_str()); ((Page *)page.getPtr())->updateChecksum(pageID); return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, (int64_t)pageID * physicalPageSize)); } @@ -1039,7 +1005,7 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=write id=%u cached=%d reading=%d writing=%d\n", filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content in the cache entry when the write is launched, not when it is completed. 
@@ -1071,6 +1037,7 @@ public: } Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { + debug_printf("COWPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); // This pager does not support atomic update, so it always allocates and uses a new pageID Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); @@ -1083,11 +1050,13 @@ public: void freePage(LogicalPageID pageID, Version v) override { // If v is older than the oldest version still readable then mark pageID as free as of the next commit - if(v < oldestVersion.get()) { + if(v < pLastCommittedHeader->oldestVersion) { + debug_printf("COWPager(%s) op=freeNow %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); freeList.pushBack(pageID); } else { // Otherwise add it to the delayed free list + debug_printf("COWPager(%s) op=freeLater %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); delayedFreeList.pushBack({v, pageID}); } }; @@ -1098,7 +1067,7 @@ public: ACTOR static Future> readHeaderPage(COWPager *self, PhysicalPageID pageID) { state Reference page(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)); int readBytes = wait(self->pageFile->read(page->mutate(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); - debug_printf("COWPager(%s) header op=read_complete id=%u bytes=%d\n", self->filename.c_str(), pageID, readBytes); + debug_printf("COWPager(%s) header op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); ASSERT(readBytes == smallestPhysicalBlock); return page; } @@ -1106,11 +1075,11 @@ public: ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { state Reference page = self->newPageBuffer(); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); - debug_printf("COWPager(%s) op=read_complete id=%u bytes=%d\n", 
self->filename.c_str(), pageID, readBytes); + debug_printf("COWPager(%s) op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); ASSERT(readBytes == self->physicalPageSize); Page *p = (Page *)page.getPtr(); if(!p->verifyChecksum(pageID)) { - debug_printf("COWPager(%s) checksum failed id=%u\n", self->filename.c_str(), pageID); + debug_printf("COWPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); Error e = checksum_failed(); TraceEvent(SevError, "COWPagerChecksumFailed") .detail("Filename", self->filename.c_str()) @@ -1139,7 +1108,7 @@ public: } PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=read id=%u cached=%d reading=%d writing=%d\n", filename.c_str(), pageID, cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); if(!cacheEntry.page.isValid()) { cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); @@ -1153,53 +1122,42 @@ public: void addLatestSnapshot() override; void setOldestVersion(Version v) override { - oldestVersion.set(v); + ASSERT(v >= pHeader->oldestVersion); + ASSERT(v <= pHeader->committedVersion); + pHeader->oldestVersion = v; + expireSnapshots(v); }; Future getOldestVersion() override { return map(recoverFuture, [=](Void) { - return oldestVersion.get(); + return pLastCommittedHeader->oldestVersion; }); }; ACTOR static Future commit_impl(COWPager *self) { - // TODO: Remove this once the free list is in normal use - if(g_network->isSimulated()) { - state int addFront = 10 * deterministicRandom()->randomInt(0, 10); - state int addBack = 10 * deterministicRandom()->randomInt(0, 10); - state int remove = 10 * deterministicRandom()->randomInt(0, 20); - state int i; - - for(i = 0; i < addBack; ++i) { - LogicalPageID id = 
wait(self->newPageID());
-			self->freeList.pushBack(id);
-		}
-
-		for(i = 0; i < addFront; ++i) {
-			LogicalPageID id = wait(self->newPageID());
-			self->freeList.pushFront(id);
-		}
-
-		for(i = 0; i < remove; ++i) {
-			Optional<LogicalPageID> id = wait(self->freeList.pop());
-			if(!id.present()) {
-				break;
-			}
-		}
-	}
+		debug_printf("COWPager(%s) commit begin\n", self->filename.c_str());
 
 		// Write old committed header to Page 1
 		self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage));
 
-		// Flush the delayed free list queue to the pager and get the new queue state into the header
-		// This must be done before flushing the free list as it may free or allocate pages.
-		wait(store(self->pHeader->delayedFreeList, self->delayedFreeList.flush()));
+		// Flush the free list and delayed free list queues together as they are used by freePage() and newPageID()
+		loop {
+			state bool freeBusy = wait(self->freeList.preFlush());
+			state bool delayedFreeBusy = wait(self->delayedFreeList.preFlush());
+			if(!freeBusy && !delayedFreeBusy) {
+				break;
+			}
+		}
+		self->freeList.finishFlush();
+		self->delayedFreeList.finishFlush();
 
-		// Flush the free list queue to the pager and get the new queue state into the header
-		wait(store(self->pHeader->freeList, self->freeList.flush()));
+		self->pHeader->freeList = self->freeList.getState();
+		self->pHeader->delayedFreeList = self->delayedFreeList.getState();
 
 		// Wait for all outstanding writes to complete
+		debug_printf("COWPager(%s) waiting for outstanding writes\n", self->filename.c_str());
 		wait(self->operations.signalAndCollapse());
 
+		debug_printf("COWPager(%s) Syncing\n", self->filename.c_str());
 		// Sync everything except the header
 		wait(self->pageFile->sync());
@@ -1229,7 +1187,7 @@ public:
 		return pHeader->getMetaKey();
 	}
 
-	void setVersion(Version v) override {
+	void setCommitVersion(Version v) override {
 		pHeader->committedVersion = v;
 	}
 
@@ -1300,37 +1258,14 @@ private:
 	// Expire snapshots up to but not including v
 	void expireSnapshots(Version v) 
{ - while(snapshots.size() > 1 && snapshots.at(1).version <= v) { + debug_printf("COWPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); + while(snapshots.size() > 1 && snapshots.front().version < v) { + debug_printf("COWPager(%s) expiring snapshot for %" PRId64 "\n", filename.c_str(), snapshots.front().version); snapshots.front().expired.sendError(transaction_too_old()); snapshots.pop_front(); } } - ACTOR Future expireActor(COWPager *self) { - state DelayedFreePage upperBound; - - loop { - state Version v = self->oldestVersion.get(); - upperBound.version = v; - self->expireSnapshots(v); - - // Pop things from the delayed free queue until a version >= v is reached - loop { - Optional dfp = wait(self->delayedFreeList.pop(upperBound)); - - if(!dfp.present()) { - break; - } - - self->freeList.pushBack(dfp.get().pageID); - } - - if(self->oldestVersion.get() == v) { - wait(self->oldestVersion.onChange()); - } - } - } - #pragma pack(push, 1) // Header is the format of page 0 of the database struct Header { @@ -1341,6 +1276,7 @@ private: FIFOQueue::QueueState freeList; FIFOQueue::QueueState delayedFreeList; Version committedVersion; + Version oldestVersion; int32_t metaKeySize; KeyRef getMetaKey() const { @@ -1415,9 +1351,6 @@ private: SignalableActorCollection operations; Future recoverFuture; - // The oldest readable snapshot version - AsyncVar oldestVersion; - Reference pageFile; LogicalPageQueueT freeList; @@ -1618,6 +1551,10 @@ struct SplitStringRef { // NOTE: Uses host byte order typedef VectorRef BTreePageID; +std::string toString(BTreePageID id) { + return std::string("BTreePageID") + toString(id.begin(), id.end()); +} + #define STR(x) LiteralStringRef(x) struct RedwoodRecordRef { typedef uint8_t byte; @@ -2179,7 +2116,7 @@ struct RedwoodRecordRef { if(value.present()) { // Assume that values the size of a page ID are page IDs. It's not perfect but it's just for debugging. 
if(value.get().size() == sizeof(LogicalPageID)) { - r += format("[PageID=%s]", ::toString(getChildPage()).c_str()); + r += format("[%s]", ::toString(getChildPage()).c_str()); } else { r += format("'%s'", kvformat(value.get(), hexLimit).c_str()); @@ -2197,6 +2134,7 @@ struct BTreePage { enum EPageFlags { IS_LEAF = 1}; typedef DeltaTree BinaryTree; + typedef DeltaTree ValueTree; static constexpr int FORMAT_VERSION = 1; #pragma pack(push,1) @@ -2226,9 +2164,13 @@ struct BTreePage { return *(const BinaryTree *)(this + 1); } + const ValueTree & valueTree() const { + return *(const ValueTree *)(this + 1); + } + std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { std::string r; - r += format("BTreePage op=%s id=%s ver=%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", write ? 
"write" : "read", ::toString(id).c_str(), ver, this, (int)flags, (int)itemCount, (int)kvBytes, lowerBound->toString().c_str(), upperBound->toString().c_str()); try { @@ -2346,7 +2288,7 @@ public: Version version; Standalone pageID; - bool operator< (const LazyDeleteQueueEntry &rhs) { + bool operator< (const LazyDeleteQueueEntry &rhs) const { return version < rhs.version; } @@ -2371,7 +2313,7 @@ public: } std::string toString() const { - return format("{page id=%s @%" PRId64 "}", ::toString(pageID).c_str(), version); + return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); } }; @@ -2393,6 +2335,11 @@ public: memcpy(this, k.begin(), k.size()); ASSERT(formatVersion == FORMAT_VERSION); } + + std::string toString() { + return format("{height=%d formatVersion=%d root=%s lazyDeleteQueue=%s}", (int)height, (int)formatVersion, ::toString(root.get()).c_str(), lazyDeleteQueue.toString().c_str()); + } + }; #pragma pack(pop) @@ -2549,6 +2496,63 @@ public: m_latestCommit = m_init; } + ACTOR static Future incrementalLazyDelete(VersionedBTree *self, int minPages) { + // TODO: Is it contractually okay to always to read at the latest version? 
+		state Reference<IPagerSnapshot> snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion().get());
+		state int freedPages = 0;
+
+		loop {
+			// take a page from front of queue
+			state Optional<LazyDeleteQueueEntry> q = wait(self->m_lazyDeleteQueue.pop());
+			debug_printf("LazyDelete: popped %s\n", toString(q).c_str());
+			if(!q.present()) {
+				return Void();
+			}
+
+			// Read the page without caching
+			Reference<const IPage> p = wait(self->readPage(snapshot, q.get().pageID, nullptr, nullptr, true));
+			const BTreePage &btPage = *(BTreePage *)p->begin();
+
+			// Level 1 (leaf) nodes should never be in the lazy delete queue
+			ASSERT(btPage.height > 1);
+
+			// Iterate over page entries, skipping key decoding using BTreePage::ValueTree which uses
+			// RedwoodRecordRef::DeltaValueOnly as the delta type to skip key decoding
+			BTreePage::ValueTree::Reader reader(&btPage.valueTree(), &dbBegin, &dbEnd);
+			auto c = reader.getCursor();
+			ASSERT(c.moveFirst());
+			Version v = q.get().version;
+			while(1) {
+				if(c.get().value.present()) {
+					BTreePageID btChildPageID = c.get().getChildPage();
+					// If this page is height 2, then the children are leaves so free
+					if(btPage.height == 2) {
+						debug_printf("LazyDelete: freeing child %s\n", toString(btChildPageID).c_str());
+						self->freeBtreePage(btChildPageID, v);
+						freedPages += btChildPageID.size();
+					}
+					else {
+						// Otherwise, queue them for lazy delete. 
+ debug_printf("LazyDelete: queuing child %s\n", toString(btChildPageID).c_str()); + self->m_lazyDeleteQueue.pushFront(LazyDeleteQueueEntry{v, btChildPageID}); + } + } + if(!c.moveNext()) { + break; + } + } + + // Free the page, now that its children have either been freed or queued + debug_printf("LazyDelete: freeing queue entry %s\n", toString(q.get().pageID).c_str()); + self->freeBtreePage(q.get().pageID, v); + freedPages += q.get().pageID.size(); + + if(freedPages >= minPages) { + return Void(); + } + } + } + ACTOR static Future init_impl(VersionedBTree *self) { state Version latest = wait(self->m_pager->getLatestVersion()); debug_printf("Recovered pager to version %" PRId64 "\n", latest); @@ -2558,18 +2562,17 @@ public: self->m_header.formatVersion = MetaKey::FORMAT_VERSION; LogicalPageID id = wait(self->m_pager->newPageID()); BTreePageID newRoot((LogicalPageID *)&id, 1); - debug_printf("new root page id=%s\n", toString(newRoot).c_str()); + debug_printf("new root %s\n", toString(newRoot).c_str()); self->m_header.root.set(newRoot, sizeof(headerSpace) - sizeof(m_header)); self->m_header.height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); makeEmptyPage(page, BTreePage::IS_LEAF); self->m_pager->updatePage(id, page); - self->m_pager->setVersion(latest); + self->m_pager->setCommitVersion(latest); LogicalPageID newQueuePage = wait(self->m_pager->newPageID()); - debug_printf("new lazy delete queue page id=%u\n", newQueuePage); - self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueueNew"); + self->m_lazyDeleteQueue.create(self->m_pager, newQueuePage, "LazyDeleteQueue"); self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); self->m_pager->setMetaKey(self->m_header.asKeyRef()); wait(self->m_pager->commit()); @@ -2580,7 +2583,7 @@ public: self->m_lazyDeleteQueue.recover(self->m_pager, self->m_header.lazyDeleteQueue, "LazyDeleteQueueRecovered"); } - debug_printf("Recovered btree at version %" PRId64 " height=%d\n", 
latest); + debug_printf("Recovered btree at version %" PRId64 ": %s\n", latest, self->m_header.toString().c_str()); self->m_maxPartSize = std::min(255, self->m_pager->getUsablePageSize() / 5); self->m_lastCommittedVersion = latest; @@ -2661,7 +2664,7 @@ private: } std::string toString() const { - return format("{version=%" PRId64 " upperBound=%s children=%s}", version, ::toString(children).c_str(), upperBound.toString().c_str()); + return format("{version=%" PRId64 " children=%s upperbound=%s}", version, ::toString(children).c_str(), upperBound.toString().c_str()); } Version version; @@ -3128,7 +3131,7 @@ private: counts.extPageWrites += pages.size() - 1; } - debug_printf("Flushing page id=%s original=%s start=%d i=%d count=%d\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), toString(previousID).c_str(), start, i, i - start, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); + debug_printf("Flushing %s original=%s start=%d i=%d count=%d\nlower: %s\nupper: %s\n", toString(childPageID).c_str(), toString(previousID).c_str(), start, i, i - start, pageLowerBound.toString().c_str(), pageUpperBound.toString().c_str()); if(REDWOOD_DEBUG) { for(int j = start; j < i; ++j) { debug_printf(" %3d: %s\n", j, entries[j].toString().c_str()); @@ -3212,41 +3215,47 @@ private: int m_size; }; - ACTOR static Future> readPage(Reference snapshot, BTreePageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) { - debug_printf("readPage() op=read page id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); - wait(delay(0, TaskPriority::DiskRead)); - - std::vector>> reads; - - for(auto &pageID : id) { - reads.push_back(snapshot->getPhysicalPage(pageID, true)); - } - - ++counts.pageReads; - std::vector> pages = wait(getAll(reads)); - ASSERT(!pages.empty()); - - Reference page; - - if(pages.size() == 1) { - page = pages.front(); + ACTOR static 
Future> readPage(Reference snapshot, BTreePageID id, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, bool forLazyDelete = false) { + if(!forLazyDelete) { + debug_printf("readPage() op=read %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); } else { - counts.extPageReads += (pages.size() - 1); + debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); + } + + wait(delay(0, TaskPriority::DiskRead)); + + state Reference page; + + ++counts.pageReads; + if(id.size() == 1) { + wait(store(page, snapshot->getPhysicalPage(id.front(), !forLazyDelete))); + } + else { + ASSERT(!id.empty()); + counts.extPageReads += (id.size() - 1); + std::vector>> reads; + for(auto &pageID : id) { + reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete)); + } + std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. 
page = Reference(new SuperPage(pages)); } + debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); const BTreePage *pTreePage = (const BTreePage *)page->begin(); ASSERT(pTreePage->formatVersion == BTreePage::FORMAT_VERSION); - if(page->userData == nullptr) { - debug_printf("readPage() Creating Reader for page id=%s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); + if(!forLazyDelete && page->userData == nullptr) { + debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); page->userData = new BTreePage::BinaryTree::Reader(&pTreePage->tree(), lowerBound, upperBound); page->userDataDestructor = [](void *ptr) { delete (BTreePage::BinaryTree::Reader *)ptr; }; } - debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + if(!forLazyDelete) { + debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); + } // Nothing should attempt to read bytes in the page outside the BTreePage structure VALGRIND_MAKE_MEM_UNDEFINED(page->begin() + pTreePage->size(), page->size() - pTreePage->size()); @@ -3292,6 +3301,7 @@ private: // If the key is being mutated, them remove this subtree. if(iMutationBoundary == iMutationBoundaryEnd) { if(!iMutationBoundary->second.startKeyMutations.empty()) { + debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); Version firstKeyChangeVersion = self->singleVersion ? 
self->getLastCommittedVersion() + 1 : iMutationBoundary->second.startKeyMutations.begin()->first; if(isLeaf) { self->freeBtreePage(rootID, firstKeyChangeVersion); @@ -3299,7 +3309,6 @@ private: else { self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{firstKeyChangeVersion, rootID}); } - debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); return results; } @@ -3489,15 +3498,15 @@ private: // TODO: Make version and key splits based on contents of merged list, if keeping history + writeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : minVersion; // If everything in the page was deleted then this page should be deleted as of the new version // Note that if a single range clear covered the entire page then we should not get this far if(merged.empty() && !isRoot) { - self->freeBtreePage(rootID, writeVersion); debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(results).c_str()); + self->freeBtreePage(rootID, writeVersion); return results; } - writeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : minVersion; state Standalone> entries = wait(writePages(self, true, lowerBound, upperBound, merged, BTreePage::IS_LEAF, page->height, writeVersion, rootID)); results.arena().dependsOn(entries.arena()); results.push_back(results.arena(), VersionAndChildrenRef(writeVersion, entries, *upperBound)); @@ -3541,7 +3550,7 @@ private: const RedwoodRecordRef &childUpperBound = cursor.valid() ? 
cursor.get() : *upperBound; - debug_printf("%s recursing to PageID=%s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", + debug_printf("%s recursing to %s lower=%s upper=%s decodeLower=%s decodeUpper=%s\n", context.c_str(), toString(pageID).c_str(), childLowerBound.toString().c_str(), childUpperBound.toString().c_str(), decodeChildLowerBound.toString().c_str(), decodeChildUpperBound.toString().c_str()); /* @@ -3614,8 +3623,8 @@ private: if(pageBuilder.modified) { // If the page now has no children if(pageBuilder.childPageCount == 0) { - self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{writeVersion, rootID}); debug_printf("%s All internal page children were deleted so deleting this page too, returning %s\n", context.c_str(), toString(results).c_str()); + self->freeBtreePage(rootID, writeVersion); return results; } else { @@ -3658,7 +3667,13 @@ private: // Wait for the latest commit that started to be finished. wait(previousCommit); - debug_printf("%s: Beginning commit of version %" PRId64 "\n", self->m_name.c_str(), writeVersion); + + // Advance oldest version by a random number between 0 and the difference between the latest and oldest versions. 
+ Version newOldestVersion = self->m_pager->getOldestVersion().get() + deterministicRandom()->randomInt(0, self->m_pager->getLatestVersion().get() - self->m_pager->getOldestVersion().get() + 1); + self->m_pager->setOldestVersion(newOldestVersion); + debug_printf("%s: Beginning commit of version %" PRId64 ", oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, newOldestVersion); + + state Future lazyDelete = incrementalLazyDelete(self, 100); // Get the latest version from the pager, which is what we will read at state Version latestVersion = wait(self->m_pager->getLatestVersion()); @@ -3700,8 +3715,12 @@ private: self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); - self->m_pager->setVersion(writeVersion); - wait(store(self->m_header.lazyDeleteQueue, self->m_lazyDeleteQueue.flush())); + wait(lazyDelete); + + self->m_pager->setCommitVersion(writeVersion); + + wait(self->m_lazyDeleteQueue.flush()); + self->m_header.lazyDeleteQueue = self->m_lazyDeleteQueue.getState(); debug_printf("Setting metakey\n"); self->m_pager->setMetaKey(self->m_header.asKeyRef()); @@ -3773,7 +3792,7 @@ private: } std::string toString() const { - return format("PageID=%s, %s", ::toString(pageID).c_str(), cursor.valid() ? cursor.get().toString().c_str() : ""); + return format("%s, %s", ::toString(pageID).c_str(), cursor.valid() ? 
cursor.get().toString().c_str() : ""); } }; @@ -4695,7 +4714,7 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, break; } } catch(Error &e) { - if(e.code() != error_code_end_of_stream) { + if(e.code() != error_code_end_of_stream && e.code() != error_code_transaction_too_old) { throw; } } @@ -4704,25 +4723,34 @@ ACTOR Future verify(VersionedBTree *btree, FutureStream vStream, // Does a random range read, doesn't trap/report errors ACTOR Future randomReader(VersionedBTree *btree) { - state Reference cur; - loop { - wait(yield()); - if(!cur || deterministicRandom()->random01() > .1) { - Version v = btree->getLastCommittedVersion(); - if(!btree->isSingleVersion()) { - v = deterministicRandom()->randomInt(1, v + 1); - } - cur = btree->readAtVersion(v); - } - - state KeyValue kv = randomKV(10, 0); - wait(cur->findFirstEqualOrGreater(kv.key, true, 0)); - state int c = deterministicRandom()->randomInt(0, 100); - while(cur->isValid() && c-- > 0) { - wait(success(cur->next(true))); + try { + state Reference cur; + loop { wait(yield()); + if(!cur || deterministicRandom()->random01() > .1) { + Version v = btree->getLastCommittedVersion(); + if(!btree->isSingleVersion()) { + v = deterministicRandom()->randomInt(1, v + 1); + } + cur = btree->readAtVersion(v); + } + + state KeyValue kv = randomKV(10, 0); + wait(cur->findFirstEqualOrGreater(kv.key, true, 0)); + state int c = deterministicRandom()->randomInt(0, 100); + while(cur->isValid() && c-- > 0) { + wait(success(cur->next(true))); + wait(yield()); + } } } + catch(Error &e) { + if(e.code() != error_code_transaction_too_old) { + throw e; + } + } + + return Void(); } struct IntIntPair { @@ -5413,7 +5441,7 @@ TEST_CASE("!/redwood/correctness/btree") { // Recover from disk at random if(!serialTest && deterministicRandom()->random01() < coldStartProbability) { - printf("Recovering from disk.\n"); + printf("Recovering from disk after next commit.\n"); // Wait for outstanding commit debug_printf("Waiting for 
outstanding commit\n"); @@ -5428,7 +5456,7 @@ TEST_CASE("!/redwood/correctness/btree") { btree->close(); wait(closedFuture); - debug_printf("Reopening btree\n"); + printf("Reopening btree from disk.\n"); IPager2 *pager = new COWPager(pageSize, pagerFile, 0); btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); From 6b7317da9b70ee7a8c294d70648125122065193a Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 15 Oct 2019 03:36:22 -0700 Subject: [PATCH 038/184] Bug and clarity fixes to tracking FIFOQueue page and item count. --- fdbserver/VersionedBTree.actor.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index e37c44f436..eb6428dc68 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -318,6 +318,9 @@ public: if(newPageID == invalidLogicalPageID) { debug_printf("FIFOQueue::Cursor Allocating new page %s\n", self->toString().c_str()); wait(store(newPageID, self->queue->pager->newPageID())); + // numPages is only increased if the page is allocated here. 
+ // Callers who pass in a page are responsible for updating numPages when necessary (it isn't always necessary) + ++self->queue->numPages; } debug_printf("FIFOQueue::Cursor Adding page %s init=%d %s\n", ::toString(newPageID).c_str(), initializeNewPage, self->toString().c_str()); @@ -337,7 +340,6 @@ public: auto p = self->raw(); p->formatVersion = RawPage::FORMAT_VERSION; p->endOffset = 0; - ++self->queue->numPages; } debug_printf("FIFOQueue::Cursor Added page %s\n", self->toString().c_str()); @@ -364,9 +366,9 @@ public: debug_printf("FIFOQueue::Cursor write(%s) %s\n", ::toString(item).c_str(), self->toString().c_str()); auto p = self->raw(); Codec::writeToBytes(p->begin() + self->offset, item); - ++self->queue->numEntries; self->offset += bytesNeeded; p->endOffset = self->offset; + ++self->queue->numEntries; debug_printf("FIFOQueue::Cursor write(%s) finished, %s\n", ::toString(item).c_str(), self->toString().c_str()); return Void(); } @@ -404,17 +406,17 @@ public: return Optional(); } - --self->queue->numEntries; self->offset += bytesRead; + --self->queue->numEntries; debug_printf("FIFOQueue::Cursor popped %s, %s\n", ::toString(result).c_str(), self->toString().c_str()); ASSERT(self->offset <= p->endOffset); if(self->offset == p->endOffset) { debug_printf("FIFOQueue::Cursor Page exhausted, %s\n", self->toString().c_str()); - --self->queue->numPages; LogicalPageID oldPageID = self->pageID; self->pageID = p->nextPageID; self->offset = p->nextOffset; + --self->queue->numPages; self->page.clear(); debug_printf("FIFOQueue::Cursor Page exhausted, moved to new page, %s\n", self->toString().c_str()); @@ -563,6 +565,8 @@ public: // If a new tail page was allocated, link the last page of the tail writer to it. 
if(newTailPage.get() != invalidLogicalPageID) { tailWriter.newPage(newTailPage.get(), 0, false); + // The flush sequence allocated a page and added it to the queue so increment numPages + ++numPages; // newPage() should be ready immediately since a pageID is being explicitly passed. ASSERT(tailWriter.notBusy().isReady()); From fa654d9da7197dca11c4d85f1250d77ac1c063b9 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 16 Oct 2019 10:00:16 -0700 Subject: [PATCH 039/184] updated to not kill majority of coordinators --- .../workloads/MachineAttrition.actor.cpp | 48 ++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 685fe181f1..0993dc39b1 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -19,12 +19,13 @@ */ #include "fdbclient/NativeAPI.actor.h" +#include "fdbclient/CoordinationInterface.h" +#include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/TesterInterface.actor.h" #include "fdbserver/WorkerInterface.actor.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbrpc/simulator.h" #include "fdbclient/ManagementAPI.actor.h" -#include "ClusterRecruitmentInterface.h" #include "flow/actorcompiler.h" // This must be the last #include. 
static std::set const& normalAttritionErrors() { @@ -128,9 +129,9 @@ struct MachineAttritionWorkload : TestWorkload { } if (!clientId && !g_network->isSimulated()) { double meanDelay = testDuration / machinesToKill; - return timeout(reportErrorsExcept(noSimMachineKillWorker(this, meanDelay, cx), - "noSimMachineKillWorkerError", UID(), &normalAttritionErrors()), - testDuration, Void()); + return timeout( + reportErrorsExcept(noSimMachineKillWorker(this, meanDelay, cx), "noSimMachineKillWorkerError", UID(), &normalAttritionErrors()), + testDuration, Void()); } if(killSelf) throw please_reboot(); @@ -140,6 +141,17 @@ struct MachineAttritionWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } + static bool noSimIsViableKill(int coordFaultTolerance, int& killedCoord, std::vector coordAddrs, WorkerDetails worker) { + if (worker.processClass == ProcessClass::ClassType::TesterClass) return false; + bool isCoord = (std::find(coordAddrs.begin(), coordAddrs.end(), worker.interf.address()) != coordAddrs.end()); + if (isCoord && coordFaultTolerance > killedCoord) { + killedCoord++; + } else if (isCoord) { + return false; + } + return true; + } + ACTOR static Future noSimMachineKillWorker(MachineAttritionWorkload *self, double meanDelay, Database cx) { ASSERT(!g_network->isSimulated()); state int killedMachines = 0; @@ -154,18 +166,32 @@ struct MachineAttritionWorkload : TestWorkload { } else { rbReq.waitForDuration = std::numeric_limits::max(); } + // keep track of coordinator fault tolerance and make sure we don't go over + state ClientCoordinators coords(cx->getConnectionFile()); + state std::vector>> leaderServers; + state std::vector coordAddrs; + for (const auto& cls : coords.clientLeaderServers) { + leaderServers.push_back(retryBrokenPromise(cls.getLeader, GetLeaderRequest(coords.clusterKey, UID()), TaskPriority::CoordinationReply)); + coordAddrs.push_back(cls.getLeader.getEndpoint().getPrimaryAddress()); + } + wait(smartQuorum(leaderServers, 
leaderServers.size() / 2 + 1, 1.0)); + int coordUnavailable = 0; + for (const auto& leaderServer : leaderServers) { + if (!leaderServer.isReady()) { + coordUnavailable++; + } + } + state int coordFaultTolerance = (leaderServers.size() - 1) / 2 - coordUnavailable; + state int killedCoord = 0; if (self->killDc) { wait(delay(delayBeforeKill)); // Pick a dcId to kill - while (workers.back().processClass == ProcessClass::ClassType::TesterClass) { - deterministicRandom()->randomShuffle(workers); - } Optional> killDcId = workers.back().interf.locality.dcId(); TraceEvent("Assassination").detail("TargetDataCenter", killDcId); for (const auto& worker : workers) { - // kill all matching dcId workers, except testers + // kill all matching dcId workers, except testers. Also preserve a majority of coordinators if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId && - worker.processClass != ProcessClass::ClassType::TesterClass) { + noSimIsViableKill(coordFaultTolerance, killedCoord, coordAddrs, worker)) { worker.interf.clientInterface.reboot.send(rbReq); } } @@ -191,9 +217,9 @@ struct MachineAttritionWorkload : TestWorkload { } } } - // Pick a machine to kill, ignoring testers + // Pick a machine to kill, ignoring testers and preserving majority of coordinators state WorkerDetails targetMachine; - while (workers.back().processClass == ProcessClass::ClassType::TesterClass) { + while (!noSimIsViableKill(coordFaultTolerance, killedCoord, coordAddrs, workers.back())) { deterministicRandom()->randomShuffle(workers); } targetMachine = workers.back(); From 1eb3a70b9606802080547c806b7223ffd3b6e324 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 29 Jul 2019 23:40:28 -0700 Subject: [PATCH 040/184] Spill SharedTLog when there's more than one. When switching between spill_type or log_version, a new instance of a SharedTLog is created in the transaction log processes. 
If this is done in a saturated database, then doubling the amount of memory to hold mutations in memory can cause TLogs to be uncomfortably close to the 8GB OOM limit. Instead, we now thread which UID of a SharedTLog is active, and the other TLog spill out the majority of their mutations. This is a backport of #2213 (fef89aa1) to release-6.2 --- fdbserver/OldTLogServer_6_0.actor.cpp | 26 +++++++++-- fdbserver/TLogServer.actor.cpp | 63 ++++++++++++++++++++------- fdbserver/WorkerInterface.actor.h | 7 ++- fdbserver/worker.actor.cpp | 38 +++++++++++----- 4 files changed, 102 insertions(+), 32 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index c0ccd8eda9..2f5c2d2e35 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -262,6 +262,7 @@ struct TLogData : NonCopyable { int64_t instanceID; int64_t bytesInput; int64_t bytesDurable; + int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling. 
int64_t overheadBytesInput; int64_t overheadBytesDurable; @@ -288,7 +289,7 @@ struct TLogData : NonCopyable { : dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()), persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)), dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0), - diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0), + diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), targetVolatileBytes(SERVER_KNOBS->TLOG_SPILL_THRESHOLD), overheadBytesInput(0), overheadBytesDurable(0), concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() { @@ -697,7 +698,7 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; if(logData->stopped) { - if (self->bytesInput - self->bytesDurable >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD) { + if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { totalSize = 0; Map>::iterator sizeItr = logData->version_sizes.begin(); @@ -742,7 +743,7 @@ ACTOR Future updateStorage( TLogData* self ) { } else { Map>::iterator sizeItr = logData->version_sizes.begin(); while( totalSize < SERVER_KNOBS->UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end() - && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) ) + && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= self->targetVolatileBytes || sizeItr->value.first == 0) ) { totalSize += sizeItr->value.first + sizeItr->value.second; ++sizeItr; @@ -2312,8 +2313,18 @@ ACTOR Future tLogStart( TLogData* 
self, InitializeTLogRequest req, Localit return Void(); } +ACTOR Future startSpillingInTenSeconds(TLogData* self, UID tlogId, Reference> activeSharedTLog) { + wait(delay(10)); + if (activeSharedTLog->get() != tlogId) { + // TODO: This should fully spill, but currently doing so will cause us to no longer update poppedVersion + // and QuietDatabase will hang thinking our TLog is behind. + self->targetVolatileBytes = SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT * 2; + } + return Void(); +} + // New tLog (if !recoverFrom.size()) or restore from network -ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, std::string folder, Reference> degraded) { +ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, std::string folder, Reference> degraded, Reference> activeSharedTLog) { state TLogData self( tlogId, persistentData, persistentQueue, db, degraded, folder ); state Future error = actorCollection( self.sharedActors.getFuture() ); @@ -2346,6 +2357,13 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } + when ( wait( activeSharedTLog->onChange() ) ) { + if (activeSharedTLog->get() == tlogId) { + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + } } } } catch (Error& e) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index cfc52b0281..ed4adf6586 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -312,6 +312,7 @@ struct TLogData : NonCopyable { int64_t instanceID; int64_t bytesInput; int64_t 
bytesDurable; + int64_t targetVolatileBytes; // The number of bytes of mutations this TLog should hold in memory before spilling. int64_t overheadBytesInput; int64_t overheadBytesDurable; @@ -339,7 +340,7 @@ struct TLogData : NonCopyable { : dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()), persistentData(persistentData), rawPersistentQueue(persistentQueue), persistentQueue(new TLogQueue(persistentQueue, dbgid)), dbInfo(dbInfo), degraded(degraded), queueCommitBegin(0), queueCommitEnd(0), - diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), overheadBytesInput(0), overheadBytesDurable(0), + diskQueueCommitBytes(0), largeDiskQueueCommitBytes(false), bytesInput(0), bytesDurable(0), targetVolatileBytes(SERVER_KNOBS->TLOG_SPILL_THRESHOLD), overheadBytesInput(0), overheadBytesDurable(0), peekMemoryLimiter(SERVER_KNOBS->TLOG_SPILL_REFERENCE_MAX_PEEK_MEMORY_BYTES), concurrentLogRouterReads(SERVER_KNOBS->CONCURRENT_LOG_ROUTER_READS), ignorePopRequest(false), ignorePopDeadline(), ignorePopUid(), dataFolder(folder), toBePopped() @@ -952,7 +953,7 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; if(logData->stopped) { - if (self->bytesInput - self->bytesDurable >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD) { + if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { totalSize = 0; Map>::iterator sizeItr = logData->version_sizes.begin(); @@ -1000,10 +1001,12 @@ ACTOR Future updateStorage( TLogData* self ) { if(logData->version_sizes.empty()) { nextVersion = logData->version.get(); } else { + // Double check that a running TLog wasn't wrongly affected by spilling locked SharedTLogs. 
+ ASSERT_WE_THINK(self->targetVolatileBytes == SERVER_KNOBS->TLOG_SPILL_THRESHOLD); Map>::iterator sizeItr = logData->version_sizes.begin(); while( totalSize < SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT && sizeItr != logData->version_sizes.end() - && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= SERVER_KNOBS->TLOG_SPILL_THRESHOLD || sizeItr->value.first == 0) ) + && (logData->bytesInput.getValue() - logData->bytesDurable.getValue() - totalSize >= self->targetVolatileBytes || sizeItr->value.first == 0) ) { totalSize += sizeItr->value.first + sizeItr->value.second; ++sizeItr; @@ -2593,20 +2596,10 @@ ACTOR Future updateLogSystem(TLogData* self, Reference logData, L } } -ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, LocalityData locality ) { - state TLogInterface recruited(self->dbgid, locality); - recruited.initEndpoints(); - - DUMPTOKEN( recruited.peekMessages ); - DUMPTOKEN( recruited.popMessages ); - DUMPTOKEN( recruited.commit ); - DUMPTOKEN( recruited.lock ); - DUMPTOKEN( recruited.getQueuingMetrics ); - DUMPTOKEN( recruited.confirmRunning ); - +void stopAllTLogs( TLogData* self, UID newLogId ) { for(auto it : self->id_data) { if( !it.second->stopped ) { - TraceEvent("TLogStoppedByNewRecruitment", self->dbgid).detail("LogId", it.second->logId).detail("StoppedId", it.first.toString()).detail("RecruitedId", recruited.id()).detail("EndEpoch", it.second->logSystem->get().getPtr() != 0); + TraceEvent("TLogStoppedByNewRecruitment", self->dbgid).detail("LogId", it.second->logId).detail("StoppedId", it.first.toString()).detail("RecruitedId", newLogId).detail("EndEpoch", it.second->logSystem->get().getPtr() != 0); if(!it.second->isPrimary && it.second->logSystem->get()) { it.second->removed = it.second->removed && it.second->logSystem->get()->endEpoch(); } @@ -2620,6 +2613,21 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit } it.second->stopCommit.trigger(); } +} + +// Start 
the tLog role for a worker +ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, LocalityData locality ) { + state TLogInterface recruited(self->dbgid, locality); + recruited.initEndpoints(); + + DUMPTOKEN( recruited.peekMessages ); + DUMPTOKEN( recruited.popMessages ); + DUMPTOKEN( recruited.commit ); + DUMPTOKEN( recruited.lock ); + DUMPTOKEN( recruited.getQueuingMetrics ); + DUMPTOKEN( recruited.confirmRunning ); + + stopAllTLogs(self, recruited.id()); state Reference logData = Reference( new LogData(self, recruited, req.remoteTag, req.isPrimary, req.logRouterTags, req.txsTags, req.recruitmentID, currentProtocolVersion, req.allTags) ); self->id_data[recruited.id()] = logData; @@ -2736,8 +2744,21 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit return Void(); } +ACTOR Future startSpillingInTenSeconds(TLogData* self, UID tlogId, Reference> activeSharedTLog) { + wait(delay(10)); + if (activeSharedTLog->get() != tlogId) { + // TODO: This should fully spill, but currently doing so will cause us to no longer update poppedVersion + // and QuietDatabase will hang thinking our TLog is behind. 
+ TraceEvent("SharedTLogBeginSpilling", self->dbgid).detail("NowActive", activeSharedTLog->get()); + self->targetVolatileBytes = SERVER_KNOBS->REFERENCE_SPILL_UPDATE_STORAGE_BYTE_LIMIT * 2; + } else { + TraceEvent("SharedTLogSkipSpilling", self->dbgid).detail("NowActive", activeSharedTLog->get()); + } + return Void(); +} + // New tLog (if !recoverFrom.size()) or restore from network -ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, std::string folder, Reference> degraded ) { +ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, Promise oldLog, Promise recovered, std::string folder, Reference> degraded, Reference> activeSharedTLog ) { state TLogData self( tlogId, persistentData, persistentQueue, db, degraded, folder ); state Future error = actorCollection( self.sharedActors.getFuture() ); @@ -2770,6 +2791,16 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } + when ( wait( activeSharedTLog->onChange() ) ) { + if (activeSharedTLog->get() == tlogId) { + TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + stopAllTLogs(&self, tlogId); + TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + } } } } catch (Error& e) { diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c0d447d35f..8e4e009188 100644 --- a/fdbserver/WorkerInterface.actor.h +++ b/fdbserver/WorkerInterface.actor.h @@ -445,7 +445,9 @@ ACTOR Future 
masterProxyServer(MasterProxyInterface proxy, InitializeMaste ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, - Promise oldLog, Promise recovered, std::string folder, Reference> degraded); // changes tli->id() to be the recovered ID + Promise oldLog, Promise recovered, std::string folder, + Reference> degraded, Reference> activeSharedTLog); + ACTOR Future monitorServerDBInfo(Reference>> ccInterface, Reference ccf, LocalityData locality, Reference> dbInfo); @@ -467,7 +469,8 @@ namespace oldTLog_6_0 { ACTOR Future tLog(IKeyValueStore* persistentData, IDiskQueue* persistentQueue, Reference> db, LocalityData locality, PromiseStream tlogRequests, UID tlogId, bool restoreFromDisk, - Promise oldLog, Promise recovered, std::string folder, Reference> degraded); + Promise oldLog, Promise recovered, std::string folder, + Reference> degraded, Reference> activeSharedTLog); } typedef decltype(&tLog) TLogFn; diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 22f2b221ef..7b7b45b0e6 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -754,6 +754,17 @@ ACTOR Future monitorServerDBInfo( Reference actor = Void(); + UID uid = UID(); + PromiseStream requests; + + SharedLogsValue() = default; + SharedLogsValue( Future actor, UID uid, PromiseStream requests ) + : actor(actor), uid(uid), requests(requests) { + } +}; + ACTOR Future workerServer( Reference connFile, Reference>> ccInterface, @@ -782,7 +793,9 @@ ACTOR Future workerServer( // decide if we should collapse them into the same SharedTLog instance as well. The answer // here is no, so that when running with log_version==3, all files should say V=3. 
state std::map, - std::pair, PromiseStream>> sharedLogs; + SharedLogsValue> sharedLogs; + state Reference> activeSharedTLog(new AsyncVar()); + state std::string coordFolder = abspath(_coordFolder); state WorkerInterface interf( locality ); @@ -899,13 +912,15 @@ ACTOR Future workerServer( auto& logData = sharedLogs[std::make_tuple(s.tLogOptions.version, s.storeType, s.tLogOptions.spillType)]; // FIXME: Shouldn't if logData.first isValid && !isReady, shouldn't we // be sending a fake InitializeTLogRequest rather than calling tLog() ? - Future tl = tLogFn( kv, queue, dbInfo, locality, !logData.first.isValid() || logData.first.isReady() ? logData.second : PromiseStream(), s.storeID, true, oldLog, recovery, folder, degraded ); + Future tl = tLogFn( kv, queue, dbInfo, locality, !logData.actor.isValid() || logData.actor.isReady() ? logData.requests : PromiseStream(), s.storeID, true, oldLog, recovery, folder, degraded, activeSharedTLog ); recoveries.push_back(recovery.getFuture()); + activeSharedTLog->set(s.storeID); tl = handleIOErrors( tl, kv, s.storeID ); tl = handleIOErrors( tl, queue, s.storeID ); - if(!logData.first.isValid() || logData.first.isReady()) { - logData.first = oldLog.getFuture() || tl; + if(!logData.actor.isValid() || logData.actor.isReady()) { + logData.actor = oldLog.getFuture() || tl; + logData.uid = s.storeID; } errorForwarders.add( forwardError( errors, Role::SHARED_TRANSACTION_LOG, s.storeID, tl ) ); } @@ -1045,8 +1060,8 @@ ACTOR Future workerServer( TLogOptions tLogOptions(req.logVersion, req.spillType); TLogFn tLogFn = tLogFnForOptions(tLogOptions); auto& logData = sharedLogs[std::make_tuple(req.logVersion, req.storeType, req.spillType)]; - logData.second.send(req); - if(!logData.first.isValid() || logData.first.isReady()) { + logData.requests.send(req); + if(!logData.actor.isValid() || logData.actor.isReady()) { UID logId = deterministicRandom()->randomUniqueID(); std::map details; details["ForMaster"] = req.recruitmentID.shortString(); @@ 
-1063,11 +1078,14 @@ ACTOR Future workerServer( filesClosed.add( data->onClosed() ); filesClosed.add( queue->onClosed() ); - logData.first = tLogFn( data, queue, dbInfo, locality, logData.second, logId, false, Promise(), Promise(), folder, degraded ); - logData.first = handleIOErrors( logData.first, data, logId ); - logData.first = handleIOErrors( logData.first, queue, logId ); - errorForwarders.add( forwardError( errors, Role::SHARED_TRANSACTION_LOG, logId, logData.first ) ); + Future tLogCore = tLogFn( data, queue, dbInfo, locality, logData.requests, logId, false, Promise(), Promise(), folder, degraded, activeSharedTLog ); + tLogCore = handleIOErrors( tLogCore, data, logId ); + tLogCore = handleIOErrors( tLogCore, queue, logId ); + errorForwarders.add( forwardError( errors, Role::SHARED_TRANSACTION_LOG, logId, tLogCore ) ); + logData.actor = tLogCore; + logData.uid = logId; } + activeSharedTLog->set(logData.uid); } when( InitializeStorageRequest req = waitNext(interf.storage.getFuture()) ) { if( !storageCache.exists( req.reqId ) ) { From 0e9d08280567ec558ad8c12377b8f499f8aa54c8 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 17 Oct 2019 21:34:17 -0700 Subject: [PATCH 041/184] Bug fixes in FIFOQueue concurrent nested reads and writes caused by the pager/freelist circular dependencies. --- fdbserver/VersionedBTree.actor.cpp | 168 +++++++++++++++-------------- 1 file changed, 88 insertions(+), 80 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index eb6428dc68..d4c4ed5c61 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -206,10 +206,7 @@ public: Future operation; Mode mode; - uint32_t debug_id; - Cursor() : mode(NONE) { - debug_id = deterministicRandom()->randomUInt32(); } // Initialize a cursor. Since cursors can have async operations pending they can't be copied cleanly. 
@@ -236,10 +233,10 @@ public: operation = Void(); } - debug_printf("FIFOQueue::Cursor initialized: %s\n", toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) initialized\n", toString().c_str()); if(mode == WRITE && initialPageID != invalidLogicalPageID) { - newPage(initialPageID); + addNewPage(initialPageID, 0, true); } } @@ -250,10 +247,14 @@ public: } std::string toString() const { - if(mode == NONE) { - return format("{cursor=%x queue=n/a}", debug_id); + if(mode == WRITE) { + return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); } - return format("{cursor=%x queue=%s mode=%d pos=%s:%d endOffset=%d endPage=%s}", debug_id, queue ? queue->name.c_str() : "null", mode, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1, ::toString(endPageID).c_str()); + if(mode == READ) { + return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? 
raw()->endOffset : -1, ::toString(endPageID).c_str()); + } + ASSERT(mode == NONE); + return format("{NullCursor=%p}", this); } #pragma pack(push, 1) @@ -291,17 +292,18 @@ public: Future loadPage() { ASSERT(mode == READ); - debug_printf("FIFOQueue::Cursor loading %s\n", toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) loadPage\n", toString().c_str()); return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); + debug_printf("FIFOQueue::Cursor(%s) loadPage done\n", toString().c_str()); return Void(); }); } void writePage() { ASSERT(mode == WRITE); - debug_printf("FIFOQueue(%s) writing page %s\n", queue->name.c_str(), toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) writePage\n", toString().c_str()); VALGRIND_MAKE_MEM_DEFINED(raw()->begin(), offset); VALGRIND_MAKE_MEM_DEFINED(raw()->begin() + offset, queue->dataBytesPerPage - raw()->endOffset); queue->pager->updatePage(pageID, page); @@ -310,81 +312,81 @@ public: } } - ACTOR static Future newPage_impl(Cursor *self, Future previous, LogicalPageID newPageID, int newOffset, bool initializeNewPage) { - ASSERT(self->mode == WRITE); - wait(previous); - debug_printf("FIFOQueue::Cursor Adding page %s init=%d %s\n", ::toString(newPageID).c_str(), initializeNewPage, self->toString().c_str()); - ASSERT(self->mode == WRITE); - if(newPageID == invalidLogicalPageID) { - debug_printf("FIFOQueue::Cursor Allocating new page %s\n", self->toString().c_str()); - wait(store(newPageID, self->queue->pager->newPageID())); - // numPages is only increased if the page is allocated here. 
- // Callers who pass in a page are responsible for updating numPages when necessary (it isn't always necessary) - ++self->queue->numPages; - } - debug_printf("FIFOQueue::Cursor Adding page %s init=%d %s\n", ::toString(newPageID).c_str(), initializeNewPage, self->toString().c_str()); - - // Update existing page and write, if it exists - if(self->page) { - self->setNext(newPageID, newOffset); - debug_printf("FIFOQueue::Cursor Linked new page, writing %s\n", self->toString().c_str()); - self->writePage(); - } - - self->pageID = newPageID; - self->offset = newOffset; - - if(initializeNewPage) { - self->page = self->queue->pager->newPageBuffer(); - self->setNext(0, 0); - auto p = self->raw(); - p->formatVersion = RawPage::FORMAT_VERSION; - p->endOffset = 0; - } - - debug_printf("FIFOQueue::Cursor Added page %s\n", self->toString().c_str()); - return Void(); - } - // Link the current page to newPageID:newOffset and then write it to the pager. // If initializeNewPage is true a page buffer will be allocated for the new page and it will be initialized // as a new tail page. 
- void newPage(LogicalPageID newPageID = invalidLogicalPageID, int newOffset = 0, bool initializeNewPage = true) { - operation = newPage_impl(this, operation, newPageID, newOffset, initializeNewPage); + void addNewPage(LogicalPageID newPageID, int newOffset, bool initializeNewPage) { + ASSERT(mode == WRITE); + ASSERT(newPageID != invalidLogicalPageID); + debug_printf("FIFOQueue::Cursor(%s) Adding page %s init=%d\n", toString().c_str(), ::toString(newPageID).c_str(), initializeNewPage); + + // Update existing page and write, if it exists + if(page) { + setNext(newPageID, newOffset); + debug_printf("FIFOQueue::Cursor(%s) Linked new page\n", toString().c_str()); + writePage(); + } + + pageID = newPageID; + offset = newOffset; + + if(initializeNewPage) { + debug_printf("FIFOQueue::Cursor(%s) Initializing new page\n", toString().c_str()); + page = queue->pager->newPageBuffer(); + setNext(0, 0); + auto p = raw(); + p->formatVersion = RawPage::FORMAT_VERSION; + ASSERT(newOffset == 0); + p->endOffset = 0; + } + else { + page.clear(); + } } // Write item to the next position in the current page or, if it won't fit, add a new page and write it there. 
- ACTOR static Future write_impl(Cursor *self, Future previous, T item) { + ACTOR static Future write_impl(Cursor *self, T item, Future start) { ASSERT(self->mode == WRITE); + + // Wait for the previous operation to finish + state Future previous = self->operation; + wait(start); wait(previous); + state int bytesNeeded = Codec::bytesNeeded(item); if(self->offset + bytesNeeded > self->queue->dataBytesPerPage) { - debug_printf("FIFOQueue::Cursor write(%s) page is full, adding new page %s\n", ::toString(item).c_str(), self->toString().c_str()); - wait(newPage_impl(self, Void(), invalidLogicalPageID, 0, true)); + debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); + LogicalPageID newPageID = wait(self->queue->pager->newPageID()); + self->addNewPage(newPageID, 0, true); wait(yield()); } - debug_printf("FIFOQueue::Cursor write(%s) %s\n", ::toString(item).c_str(), self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); auto p = self->raw(); Codec::writeToBytes(p->begin() + self->offset, item); self->offset += bytesNeeded; p->endOffset = self->offset; ++self->queue->numEntries; - debug_printf("FIFOQueue::Cursor write(%s) finished, %s\n", ::toString(item).c_str(), self->toString().c_str()); return Void(); } void write(const T &item) { - operation = write_impl(this, operation, item); + Promise p; + operation = write_impl(this, item, p.getFuture()); + p.send(Void()); } // Read the next item at the cursor, moving to a new page first if the current page is exhausted - ACTOR static Future> readNext_impl(Cursor *self, Future previous, Optional upperBound) { + ACTOR static Future> readNext_impl(Cursor *self, Optional upperBound, Future start) { ASSERT(self->mode == READ); + + // Wait for the previous operation to finish + state Future previous = self->operation; + wait(start); wait(previous); - debug_printf("FIFOQueue::Cursor 
readNext begin %s\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) readNext begin\n", self->toString().c_str()); if(self->pageID == invalidLogicalPageID || self->pageID == self->endPageID) { - debug_printf("FIFOQueue::Cursor readNext returning nothing %s\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) readNext returning nothing\n", self->toString().c_str()); return Optional(); } @@ -394,31 +396,31 @@ public: wait(yield()); } - debug_printf("FIFOQueue::Cursor readNext reading at current position %s\n", self->toString().c_str()); auto p = self->raw(); + debug_printf("FIFOQueue::Cursor(%s) readNext reading at current position\n", self->toString().c_str()); ASSERT(self->offset < p->endOffset); int bytesRead; T result = Codec::readFromBytes(p->begin() + self->offset, bytesRead); if(upperBound.present() && upperBound.get() < result) { - debug_printf("FIFOQueue(%s) not popping %s, exceeds upper bound %s %s\n", - self->queue->name.c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str(), self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) not popping %s, exceeds upper bound %s\n", + self->toString().c_str(), ::toString(result).c_str(), ::toString(upperBound.get()).c_str()); return Optional(); } self->offset += bytesRead; --self->queue->numEntries; - debug_printf("FIFOQueue::Cursor popped %s, %s\n", ::toString(result).c_str(), self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) popped %s\n", self->toString().c_str(), ::toString(result).c_str()); ASSERT(self->offset <= p->endOffset); if(self->offset == p->endOffset) { - debug_printf("FIFOQueue::Cursor Page exhausted, %s\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) Page exhausted\n", self->toString().c_str()); LogicalPageID oldPageID = self->pageID; self->pageID = p->nextPageID; self->offset = p->nextOffset; --self->queue->numPages; self->page.clear(); - debug_printf("FIFOQueue::Cursor Page exhausted, moved to new page, 
%s\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) Page exhausted, moved to new page\n", self->toString().c_str()); // Freeing the old page must happen after advancing the cursor and clearing the page reference because // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this @@ -433,8 +435,10 @@ public: if(mode == NONE) { return Optional(); } - Future> read = readNext_impl(this, operation, upperBound); + Promise p; + Future> read = readNext_impl(this, upperBound, p.getFuture()); operation = success(read); + p.send(Void()); return read; } }; @@ -564,7 +568,7 @@ public: // If a new tail page was allocated, link the last page of the tail writer to it. if(newTailPage.get() != invalidLogicalPageID) { - tailWriter.newPage(newTailPage.get(), 0, false); + tailWriter.addNewPage(newTailPage.get(), 0, false); // The flush sequence allocated a page and added it to the queue so increment numPages ++numPages; @@ -577,7 +581,7 @@ public: // If the headWriter wrote anything, link its tail page to the headReader position and point the headReader // to the start of the headWriter if(headWriter.pendingWrites()) { - headWriter.newPage(headReader.pageID, headReader.offset, false); + headWriter.addNewPage(headReader.pageID, headReader.offset, false); headReader.pageID = headWriter.firstPageIDWritten; headReader.offset = 0; } @@ -724,6 +728,7 @@ public: Entry &toEvict = evictionOrder.front(); // Don't evict the entry that was just added as then we can't return a reference to it. 
if(toEvict.index != index && toEvict.item.evictable()) { + debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); evictionOrder.pop_front(); cache.erase(toEvict.index); } @@ -973,7 +978,7 @@ public: // First try the free list Optional freePageID = wait(self->freeList.pop()); if(freePageID.present()) { - debug_printf("COWPager(%s) newPageID() returned %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); + debug_printf("COWPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); return freePageID.get(); } @@ -986,7 +991,7 @@ public: // Lastly, grow the pager file by a page and return it. LogicalPageID id = self->pHeader->pageCount; ++self->pHeader->pageCount; - debug_printf("COWPager(%s) new page, %s at end of file\n", self->filename.c_str(), toString(id).c_str()); + debug_printf("COWPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str()); return id; }; @@ -1009,14 +1014,14 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content in the cache entry when the write is launched, not when it is completed. // Any waiting readers should not see this write (though this might change) if(cacheEntry.reading()) { // Wait for the read to finish, then start the write. 
- cacheEntry.writeFuture = map(success(cacheEntry.page), [=](Void) { + cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { writePhysicalPage(pageID, data); return Void(); }); @@ -1037,7 +1042,7 @@ public: operations.add(forwardError(cacheEntry.writeFuture, errorPromise)); // Always update the page contents immediately regardless of what happened above. - cacheEntry.page = data; + cacheEntry.readFuture = data; } Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { @@ -1078,6 +1083,7 @@ public: ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { state Reference page = self->newPageBuffer(); + debug_printf("COWPager(%s) op=read_physical_start %s\n", self->filename.c_str(), toString(pageID).c_str()); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); debug_printf("COWPager(%s) op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); ASSERT(readBytes == self->physicalPageSize); @@ -1103,22 +1109,24 @@ public: // Use cached page if present, without triggering a cache hit. 
// Otherwise, read the page and return it but don't add it to the cache if(!cacheable) { + debug_printf("COWPager(%s) op=read_nocache %s\n", filename.c_str(), toString(pageID).c_str()); PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); if(pCacheEntry != nullptr) { - return pCacheEntry->page; + return pCacheEntry->readFuture; } return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.page.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("COWPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); - if(!cacheEntry.page.isValid()) { - cacheEntry.page = readPhysicalPage(this, (PhysicalPageID)pageID); + if(!cacheEntry.readFuture.isValid()) { + debug_printf("COWPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); + cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); } - return forwardError(cacheEntry.page, errorPromise); + return forwardError(cacheEntry.readFuture, errorPromise); } // Get snapshot as of the most recent committed version of the pager @@ -1304,11 +1312,11 @@ private: #pragma pack(pop) struct PageCacheEntry { - Future> page; + Future> readFuture; Future writeFuture; bool reading() const { - return page.isValid() && !page.isReady(); + return readFuture.isValid() && !readFuture.isReady(); } bool writing() const { @@ -1317,11 +1325,11 @@ private: bool evictable() const { // Don't evict if a page is still being read or written - return page.isReady() && !writing(); + return !reading() && !writing(); } void destroy() { - page.cancel(); + readFuture.cancel(); writeFuture.cancel(); } }; From 44175e0921949a7dc880331fd30415d8982e1954 Mon Sep 17 00:00:00 
2001 From: Stephen Atherton Date: Fri, 18 Oct 2019 01:27:00 -0700 Subject: [PATCH 042/184] COWPager will no longer expire read Snapshots that are still in use. --- fdbserver/IPager.h | 11 ++-- fdbserver/IVersionedStore.h | 3 +- fdbserver/VersionedBTree.actor.cpp | 95 ++++++++++++++++++++---------- 3 files changed, 73 insertions(+), 36 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 508c90cf9b..d6e60fd2fe 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -213,13 +213,14 @@ public: // After the returned future is ready, future calls must not wait. virtual Future getLatestVersion() = 0; - // The pager can invalidate snapshots at versions < v and reuse - // any pages that were freed as of version v - virtual void setOldestVersion(Version v) = 0; - - // Get the oldest readable version + // Returns the oldest readable version as of the most recent committed version virtual Future getOldestVersion() = 0; + // The pager can reuse pages that were freed at a version less than v. + // If any snapshots are in use at a version less than v, the pager can invalidate them + // or keep their versions around until the snapshots are no longer in use. 
+ virtual void setOldestVersion(Version v) = 0; + protected: ~IPager2() {} // Destruction should be done using close()/dispose() from the IClosable interface }; diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index d991073b2d..482a1521a9 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -58,7 +58,8 @@ public: virtual void clear(KeyRangeRef range) = 0; virtual void mutate(int op, StringRef param1, StringRef param2) = 0; virtual void setWriteVersion(Version) = 0; // The write version must be nondecreasing - virtual void forgetVersions(Version begin, Version end) = 0; // Versions [begin, end) no longer readable + virtual void setOldestVersion(Version v) = 0; // Set oldest readable version to be used in next commit + virtual Version getOldestVersion() = 0; // Get oldest readable version virtual Future commit() = 0; virtual Future getLatestVersion() = 0; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index d4c4ed5c61..bce0462add 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -779,6 +779,8 @@ ACTOR template Future forwardError(Future f, Promise target } } +class COWPagerSnapshot; + class COWPager : public IPager2 { public: typedef FastAllocatedPage Page; @@ -940,8 +942,8 @@ public: self->pHeader->pageCount = 2; // Create a new free list - self->freeList.create(self, self->newPageID().get(), "FreeList"); - self->delayedFreeList.create(self, self->newPageID().get(), "delayedFreeList"); + self->freeList.create(self, self->newLastPageID(), "FreeList"); + self->delayedFreeList.create(self, self->newLastPageID(), "delayedFreeList"); // The first commit() below will flush the queues and update the queue states in the header, // but since the queues will not be used between now and then their states will not change. 
@@ -982,19 +984,28 @@ return freePageID.get(); } - Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->pLastCommittedHeader->oldestVersion, 0})); + // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list + ASSERT(!self->snapshots.empty()); + Version oldestVersion = std::min(self->pLastCommittedHeader->oldestVersion, self->snapshots.front().version); + Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{oldestVersion, 0})); if(delayedFreePageID.present()) { debug_printf("COWPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); return delayedFreePageID.get().pageID; } - // Lastly, grow the pager file by a page and return it. - LogicalPageID id = self->pHeader->pageCount; - ++self->pHeader->pageCount; + // Lastly, add a new page to the pager + LogicalPageID id = self->newLastPageID(); debug_printf("COWPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str()); return id; }; + // Grow the pager file by one page and return it + LogicalPageID newLastPageID() { + LogicalPageID id = pHeader->pageCount; + ++pHeader->pageCount; + return id; + } + Future newPageID() override { return forwardError(newPageID_impl(this), errorPromise); } @@ -1131,7 +1142,7 @@ // Get snapshot as of the most recent committed version of the pager Reference getReadSnapshot(Version v) override; - void addLatestSnapshot() override; + void addLatestSnapshot(); void setOldestVersion(Version v) override { ASSERT(v >= pHeader->oldestVersion); @@ -1156,6 +1167,10 @@ loop { state bool freeBusy = wait(self->freeList.preFlush()); state bool delayedFreeBusy = wait(self->delayedFreeList.preFlush()); + + // Once preFlush() returns false for both queues then there are no more operations pending + // on either queue.
If preFlush() returns true for either queue in one loop execution then + // it could have generated new work for itself or the other queue. if(!freeBusy && !delayedFreeBusy) { break; } @@ -1184,6 +1199,9 @@ public: self->updateCommittedHeader(); self->addLatestSnapshot(); + // Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use, + // because maybe some are no longer in use. + self->expireSnapshots(self->pHeader->oldestVersion); return Void(); } @@ -1268,15 +1286,8 @@ public: private: ~COWPager() {} - // Expire snapshots up to but not including v - void expireSnapshots(Version v) { - debug_printf("COWPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); - while(snapshots.size() > 1 && snapshots.front().version < v) { - debug_printf("COWPager(%s) expiring snapshot for %" PRId64 "\n", filename.c_str(), snapshots.front().version); - snapshots.front().expired.sendError(transaction_too_old()); - snapshots.pop_front(); - } - } + // Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use. 
+ void expireSnapshots(Version v); #pragma pack(push, 1) // Header is the format of page 0 of the database @@ -1373,7 +1384,7 @@ struct SnapshotEntry { Version version; Promise expired; - Reference snapshot; + Reference snapshot; }; struct SnapshotEntryLessThanVersion { @@ -1390,7 +1401,7 @@ }; // Prevents pager from reusing freed pages from version until the snapshot is destroyed -class COWPagerSnapshot : public IPagerSnapshot, ReferenceCounted { +class COWPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { public: COWPagerSnapshot(COWPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { } @@ -1428,6 +1439,18 @@ Key metaKey; }; +void COWPager::expireSnapshots(Version v) { + debug_printf("COWPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); + while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { + debug_printf("COWPager(%s) expiring snapshot for %" PRId64 "\n", filename.c_str(), snapshots.front().version); + // The snapshot contract could be made such that the expired promise isn't needed anymore. In practice it + probably is already not needed but it will gracefully handle the case where a user begins a page read + with a snapshot reference, keeps the page read future, and drops the snapshot reference.
+ snapshots.front().expired.sendError(transaction_too_old()); + snapshots.pop_front(); + } +} + Reference COWPager::getReadSnapshot(Version v) { ASSERT(!snapshots.empty()); @@ -1444,7 +1467,7 @@ void COWPager::addLatestSnapshot() { snapshots.push_back({ pLastCommittedHeader->committedVersion, expired, - Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) + Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) }); } @@ -2479,8 +2502,13 @@ public: virtual void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED - // Versions [begin, end) no longer readable - virtual void forgetVersions(Version begin, Version end) NOT_IMPLEMENTED + virtual void setOldestVersion(Version v) { + m_newOldestVersion = v; + } + + virtual Version getOldestVersion() { + return m_pager->getOldestVersion().get(); + } virtual Future getLatestVersion() { if(m_writeVersion != invalidVersion) @@ -2567,7 +2595,9 @@ public: ACTOR static Future init_impl(VersionedBTree *self) { state Version latest = wait(self->m_pager->getLatestVersion()); - debug_printf("Recovered pager to version %" PRId64 "\n", latest); + self->m_newOldestVersion = self->m_pager->getOldestVersion().get(); + + debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", self->m_newOldestVersion); state Key meta = self->m_pager->getMetaKey(); if(meta.size() == 0) { @@ -2612,12 +2642,11 @@ public: m_latestCommit.cancel(); } - // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed - // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. 
+ // readAtVersion() may only be called on a committed v which has previously been passed to setWriteVersion() and never previously passed + // to setOldestVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less // than or equal to the given version. - // If readAtVersion() is called on the *current* write version, the given read cursor MAY reflect subsequent writes at the same - // write version, OR it may represent a snapshot as of the call to readAtVersion(). + // v must be a committed version. virtual Reference readAtVersion(Version v) { // Only committed versions can be read. Version recordVersion = singleVersion ? 0 : v; @@ -2909,6 +2938,7 @@ private: Version m_writeVersion; Version m_lastCommittedVersion; + Version m_newOldestVersion; Future m_latestCommit; Future m_init; std::string m_name; @@ -3680,10 +3710,8 @@ private: // Wait for the latest commit that started to be finished. wait(previousCommit); - // Advance oldest version by a random number between 0 and the difference between the latest and oldest versions. 
- Version newOldestVersion = self->m_pager->getOldestVersion().get() + deterministicRandom()->randomInt(0, self->m_pager->getLatestVersion().get() - self->m_pager->getOldestVersion().get() + 1); - self->m_pager->setOldestVersion(newOldestVersion); - debug_printf("%s: Beginning commit of version %" PRId64 ", oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, newOldestVersion); + self->m_pager->setOldestVersion(self->m_newOldestVersion); + debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); state Future lazyDelete = incrementalLazyDelete(self, 100); @@ -5277,6 +5305,7 @@ TEST_CASE("!/redwood/correctness/btree") { state int mutationBytesTarget = shortTest ? 5000 : randomSize(std::min(maxCommitSize * 100, 100e6)); state double clearProbability = deterministicRandom()->random01() * .1; state double coldStartProbability = deterministicRandom()->random01(); + state double advanceOldVersionProbability = deterministicRandom()->random01(); state double maxWallClockDuration = 60; printf("\n"); @@ -5290,6 +5319,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("mutationBytesTarget: %d\n", mutationBytesTarget); printf("clearProbability: %f\n", clearProbability); printf("coldStartProbability: %f\n", coldStartProbability); + printf("advanceOldVersionProbability: %f\n", advanceOldVersionProbability); printf("\n"); printf("Deleting existing test data...\n"); @@ -5431,6 +5461,11 @@ TEST_CASE("!/redwood/correctness/btree") { Version v = version; // Avoid capture of version as a member of *this + // Sometimes advance the oldest version to close the gap between the oldest and latest versions by a random amount. 
+ if(deterministicRandom()->random01() < advanceOldVersionProbability) { + btree->setOldestVersion(btree->getLastCommittedVersion() - deterministicRandom()->randomInt(0, btree->getLastCommittedVersion() - btree->getOldestVersion() + 1)); + } + commit = map(btree->commit(), [=](Void) { printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); // Notify the background verifier that version is committed and therefore readable From a79757a788a46adb545524730e3cddbd00af60a3 Mon Sep 17 00:00:00 2001 From: mpilman Date: Mon, 21 Oct 2019 10:57:58 -0700 Subject: [PATCH 043/184] Fix compiler errors on Catalina Fixes #2263 --- cmake/ConfigureCompiler.cmake | 5 +++++ flow/Platform.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index df2235759e..a29edf28e5 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -221,9 +221,14 @@ else() # Check whether we can use dtrace probes include(CheckSymbolExists) check_symbol_exists(DTRACE_PROBE sys/sdt.h SUPPORT_DTRACE) + check_symbol_exists(aligned_alloc stdlib.h HAS_ALIGNED_ALLOC) + message(STATUS "Has aligned_alloc: ${HAS_ALIGNED_ALLOC}") if(SUPPORT_DTRACE) add_compile_definitions(DTRACE_PROBES) endif() + if(HAS_ALIGNED_ALLOC) + add_compile_definitions(HAS_ALIGNED_ALLOC) + endif() if(CMAKE_COMPILER_IS_GNUCXX) set(USE_LTO OFF CACHE BOOL "Do link time optimization") diff --git a/flow/Platform.h b/flow/Platform.h index fd511d4e6c..217dd0f645 100644 --- a/flow/Platform.h +++ b/flow/Platform.h @@ -524,6 +524,7 @@ inline static void aligned_free(void* ptr) { free(ptr); } inline static void* aligned_alloc(size_t alignment, size_t size) { return memalign(alignment, size); } #endif #elif defined(__APPLE__) +#if !defined(HAS_ALIGNED_ALLOC) #include inline static void* aligned_alloc(size_t alignment, size_t size) { // Linux's aligned_alloc() requires alignment to be a power of 2. 
While posix_memalign() @@ -540,6 +541,7 @@ inline static void* aligned_alloc(size_t alignment, size_t size) { posix_memalign(&ptr, alignment, size); return ptr; } +#endif inline static void aligned_free(void* ptr) { free(ptr); } #endif From 2d0722b0c7545c3d206a70156da4fe49672fb3ba Mon Sep 17 00:00:00 2001 From: mpilman Date: Mon, 21 Oct 2019 11:22:05 -0700 Subject: [PATCH 044/184] fixed cmake version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index edd172327e..8d648cf38a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ # limitations under the License. cmake_minimum_required(VERSION 3.12) project(foundationdb - VERSION 6.2.6 + VERSION 6.2.7 DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." HOMEPAGE_URL "http://www.foundationdb.org/" LANGUAGES C CXX ASM) From 12c517ab1658fd5aec292ebd8fb9f0532d370145 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 21 Oct 2019 16:01:45 -0700 Subject: [PATCH 045/184] limit the number of committed version updates in progress simultaneously to prevent running out of memory --- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/MasterProxyServer.actor.cpp | 10 +++++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index a8fd4ecbd4..c692d80ed9 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -317,6 +317,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( ENFORCED_MIN_RECOVERY_DURATION, 0.085 ); if( shortRecoveryDuration ) ENFORCED_MIN_RECOVERY_DURATION = 0.01; init( REQUIRED_MIN_RECOVERY_DURATION, 0.080 ); if( shortRecoveryDuration ) REQUIRED_MIN_RECOVERY_DURATION = 0.01; init( ALWAYS_CAUSAL_READ_RISKY, false ); + init( MAX_COMMIT_UPDATES, 100000 ); if( randomize && BUGGIFY ) MAX_COMMIT_UPDATES = 1; // Master Server // masterCommitter() in the master server will allow lower 
priority tasks (e.g. DataDistibution) diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index d9e85470a1..924e6a427f 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -262,6 +262,7 @@ public: double ENFORCED_MIN_RECOVERY_DURATION; double REQUIRED_MIN_RECOVERY_DURATION; bool ALWAYS_CAUSAL_READ_RISKY; + int MAX_COMMIT_UPDATES; // Master Server double COMMIT_SLEEP_TIME; diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index 1016d5ba40..a64eafa2bc 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -236,6 +236,7 @@ struct ProxyCommitData { Optional latencyBandConfig; double lastStartCommit; double lastCommitLatency; + int updateCommitRequests = 0; NotifiedDouble lastCommitTime; //The tag related to a storage server rarely change, so we keep a vector of tags for each key range to be slightly more CPU efficient. @@ -1045,7 +1046,9 @@ ACTOR Future commitBatch( ACTOR Future updateLastCommit(ProxyCommitData* self, Optional debugID = Optional()) { state double confirmStart = now(); self->lastStartCommit = confirmStart; + self->updateCommitRequests++; wait(self->logSystem->confirmEpochLive(debugID)); + self->updateCommitRequests--; self->lastCommitLatency = now()-confirmStart; self->lastCommitTime = std::max(self->lastCommitTime.get(), confirmStart); return Void(); @@ -1453,7 +1456,12 @@ ACTOR Future lastCommitUpdater(ProxyCommitData* self, PromiseStreamupdateCommitRequests < SERVER_KNOBS->MAX_COMMIT_UPDATES) { + addActor.send(updateLastCommit(self)); + } else { + TraceEvent(g_network->isSimulated() ? 
SevInfo : SevWarnAlways, "TooManyLastCommitUpdates").suppressFor(1.0); + self->lastStartCommit = now(); + } } } } From 2caad04d9c40a942c95343d5804303fd12fb5c89 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 22 Oct 2019 11:58:40 -0700 Subject: [PATCH 046/184] Keys in the destUIDLookupPrefix can be cleaned up automatically if they do not have an associated entry in the logRangesRange keyspace --- fdbserver/Status.actor.cpp | 55 +++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 5962556855..77632a28b4 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1151,26 +1151,61 @@ ACTOR static Future consistencyCheckStatusFetcher(Database cx, JsonBuilder return Void(); } +struct LogRangeAndUID { + KeyRange range; + UID destID; + + LogRangeAndUID(KeyRange const& range, UID const& destID) : range(range), destID(destID) {} + + bool operator < (LogRangeAndUID const& r) const { + if(range.begin != r.range.begin) return range.begin < r.range.begin; + if(range.end != r.range.end) return range.end < r.range.end; + return destID < r.destID; + } +}; + ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set *incomplete_reasons) { try { state Transaction tr(cx); + state Future timeoutFuture = delay(5.0); loop { try { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); - Standalone existingDestUidValues = wait(timeoutError(tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY), 5.0)); - std::set> existingRanges; - for(auto it : existingDestUidValues) { - KeyRange range = BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); - std::pair rangePair = std::make_pair(range.begin,range.end); - 
if(existingRanges.count(rangePair)) { - messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str())); - break; - } - existingRanges.insert(rangePair); + state Future> existingDestUidValues = tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY); + state Future> existingLogRanges = tr.getRange(logRangesRange, CLIENT_KNOBS->TOO_MANY); + wait( (success(existingDestUidValues) && success(existingLogRanges)) || timeoutFuture ); + if(timeoutFuture.isReady()) { + throw timed_out(); } + + std::set loggingRanges; + for(auto& it : existingLogRanges.get()) { + Key logDestination; + UID logUid; + KeyRef logRangeBegin = logRangesDecodeKey(it.key, &logUid); + Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination); + loggingRanges.insert(LogRangeAndUID(KeyRangeRef(logRangeBegin, logRangeEnd), logUid)); + } + + std::set> existingRanges; + for(auto& it : existingDestUidValues.get()) { + KeyRange range = BinaryReader::fromStringRef(it.key.removePrefix(destUidLookupPrefix), IncludeVersion()); + UID logUid = BinaryReader::fromStringRef(it.value, Unversioned()); + if(loggingRanges.count(LogRangeAndUID(range, logUid))) { + std::pair rangePair = std::make_pair(range.begin,range.end); + if(existingRanges.count(rangePair)) { + messages->push_back(JsonString::makeMessage("duplicate_mutation_streams", format("Backup and DR are not sharing the same stream of mutations for `%s` - `%s`", printable(range.begin).c_str(), printable(range.end).c_str()).c_str())); + break; + } + existingRanges.insert(rangePair); + } else { + tr.clear(it.key); + } + } + wait(tr.commit()); break; } catch(Error &e) { if(e.code() == error_code_timed_out) { From d5c2147c0c9c7919ab3279f453087ba41a03b6c5 Mon Sep 17 00:00:00 2001 From: Evan Tschannen <36455792+etschannen@users.noreply.github.com> Date: 
Tue, 22 Oct 2019 13:27:52 -0700 Subject: [PATCH 047/184] Update fdbserver/Status.actor.cpp Co-Authored-By: A.J. Beamon --- fdbserver/Status.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 77632a28b4..9c67a8d306 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1186,7 +1186,7 @@ ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray * Key logDestination; UID logUid; KeyRef logRangeBegin = logRangesDecodeKey(it.key, &logUid); - Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination); + Key logRangeEnd = logRangesDecodeValue(it.value, &logDestination); loggingRanges.insert(LogRangeAndUID(KeyRangeRef(logRangeBegin, logRangeEnd), logUid)); } From 3478652d06879a0c179adca3eaff7c031449f3fb Mon Sep 17 00:00:00 2001 From: Evan Tschannen <36455792+etschannen@users.noreply.github.com> Date: Tue, 22 Oct 2019 13:32:09 -0700 Subject: [PATCH 048/184] Apply suggestions from code review Co-Authored-By: A.J. 
Beamon --- fdbserver/Status.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index 9c67a8d306..d4733bbfa8 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1167,7 +1167,7 @@ struct LogRangeAndUID { ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set *incomplete_reasons) { try { state Transaction tr(cx); - state Future timeoutFuture = delay(5.0); + state Future timeoutFuture = timeoutError(Never(), 5.0); loop { try { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); @@ -1205,7 +1205,7 @@ ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray * tr.clear(it.key); } } - wait(tr.commit()); + wait(tr.commit() || timeoutFuture); break; } catch(Error &e) { if(e.code() == error_code_timed_out) { From e6f5748791d3a26ea7bef0019ed1935b79c34e72 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Tue, 22 Oct 2019 13:47:58 -0700 Subject: [PATCH 049/184] Use a large value for read sampling size threshold. Also at sampling site, don't round up small values to avoid sampling every key. --- fdbserver/Knobs.cpp | 2 +- fdbserver/storageserver.actor.cpp | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index d0f86f1ab8..8445ff8ba6 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -454,7 +454,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); - init( BYTES_READ_UNITS_PER_SAMPLE, 100); // Effectively weight up read on small or non-existing key/values. 
+ init( BYTES_READ_UNITS_PER_SAMPLE, 10000); //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 12d5c9e7a9..342f4d87ad 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -890,9 +890,10 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { } StorageMetrics metrics; - metrics.bytesReadPerKSecond = v.present() ? std::max((int64_t)(req.key.size() + v.get().size()), - SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE) - : SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + // If the read yields no value, randomly sample the empty read. + metrics.bytesReadPerKSecond = + v.present() ? (int64_t)(req.key.size() + v.get().size()) + : deterministicRandom()->random01() > 0.5 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; data->metrics.notify(req.key, metrics); if( req.debugID.present() ) @@ -1271,7 +1272,7 @@ ACTOR Future readRange( StorageServer* data, Version version, result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact? result.version = version; StorageMetrics metrics; - metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE); + metrics.bytesReadPerKSecond = readSize; data->metrics.notify(limit >= 0 ? range.begin : range.end, metrics); return result; } @@ -1327,14 +1328,15 @@ ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version vers *pOffset = 0; StorageMetrics metrics; - metrics.bytesReadPerKSecond = - std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE); + metrics.bytesReadPerKSecond = (int64_t)rep.data[index].key.size(); data->metrics.notify(sel.getKey(), metrics); return rep.data[ index ].key; } else { StorageMetrics metrics; - metrics.bytesReadPerKSecond = SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE; + // Randomly sample an empty read + metrics.bytesReadPerKSecond = + deterministicRandom()->random01() > 0.5 ? 
SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; data->metrics.notify(sel.getKey(), metrics); // FIXME: If range.begin=="" && !forward, return success? From af72d155663c374736b63f2a0869edb8ba9b8d89 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Tue, 22 Oct 2019 13:53:28 -0700 Subject: [PATCH 050/184] Update fdbserver/Knobs.cpp From AJ: to match typical aligned format used on other variables. Co-Authored-By: A.J. Beamon --- fdbserver/Knobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 8445ff8ba6..eb409c147d 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -454,7 +454,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); - init( BYTES_READ_UNITS_PER_SAMPLE, 10000); + init( BYTES_READ_UNITS_PER_SAMPLE, 10000 ); //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); From 2d74288d1605a40c3a3a16800885b28ab917afd7 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 22 Oct 2019 16:33:44 -0700 Subject: [PATCH 051/184] Added a comment to clarify why cleanup work is done in status --- fdbserver/Status.actor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index d4733bbfa8..d3b95e793f 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1171,15 +1171,12 @@ ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray * loop { try { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); - tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); - tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::LOCK_AWARE); 
+ tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); state Future> existingDestUidValues = tr.getRange(KeyRangeRef(destUidLookupPrefix, strinc(destUidLookupPrefix)), CLIENT_KNOBS->TOO_MANY); state Future> existingLogRanges = tr.getRange(logRangesRange, CLIENT_KNOBS->TOO_MANY); wait( (success(existingDestUidValues) && success(existingLogRanges)) || timeoutFuture ); - if(timeoutFuture.isReady()) { - throw timed_out(); - } std::set loggingRanges; for(auto& it : existingLogRanges.get()) { @@ -1202,6 +1199,9 @@ ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray * } existingRanges.insert(rangePair); } else { + //This cleanup is done during status, because it should only be required once after upgrading to 6.2.7 or later. + //There is no other good location to detect that the metadata is mismatched. + TraceEvent(SevWarnAlways, "CleaningDestUidLookup").detail("K", it.key.printable()).detail("V", it.value.printable()); tr.clear(it.key); } } From 35ac0071a806987462845a19d0918f0dbd9a28ac Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 22 Oct 2019 17:06:54 -0700 Subject: [PATCH 052/184] fixed a compiler error --- fdbserver/Status.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/Status.actor.cpp b/fdbserver/Status.actor.cpp index d3b95e793f..70e71e922e 100644 --- a/fdbserver/Status.actor.cpp +++ b/fdbserver/Status.actor.cpp @@ -1167,7 +1167,7 @@ struct LogRangeAndUID { ACTOR static Future logRangeWarningFetcher(Database cx, JsonBuilderArray *messages, std::set *incomplete_reasons) { try { state Transaction tr(cx); - state Future timeoutFuture = timeoutError(Never(), 5.0); + state Future timeoutFuture = timeoutError(Future(Never()), 5.0); loop { try { tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); From 6a57fab43145526858169424056307ee3be0d8de Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 22 Oct 2019 17:17:29 -0700 Subject: [PATCH 053/184] Bug fixes in lazy subtree deletion, 
queue pushFront(), queue flush(), and advancing the oldest pager version. CommitSubtree no longer forces page rewrites due to boundary changes. IPager2 and IVersionedStore now have explicit async init() functions to avoid returning futures from some frequently used functions. --- fdbserver/IPager.h | 15 ++- fdbserver/IVersionedStore.h | 3 +- fdbserver/VersionedBTree.actor.cpp | 150 +++++++++++++++++------------ 3 files changed, 103 insertions(+), 65 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index d6e60fd2fe..35549ac096 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -209,16 +209,21 @@ public: virtual StorageBytes getStorageBytes() = 0; + // Future returned is ready when pager has been initialized from disk and is ready for reads and writes. + // It is invalid to call most other functions until init() is ready. + // TODO: Document further. + virtual Future init() = 0; + // Returns latest committed version - // After the returned future is ready, future calls must not wait. - virtual Future getLatestVersion() = 0; + virtual Version getLatestVersion() = 0; // Returns the oldest readable version as of the most recent committed version - virtual Future getOldestVersion() = 0; + virtual Version getOldestVersion() = 0; + // Sets the oldest readable version to be put into affect at the next commit. // The pager can reuse pages that were freed at a version less than v. - // If any snapshots are in use at a version less than v, the pager can invalidate them - // or keep their versions around until the snapshots are no longer in use. + // If any snapshots are in use at a version less than v, the pager can either forcefully + // invalidate them or keep their versions around until the snapshots are no longer in use. 
virtual void setOldestVersion(Version v) = 0; protected: diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index 482a1521a9..de4cfd2084 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -62,7 +62,8 @@ public: virtual Version getOldestVersion() = 0; // Get oldest readable version virtual Future commit() = 0; - virtual Future getLatestVersion() = 0; + virtual Future init() = 0; + virtual Version getLatestVersion() = 0; // readAtVersion() may only be called on a version which has previously been passed to setWriteVersion() and never previously passed // to forgetVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index bce0462add..ab06953722 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -354,13 +354,13 @@ public: wait(previous); state int bytesNeeded = Codec::bytesNeeded(item); - if(self->offset + bytesNeeded > self->queue->dataBytesPerPage) { + if(self->pageID == invalidLogicalPageID || self->offset + bytesNeeded > self->queue->dataBytesPerPage) { debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); LogicalPageID newPageID = wait(self->queue->pager->newPageID()); self->addNewPage(newPageID, 0, true); wait(yield()); } - debug_printf("FIFOQueue::Cursor(%s) write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); + debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); auto p = self->raw(); Codec::writeToBytes(p->begin() + self->offset, item); self->offset += bytesNeeded; @@ -410,7 +410,7 @@ public: self->offset += bytesRead; --self->queue->numEntries; - debug_printf("FIFOQueue::Cursor(%s) popped %s\n", self->toString().c_str(), ::toString(result).c_str()); + 
debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), ::toString(result).c_str()); ASSERT(self->offset <= p->endOffset); if(self->offset == p->endOffset) { @@ -425,9 +425,11 @@ public: // Freeing the old page must happen after advancing the cursor and clearing the page reference because // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this // very same queue. + // Queue pages are freed at page 0 because they can be reused after the next commit. self->queue->pager->freePage(oldPageID, 0); } + debug_printf("FIFOQueue(%s) pop(upperBound=%s) -> %s\n", self->queue->name.c_str(), ::toString(upperBound).c_str(), ::toString(result).c_str()); return result; } @@ -584,6 +586,7 @@ public: headWriter.addNewPage(headReader.pageID, headReader.offset, false); headReader.pageID = headWriter.firstPageIDWritten; headReader.offset = 0; + headReader.page.clear(); } // Update headReader's end page to the new tail page @@ -986,8 +989,7 @@ public: // Try to reuse pages up to the earlier of the oldest version set by the user or the oldest snapshot still in the snapshots list ASSERT(!self->snapshots.empty()); - Version oldestVersion = std::min(self->pLastCommittedHeader->oldestVersion, self->snapshots.front().version); - Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{oldestVersion, 0})); + Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->effectiveOldestVersion(), 0})); if(delayedFreePageID.present()) { debug_printf("COWPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); return delayedFreePageID.get().pageID; @@ -1070,13 +1072,13 @@ public: void freePage(LogicalPageID pageID, Version v) override { // If v is older than the oldest version still readable then mark pageID as free as of the next commit - if(v < pLastCommittedHeader->oldestVersion) { - debug_printf("COWPager(%s) 
op=freeNow %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); + if(v < effectiveOldestVersion()) { + debug_printf("COWPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); freeList.pushBack(pageID); } else { // Otherwise add it to the delayed free list - debug_printf("COWPager(%s) op=freeLater %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); + debug_printf("COWPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); delayedFreeList.pushBack({v, pageID}); } }; @@ -1144,6 +1146,7 @@ public: Reference getReadSnapshot(Version v) override; void addLatestSnapshot(); + // Set the pending oldest versiont to keep as of the next commit void setOldestVersion(Version v) override { ASSERT(v >= pHeader->oldestVersion); ASSERT(v <= pHeader->committedVersion); @@ -1151,12 +1154,17 @@ public: expireSnapshots(v); }; - Future getOldestVersion() override { - return map(recoverFuture, [=](Void) { - return pLastCommittedHeader->oldestVersion; - }); + // Get the oldest version set as of the last commit. + Version getOldestVersion() override { + return pLastCommittedHeader->oldestVersion; }; + // Calculate the *effective* oldest version, which can be older than the one set in the last commit since we + // are allowing active snapshots to temporarily delay page reuse. 
+ Version effectiveOldestVersion() { + return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); + } + ACTOR static Future commit_impl(COWPager *self) { debug_printf("COWPager(%s) commit begin\n", self->filename.c_str()); @@ -1277,10 +1285,12 @@ public: return StorageBytes(free, total, pagerSize, free + reusable); } - Future getLatestVersion() override { - return map(recoverFuture, [=](Void) { - return pLastCommittedHeader->committedVersion; - }); + Future init() override { + return recoverFuture; + } + + Version getLatestVersion() override { + return pLastCommittedHeader->committedVersion; } private: @@ -1442,7 +1452,7 @@ public: void COWPager::expireSnapshots(Version v) { debug_printf("COWPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { - debug_printf("COWPager(%s) expiring snapshot for %" PRId64 "\n", filename.c_str(), snapshots.front().version); + debug_printf("COWPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); // The snapshot contract could be made such that the expired promise isn't need anymore. In practice it // probably is already not needed but it will gracefully handle the case where a user begins a page read // with a snapshot reference, keeps the page read future, and drops the snapshot reference. 
@@ -2507,10 +2517,10 @@ public: } virtual Version getOldestVersion() { - return m_pager->getOldestVersion().get(); + return m_pager->getOldestVersion(); } - virtual Future getLatestVersion() { + virtual Version getLatestVersion() { if(m_writeVersion != invalidVersion) return m_writeVersion; return m_pager->getLatestVersion(); @@ -2536,9 +2546,9 @@ public: m_latestCommit = m_init; } - ACTOR static Future incrementalLazyDelete(VersionedBTree *self, int minPages) { + ACTOR static Future incrementalLazyDelete(VersionedBTree *self, bool *stop, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { // TODO: Is it contractually okay to always to read at the latest version? - state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion().get()); + state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion()); state int freedPages = 0; loop { @@ -2546,7 +2556,7 @@ public: state Optional q = wait(self->m_lazyDeleteQueue.pop()); debug_printf("LazyDelete: popped %s\n", toString(q).c_str()); if(!q.present()) { - return Void(); + break; } // Read the page without caching @@ -2587,15 +2597,20 @@ public: self->freeBtreePage(q.get().pageID, v); freedPages += q.get().pageID.size(); - if(freedPages >= minPages) { - return Void(); + // If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return. 
+ if((freedPages >= minPages && *stop) || freedPages >= maxPages) { + break; } } + + return freedPages; } ACTOR static Future init_impl(VersionedBTree *self) { - state Version latest = wait(self->m_pager->getLatestVersion()); - self->m_newOldestVersion = self->m_pager->getOldestVersion().get(); + wait(self->m_pager->init()); + + state Version latest = self->m_pager->getLatestVersion(); + self->m_newOldestVersion = self->m_pager->getOldestVersion(); debug_printf("Recovered pager to version %" PRId64 ", oldest version is %" PRId64 "\n", self->m_newOldestVersion); @@ -2632,7 +2647,9 @@ public: return Void(); } - Future init() { return m_init; } + Future init() override { + return m_init; + } virtual ~VersionedBTree() { // This probably shouldn't be called directly (meaning deleting an instance directly) but it should be safe, @@ -3325,10 +3342,6 @@ private: debug_printf("%s decodeLower=%s decodeUpper=%s\n", context.c_str(), decodeLowerBound->toString().c_str(), decodeUpperBound->toString().c_str()); self->counts.commitToPageStart++; - // If a boundary changed, the page must be rewritten regardless of KV mutations - state bool boundaryChanged = (lowerBound != decodeLowerBound) || (upperBound != decodeUpperBound); - debug_printf("%s boundaryChanged=%d\n", context.c_str(), boundaryChanged); - // Find the slice of the mutation buffer that is relevant to this subtree // TODO: Rather than two lower_bound searches, perhaps just compare each mutation to the upperBound key while iterating state MutationBufferT::const_iterator iMutationBoundary = mutationBuffer->upper_bound(lowerBound->key); @@ -3354,27 +3367,43 @@ private: return results; } - // If there are no forced boundary changes then this subtree is unchanged. 
- if(!boundaryChanged) { - results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); - debug_printf("%s page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), lowerBound->key.toString().c_str(), toString(results).c_str()); - return results; - } + // Otherwise, no changes to this subtree + results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s page contains a single key '%s' which is not changing, returning %s\n", context.c_str(), lowerBound->key.toString().c_str(), toString(results).c_str()); + return results; } - // Another way to have no mutations is to have a single mutation range cover this - // subtree but have no changes in it MutationBufferT::const_iterator iMutationBoundaryNext = iMutationBoundary; ++iMutationBoundaryNext; - if(!boundaryChanged && iMutationBoundaryNext == iMutationBoundaryEnd && - ( iMutationBoundary->second.noChanges() || - ( !iMutationBoundary->second.rangeClearVersion.present() && - iMutationBoundary->first < lowerBound->key) - ) - ) { - results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); - debug_printf("%s no changes because sole mutation range was not cleared, returning %s\n", context.c_str(), toString(results).c_str()); - return results; + // If one mutation range covers the entire page + if(iMutationBoundaryNext == iMutationBoundaryEnd) { + // If there are no changes in the range (no clear, no boundary key mutations) + // OR there are changes but for a key that is less than the page lower boundary and therefore not part of this page + if(iMutationBoundary->second.noChanges() || + ( !iMutationBoundary->second.rangeClearVersion.present() && iMutationBoundary->first < lowerBound->key) + ) { + results.push_back_deep(results.arena(), 
VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); + debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), toString(results).c_str()); + return results; + } + + // If the range is cleared and there either no sets or the sets aren't relevant to this subtree then delete it + // The last if subexpression is checking that either the next key in the mutation buffer is being changed or + // the upper bound key of this page isn't the same. + if(iMutationBoundary->second.rangeClearVersion.present() + && (iMutationBoundary->second.startKeyMutations.empty() || iMutationBoundary->first < lowerBound->key) + && (!iMutationBoundaryEnd->second.startKeyMutations.empty() || upperBound->key != iMutationBoundaryEnd->first) + ) { + debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree", toString(results).c_str()); + Version clearVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : iMutationBoundary->second.rangeClearVersion.get(); + if(isLeaf) { + self->freeBtreePage(rootID, clearVersion); + } + else { + self->m_lazyDeleteQueue.pushBack(LazyDeleteQueueEntry{clearVersion, rootID}); + } + return results; + } } self->counts.commitToPage++; @@ -3530,8 +3559,7 @@ private: debug_printf("%s Done merging mutations into existing leaf contents, made %d changes\n", context.c_str(), changes); // No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records. - // But if a boundary was changed then we must rewrite the page anyway. 
- if(!boundaryChanged && minVersion == invalidVersion) { + if(minVersion == invalidVersion) { results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(results).c_str()); ASSERT(changes == 0); @@ -3713,10 +3741,11 @@ private: self->m_pager->setOldestVersion(self->m_newOldestVersion); debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); - state Future lazyDelete = incrementalLazyDelete(self, 100); + state bool lazyDeleteStop = false; + state Future lazyDelete = incrementalLazyDelete(self, &lazyDeleteStop); // Get the latest version from the pager, which is what we will read at - state Version latestVersion = wait(self->m_pager->getLatestVersion()); + state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); if(REDWOOD_DEBUG) { @@ -3755,7 +3784,9 @@ private: self->m_header.root.set(rootPageID, sizeof(headerSpace) - sizeof(m_header)); - wait(lazyDelete); + lazyDeleteStop = true; + wait(success(lazyDelete)); + debug_printf("Lazy delete freed %u pages\n", lazyDelete.get()); self->m_pager->setCommitVersion(writeVersion); @@ -4336,7 +4367,7 @@ public: ACTOR Future init_impl(KeyValueStoreRedwoodUnversioned *self) { TraceEvent(SevInfo, "RedwoodInit").detail("FilePrefix", self->m_filePrefix); wait(self->m_tree->init()); - Version v = wait(self->m_tree->getLatestVersion()); + Version v = self->m_tree->getLatestVersion(); self->m_tree->setWriteVersion(v + 1); TraceEvent(SevInfo, "RedwoodInitComplete").detail("FilePrefix", self->m_filePrefix); return Void(); @@ -4373,6 +4404,7 @@ public: Future commit(bool sequential = false) { Future c = m_tree->commit(); + 
m_tree->setOldestVersion(m_tree->getLatestVersion()); m_tree->setWriteVersion(m_tree->getWriteVersion() + 1); return catchError(c); } @@ -5334,7 +5366,7 @@ TEST_CASE("!/redwood/correctness/btree") { state std::map, Optional> written; state std::set keys; - state Version lastVer = wait(btree->getLatestVersion()); + state Version lastVer = btree->getLatestVersion(); printf("Starting from version: %" PRId64 "\n", lastVer); state Version version = lastVer + 1; @@ -5508,7 +5540,7 @@ TEST_CASE("!/redwood/correctness/btree") { btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); - Version v = wait(btree->getLatestVersion()); + Version v = btree->getLatestVersion(); ASSERT(v == version); printf("Recovered from disk. Latest version %" PRId64 "\n", v); @@ -5545,7 +5577,7 @@ TEST_CASE("!/redwood/correctness/btree") { } ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, char lastChar) { - state Version readVer = wait(btree->getLatestVersion()); + state Version readVer = btree->getLatestVersion(); state int c = 0; state double readStart = timer(); printf("Executing %d random seeks\n", count); @@ -5569,7 +5601,7 @@ TEST_CASE("!/redwood/correctness/pager/cow") { int pageSize = 4096; state IPager2 *pager = new COWPager(pageSize, pagerFile, 0); - wait(success(pager->getLatestVersion())); + wait(success(pager->init())); state LogicalPageID id = wait(pager->newPageID()); Reference p = pager->newPageBuffer(); memset(p->mutate(), (char)id, p->size()); @@ -5622,7 +5654,7 @@ TEST_CASE("!/redwood/performance/set") { while(kvBytesTotal < kvBytesTarget) { wait(yield()); - Version lastVer = wait(btree->getLatestVersion()); + Version lastVer = btree->getLatestVersion(); state Version version = lastVer + 1; btree->setWriteVersion(version); int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); From c008e7f8b3082cbf9527c308c70da686e53d1ca3 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Mon, 14 Oct 2019 18:03:12 -0700 
Subject: [PATCH 054/184] When switching parallel->single->parallel, reset sequence and peekId This fixes an issue where one could hang for 10min for the second parallel peek to time out, if one happened to catch the edge of a onlySpilled transition wrong. --- fdbserver/LogSystemPeekCursor.actor.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 99c0c221b7..bac736afec 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -152,6 +152,9 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self while(self->futureResults.size() < SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->interf->get().present()) { self->futureResults.push_back( brokenPromiseToNever( self->interf->get().interf().peekMessages.getReply(TLogPeekRequest(self->messageVersion.version,self->tag,self->returnIfBlocked, self->onlySpilled, std::make_pair(self->randomID, self->sequence++)), taskID) ) ); } + } else if (self->futureResults.size() == 1) { + self->randomID = deterministicRandom()->randomUniqueID(); + self->sequence = 0; } else if (self->futureResults.size() == 0) { return Void(); } From 1e5b8c74e3c07e5e6a006452487e609708c930ae Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Tue, 22 Oct 2019 17:04:57 -0700 Subject: [PATCH 055/184] Continuing a parallel peek after a timeout would hang. This is to guard against the case where 1. Peeks with sequence numbers 0-39 are submitted 2. A 15min pause happens, in which timeout removes the peek tracker data 3. Peeks with sequence numbers 40-59 are submitted, with the same peekId The second round of peeks wouldn't have the data left that it's allowed to start running peek 40 immediately, and thus would hang for 10min until it gets cleaned up. Also, guard against overflowing the sequence number. 
--- fdbserver/LogSystemPeekCursor.actor.cpp | 3 +++ fdbserver/OldTLogServer_4_6.actor.cpp | 3 +++ fdbserver/OldTLogServer_6_0.actor.cpp | 3 +++ fdbserver/TLogServer.actor.cpp | 3 +++ 4 files changed, 12 insertions(+) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index bac736afec..98ba5a4bb0 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -152,6 +152,9 @@ ACTOR Future serverPeekParallelGetMore( ILogSystem::ServerPeekCursor* self while(self->futureResults.size() < SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->interf->get().present()) { self->futureResults.push_back( brokenPromiseToNever( self->interf->get().interf().peekMessages.getReply(TLogPeekRequest(self->messageVersion.version,self->tag,self->returnIfBlocked, self->onlySpilled, std::make_pair(self->randomID, self->sequence++)), taskID) ) ); } + if (self->sequence == std::numeric_limits<decltype(self->sequence)>::max()) { + throw timed_out(); + } } else if (self->futureResults.size() == 1) { self->randomID = deterministicRandom()->randomUniqueID(); self->sequence = 0; diff --git a/fdbserver/OldTLogServer_4_6.actor.cpp b/fdbserver/OldTLogServer_4_6.actor.cpp index c07f820f3e..0e02cd57b6 100644 --- a/fdbserver/OldTLogServer_4_6.actor.cpp +++ b/fdbserver/OldTLogServer_4_6.actor.cpp @@ -875,6 +875,9 @@ namespace oldTLog_4_6 { try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == self->peekTracker.end()) { + throw timed_out(); + } if(sequence > 0) { auto& trackerData = self->peekTracker[peekId]; trackerData.lastUpdate = now(); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index c0ccd8eda9..1314ce52ca 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1036,6 +1036,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, 
TLogPeekRequest req, Refere try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index cfc52b0281..4f1c02962d 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -1337,6 +1337,9 @@ ACTOR Future tLogPeekMessages( TLogData* self, TLogPeekRequest req, Refere try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && logData->peekTracker.find(peekId) == logData->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = logData->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); From 96d463bab6ed11eaf6c87d966d8611866556ba58 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 22 Oct 2019 23:24:20 -0700 Subject: [PATCH 056/184] FastRestore:Fix bug in applying mutations and increase atomicOp test workload When Applier applies mutations to the destination cluster, it advances the mutation cursor twice when it should only advance it once. This makes restore miss some mutations when the applying txn includes more than 1 mutation. 
--- fdbserver/RestoreApplier.actor.cpp | 38 +++++++++---------- fdbserver/RestoreWorker.actor.cpp | 2 +- ...llelRestoreCorrectnessAtomicOpTinyData.txt | 5 ++- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index c305ab72c1..c0b81615f6 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -115,11 +115,12 @@ ACTOR static Future handleSendMutationVectorRequest(RestoreSendMutationVec state int mIndex = 0; for (mIndex = 0; mIndex < mutations.size(); mIndex++) { MutationRef mutation = mutations[mIndex]; - // TraceEvent(SevDebug, "FastRestore") - // .detail("ApplierNode", self->id()) - // .detail("FileUID", req.fileUID) - // .detail("Version", commitVersion) - // .detail("MutationReceived", mutation.toString()); + TraceEvent(SevDebug, "FastRestore") + .detail("ApplierNode", self->id()) + .detail("FileUID", req.fileIndex) + .detail("Version", commitVersion) + .detail("Index", mIndex) + .detail("MutationReceived", mutation.toString()); self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation); } curFilePos.set(req.version); @@ -218,9 +219,7 @@ struct DBApplyProgress { } bool shouldCommit() { - // TODO: Change transactionSize > 0 to transactionSize > opConfig.transactionBatchSizeThreshold to batch - // mutations in a txn - return (!lastTxnHasError && (startNextVersion || transactionSize > 0 || curItInCurTxn == self->kvOps.end())); + return (!lastTxnHasError && (startNextVersion || transactionSize >= opConfig.transactionBatchSizeThreshold || curItInCurTxn == self->kvOps.end())); } bool hasError() { return lastTxnHasError; } @@ -299,7 +298,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { TraceEvent("FastRestore_ApplierTxn") .detail("ApplierApplyToDB", self->id()) .detail("TxnId", progress.curTxnId) - .detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn) + .detail("CurrentIndexInCurrentTxn", 
progress.curIndexInCurTxn) .detail("CurrentIteratorMutations", progress.curItInCurTxn->second.size()) .detail("Version", progress.curItInCurTxn->first); @@ -315,7 +314,13 @@ ACTOR Future applyToDB(Reference self, Database cx) { TraceEvent(SevError, "FastRestore").detail("InvalidMutationType", m.type); } - // TraceEvent(SevDebug, "FastRestore_Debug").detail("ApplierApplyToDB", self->describeNode()).detail("Version", progress.curItInCurTxn->first).detail("Mutation", m.toString()); + TraceEvent(SevDebug, "FastRestore_Debug") + .detail("ApplierApplyToDB", self->describeNode()) + .detail("Version", progress.curItInCurTxn->first) + .detail("Index", progress.curIndexInCurTxn) + .detail("Mutation", m.toString()) + .detail("MutationSize", m.expectedSize()) + .detail("TxnSize", progress.transactionSize); if (m.type == MutationRef::SetValue) { tr->set(m.param1, m.param2); } else if (m.type == MutationRef::ClearRange) { @@ -332,14 +337,10 @@ ACTOR Future applyToDB(Reference self, Database cx) { progress.transactionSize += m.expectedSize(); - if (progress.transactionSize >= opConfig.transactionBatchSizeThreshold) { // commit per 512B + progress.nextMutation(); // Prepare for the next mutation + // commit per transactionBatchSizeThreshold bytes; and commit does not cross version boundary + if (progress.transactionSize >= opConfig.transactionBatchSizeThreshold || progress.startNextVersion || progress.isDone()) { break; // Got enough mutation in the txn - } else { - progress.nextMutation(); - // Mutations in the same transaction come from the same version - if (progress.startNextVersion || progress.isDone()) { - break; - } } } } // !lastTxnHasError @@ -348,8 +349,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { if (progress.shouldCommit()) { wait(tr->commit()); } - // Logic for a successful transaction: Update current txn info and uncommitted txn info - progress.nextMutation(); + if (progress.isDone()) { // Are all mutations processed? 
break; } diff --git a/fdbserver/RestoreWorker.actor.cpp b/fdbserver/RestoreWorker.actor.cpp index c53bbd6be1..a1253a3757 100644 --- a/fdbserver/RestoreWorker.actor.cpp +++ b/fdbserver/RestoreWorker.actor.cpp @@ -183,7 +183,7 @@ void initRestoreWorkerConfig() { opConfig.num_loaders = g_network->isSimulated() ? 3 : opConfig.num_loaders; opConfig.num_appliers = g_network->isSimulated() ? 3 : opConfig.num_appliers; opConfig.transactionBatchSizeThreshold = - g_network->isSimulated() ? 1 : opConfig.transactionBatchSizeThreshold; // Byte + g_network->isSimulated() ? 512 : opConfig.transactionBatchSizeThreshold; // Byte TraceEvent("FastRestore") .detail("InitOpConfig", "Result") .detail("NumLoaders", opConfig.num_loaders) diff --git a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt index dad1ef5c47..a15eca91fa 100644 --- a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt +++ b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt @@ -8,6 +8,9 @@ testTitle=BackupAndParallelRestoreWithAtomicOp ; transactionsPerSecond=250.0 testDuration=30.0 clearAfterTest=false +; Specify a type of atomicOp +; opType=0 +; actorsPerClient=1 ; AtomicBackupCorrectness.txt does not mix Cycle and AtomicOps workloads ; testName=Cycle @@ -59,4 +62,4 @@ testTitle=BackupAndParallelRestoreWithAtomicOp buggify=off ;testDuration=360000 ;not work ;timeout is in seconds -timeout=360000 \ No newline at end of file +timeout=360000 From 0c325c5351bfeab1e8392676fd0ffd77552ee971 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 23 Oct 2019 01:59:36 -0700 Subject: [PATCH 057/184] Always check which SharedTLog is active In case it is set before we get to the onChange() --- fdbserver/OldTLogServer_6_0.actor.cpp | 14 +++++++------- fdbserver/TLogServer.actor.cpp | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp 
index 2f5c2d2e35..63d1bbb770 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -2347,6 +2347,12 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( updateStorageLoop(&self) ); loop { + if (activeSharedTLog->get() == tlogId) { + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + choose { when ( InitializeTLogRequest req = waitNext(tlogRequests.getFuture() ) ) { if( !self.tlogCache.exists( req.recruitmentID ) ) { @@ -2357,13 +2363,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) { - if (activeSharedTLog->get() == tlogId) { - self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; - } else { - self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); - } - } + when ( wait( activeSharedTLog->onChange() ) ) {} } } } catch (Error& e) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index ed4adf6586..bffa41c54a 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2781,6 +2781,15 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( updateStorageLoop(&self) ); loop { + if (activeSharedTLog->get() == tlogId) { + TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + stopAllTLogs(&self, tlogId); + TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + choose { when ( InitializeTLogRequest req = waitNext(tlogRequests.getFuture() ) ) { if( 
!self.tlogCache.exists( req.recruitmentID ) ) { @@ -2791,16 +2800,7 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) { - if (activeSharedTLog->get() == tlogId) { - TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); - self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; - } else { - stopAllTLogs(&self, tlogId); - TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); - self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); - } - } + when ( wait( activeSharedTLog->onChange() ) ) {} } } } catch (Error& e) { From 613bbaecc451a5ac330d6adca5b1030e1ee28d98 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Wed, 23 Oct 2019 09:31:06 -0700 Subject: [PATCH 058/184] Bug fix in queue page footprint tracking. Added VersionedBTree::destroyAndCheckSanity() which clears the tree, processes the entire lazy delete queue, and then verifies some pager usage statistics. This check is currently disabled because it appears to find a bug where the final state has a few more pages in use than expected. StorageBytes now includes the delayed free list pages as free space since they will be reusable soon. 
--- fdbserver/VersionedBTree.actor.cpp | 57 +++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index ab06953722..9f5db9a5f7 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -49,6 +49,10 @@ std::string toString(const T &o) { return o.toString(); } +std::string toString(StringRef s) { + return s.printable(); +} + std::string toString(LogicalPageID id) { if(id == invalidLogicalPageID) { return "LogicalPageID{invalid}"; @@ -358,6 +362,7 @@ public: debug_printf("FIFOQueue::Cursor(%s) write(%s) page is full, adding new page\n", self->toString().c_str(), ::toString(item).c_str()); LogicalPageID newPageID = wait(self->queue->pager->newPageID()); self->addNewPage(newPageID, 0, true); + ++self->queue->numPages; wait(yield()); } debug_printf("FIFOQueue::Cursor(%s) before write(%s)\n", self->toString().c_str(), ::toString(item).c_str()); @@ -1171,7 +1176,7 @@ public: // Write old committed header to Page 1 self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); - // Flush the free list delayed free list queues together as they are used by freePage() and newPageID() + // Flush the free list and delayed free list queues together as they are used by freePage() and newPageID() loop { state bool freeBusy = wait(self->freeList.preFlush()); state bool delayedFreeBusy = wait(self->delayedFreeList.preFlush()); @@ -1281,10 +1286,22 @@ public: int64_t total; g_network->getDiskBytes(parentDirectory(filename), free, total); int64_t pagerSize = pHeader->pageCount * physicalPageSize; - int64_t reusable = freeList.numEntries * physicalPageSize; + + // It is not exactly known how many pages on the delayed free list are usable as of right now. It could be, + // if each commit delayed entries that were freeable were shuffled from the delayed free queue to the free queue. 
+ // but this doesn't seem necessary most of the time. + int64_t reusable = (freeList.numEntries + delayedFreeList.numEntries) * physicalPageSize; + return StorageBytes(free, total, pagerSize, free + reusable); } + // Get the number of pages in use but not by the pager itself. + int64_t getUserPageCount() { + int userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries; + debug_printf("COWPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 "\n", filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries); + return userPages; + } + Future init() override { return recoverFuture; } @@ -2546,7 +2563,7 @@ public: m_latestCommit = m_init; } - ACTOR static Future incrementalLazyDelete(VersionedBTree *self, bool *stop, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { + ACTOR static Future incrementalLazyDelete(VersionedBTree *self, bool *pStop = nullptr, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { // TODO: Is it contractually okay to always to read at the latest version? state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion()); state int freedPages = 0; @@ -2598,7 +2615,7 @@ public: freedPages += q.get().pageID.size(); // If stop is set and we've freed the minimum number of pages required, or the maximum is exceeded, return. 
- if((freedPages >= minPages && *stop) || freedPages >= maxPages) { + if((freedPages >= minPages && pStop != nullptr && *pStop) || freedPages >= maxPages) { break; } } @@ -2703,6 +2720,38 @@ public: return commit_impl(this); } + ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree *self) { + ASSERT(g_network->isSimulated()); + + self->setWriteVersion(self->getLatestVersion() + 1); + self->clear(KeyRangeRef(dbBegin.key, dbEnd.key)); + + loop { + int freedPages = wait(self->incrementalLazyDelete(self)); + debug_printf("incrementalLazyDelete freed %d\n", freedPages); + wait(self->commit()); + if(self->m_lazyDeleteQueue.numEntries == 0) { + break; + } + self->setWriteVersion(self->getLatestVersion() + 1); + } + + LazyDeleteQueueT::QueueState s = self->m_lazyDeleteQueue.getState(); + ASSERT(s.numEntries == 0); + ASSERT(s.numPages == 1); + + debug_printf("rootPageCount %d\n", self->m_header.root.count); + ASSERT(self->m_header.height == 1); + // All that should be in use now is the root page and the lazy delete queue empty page. + ASSERT(((COWPager *)self->m_pager)->getUserPageCount() == self->m_header.root.count + 1); + + return Void(); + } + + Future destroyAndCheckSanity() { + return destroyAndCheckSanity_impl(this); + } + bool isSingleVersion() const { return singleVersion; } From a1bed51d34ee9a8fcf6c9f7a86aa08568855bd07 Mon Sep 17 00:00:00 2001 From: "A.J. 
Beamon" Date: Wed, 23 Oct 2019 10:29:58 -0700 Subject: [PATCH 059/184] Ignore batch priority GRVs for latency band tracking --- fdbserver/MasterProxyServer.actor.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index a64eafa2bc..cbb882fa37 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -1131,7 +1131,9 @@ ACTOR Future sendGrvReplies(Future replyFuture, std:: GetReadVersionReply reply = wait(replyFuture); double end = timer(); for(GetReadVersionRequest const& request : requests) { - stats->grvLatencyBands.addMeasurement(end - request.requestTime()); + if(request.priority() >= GetReadVersionRequest::PRIORITY_DEFAULT) { + stats->grvLatencyBands.addMeasurement(end - request.requestTime()); + } request.reply.send(reply); } From 84bd55caa362183a221b4ff7096d430470ee377f Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Wed, 23 Oct 2019 10:41:09 -0700 Subject: [PATCH 060/184] Add release note --- documentation/sphinx/source/release-notes.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 7d88da488e..6494248b1b 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,7 +2,7 @@ Release Notes ############# -6.2.6 +6.2.7 ===== Performance @@ -58,6 +58,7 @@ Fixes * Committing transactions larger than 1 MB could cause the proxy to stall for up to a second. [6.2.6] `(PR #2250) `_. * The cluster controller could become saturated in clusters with large numbers of connected clients using TLS. [6.2.6] `(PR #2252) `_. * Backup and DR would not share a mutation stream if they were started on different versions of FoundationDB. Either backup or DR must be restarted to resolve this issue. [6.2.6] `(PR #2202) `_. 
+* Don't track batch priority GRV requests in latency bands. [6.2.7] `(PR #2279) `_. Status ------ From 2722c8b188ad5fbe467d580f1c63d3b46419a1c5 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 11:15:54 -0700 Subject: [PATCH 061/184] avoid starting a new startSpillingActor with every TLog recruitment --- fdbserver/OldTLogServer_6_0.actor.cpp | 16 +++++++++------- fdbserver/TLogServer.actor.cpp | 22 ++++++++++++---------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index f05b312da8..12a5bd6d94 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -2348,14 +2348,9 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( commitQueue(&self) ); self.sharedActors.send( updateStorageLoop(&self) ); + state Future activeSharedChange = Void(); loop { - if (activeSharedTLog->get() == tlogId) { - self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; - } else { - self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); - } - choose { when ( InitializeTLogRequest req = waitNext(tlogRequests.getFuture() ) ) { if( !self.tlogCache.exists( req.recruitmentID ) ) { @@ -2366,7 +2361,14 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) {} + when ( wait( activeSharedChange ) ) { + if (activeSharedTLog->get() == tlogId) { + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + activeSharedChange = activeSharedTLog->onChange(); + } } } } catch (Error& e) { diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 3937e64bdc..b5578bedd7 100644 --- a/fdbserver/TLogServer.actor.cpp +++ 
b/fdbserver/TLogServer.actor.cpp @@ -2782,17 +2782,9 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ self.sharedActors.send( commitQueue(&self) ); self.sharedActors.send( updateStorageLoop(&self) ); + state Future activeSharedChange = Void(); loop { - if (activeSharedTLog->get() == tlogId) { - TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); - self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; - } else { - stopAllTLogs(&self, tlogId); - TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); - self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); - } - choose { when ( InitializeTLogRequest req = waitNext(tlogRequests.getFuture() ) ) { if( !self.tlogCache.exists( req.recruitmentID ) ) { @@ -2803,7 +2795,17 @@ ACTOR Future tLog( IKeyValueStore* persistentData, IDiskQueue* persistentQ } } when ( wait( error ) ) { throw internal_error(); } - when ( wait( activeSharedTLog->onChange() ) ) {} + when ( wait( activeSharedChange ) ) { + if (activeSharedTLog->get() == tlogId) { + TraceEvent("SharedTLogNowActive", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.targetVolatileBytes = SERVER_KNOBS->TLOG_SPILL_THRESHOLD; + } else { + stopAllTLogs(&self, tlogId); + TraceEvent("SharedTLogQueueSpilling", self.dbgid).detail("NowActive", activeSharedTLog->get()); + self.sharedActors.send( startSpillingInTenSeconds(&self, tlogId, activeSharedTLog) ); + } + activeSharedChange = activeSharedTLog->onChange(); + } } } } catch (Error& e) { From 2f6b661b51ebf8322dbb851edd4c5740c7276afc Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 11:17:53 -0700 Subject: [PATCH 062/184] updated documentation for 6.2.7 --- documentation/sphinx/source/downloads.rst | 24 +++++++++---------- documentation/sphinx/source/release-notes.rst | 4 +++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git 
a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 82aefde475..64d13865f0 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.2.6.pkg `_ +* `FoundationDB-6.2.7.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.2.6-1_amd64.deb `_ -* `foundationdb-server-6.2.6-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.2.7-1_amd64.deb `_ +* `foundationdb-server-6.2.7-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.2.6-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.2.6-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.7-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.2.7-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.2.6-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.2.6-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.7-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.2.7-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.2.6-x64.msi `_ +* `foundationdb-6.2.7-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.2.6.tar.gz `_ +* `foundationdb-6.2.7.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.2.6.gem `_ +* `fdb-6.2.7.gem `_ Java 8+ ------- -* `fdb-java-6.2.6.jar `_ -* `fdb-java-6.2.6-javadoc.jar `_ +* `fdb-java-6.2.7.jar `_ +* `fdb-java-6.2.7-javadoc.jar `_ Go 1.11+ -------- diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 6494248b1b..f964f8bcf4 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -39,7 +39,6 @@ Fixes * File descriptors opened by clients and servers set close-on-exec, if available on the platform. `(PR #1581) `_. * ``fdbrestore`` commands other than ``start`` required a default cluster file to be found but did not actually use it. `(PR #1912) `_. * Unneeded network connections were not being closed because peer reference counts were handled improperly. `(PR #1768) `_. -* Under certain conditions, cross region replication could stall for 10 minute periods. `(PR #1818) `_. * In very rare scenarios, master recovery would restart because system metadata was loaded incorrectly. `(PR #1919) `_. * Ratekeeper will aggressively throttle when unable to fetch the list of storage servers for a considerable period of time. `(PR #1858) `_. * Proxies could become overloaded when all storage servers on a team fail. [6.2.1] `(PR #1976) `_. @@ -59,6 +58,8 @@ Fixes * The cluster controller could become saturated in clusters with large numbers of connected clients using TLS. [6.2.6] `(PR #2252) `_. 
* Backup and DR would not share a mutation stream if they were started on different versions of FoundationDB. Either backup or DR must be restarted to resolve this issue. [6.2.6] `(PR #2202) `_. * Don't track batch priority GRV requests in latency bands. [6.2.7] `(PR #2279) `_. +* Transaction log processes used twice their normal memory when switching spill types. [6.2.7] `(PR #2256) `_. +* Under certain conditions, cross region replication could stall for 10 minute periods. [6.2.7] `(PR #1818) `_ `(PR #2276) `_. Status ------ @@ -135,6 +136,7 @@ Fixes only impacting 6.2.0+ * A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) `_. * Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) `_. * The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) `_. +* Status could incorrectly report that backup and DR were not sharing a mutation stream. [6.2.7] `(PR #2274) `_. 
Earlier release notes --------------------- From 47dc0ee25c001b2400e240ff82fd74df0eed149b Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 23 Oct 2019 10:37:38 -0700 Subject: [PATCH 063/184] removed coordinator check and added pre-processing of workers rather than checking each cycle --- .../workloads/MachineAttrition.actor.cpp | 45 ++++++------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 0993dc39b1..38b54609d3 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -141,14 +141,8 @@ struct MachineAttritionWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } - static bool noSimIsViableKill(int coordFaultTolerance, int& killedCoord, std::vector coordAddrs, WorkerDetails worker) { + static bool noSimIsViableKill(WorkerDetails worker) { if (worker.processClass == ProcessClass::ClassType::TesterClass) return false; - bool isCoord = (std::find(coordAddrs.begin(), coordAddrs.end(), worker.interf.address()) != coordAddrs.end()); - if (isCoord && coordFaultTolerance > killedCoord) { - killedCoord++; - } else if (isCoord) { - return false; - } return true; } @@ -156,9 +150,8 @@ struct MachineAttritionWorkload : TestWorkload { ASSERT(!g_network->isSimulated()); state int killedMachines = 0; state double delayBeforeKill = deterministicRandom()->random01() * meanDelay; - state std::vector workers = + state std::vector allWorkers = wait(self->dbInfo->get().clusterInterface.getWorkers.getReply(GetWorkersRequest())); - deterministicRandom()->randomShuffle(workers); // Can reuse reboot request to send to each interface since no reply promise needed state RebootRequest rbReq; if (self->reboot) { @@ -166,32 +159,22 @@ struct MachineAttritionWorkload : TestWorkload { } else { rbReq.waitForDuration = std::numeric_limits::max(); } - // keep track of coordinator fault tolerance and make 
sure we don't go over - state ClientCoordinators coords(cx->getConnectionFile()); - state std::vector>> leaderServers; - state std::vector coordAddrs; - for (const auto& cls : coords.clientLeaderServers) { - leaderServers.push_back(retryBrokenPromise(cls.getLeader, GetLeaderRequest(coords.clusterKey, UID()), TaskPriority::CoordinationReply)); - coordAddrs.push_back(cls.getLeader.getEndpoint().getPrimaryAddress()); - } - wait(smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.0)); - int coordUnavailable = 0; - for (const auto& leaderServer : leaderServers) { - if (!leaderServer.isReady()) { - coordUnavailable++; + state std::vector workers; + // Pre-processing step: remove all testers from list of workers + for (const auto& worker : allWorkers) { + if (noSimIsViableKill(worker)) { + workers.push_back(worker); } } - state int coordFaultTolerance = (leaderServers.size() - 1) / 2 - coordUnavailable; - state int killedCoord = 0; if (self->killDc) { wait(delay(delayBeforeKill)); // Pick a dcId to kill + deterministicRandom()->randomShuffle(workers); Optional> killDcId = workers.back().interf.locality.dcId(); TraceEvent("Assassination").detail("TargetDataCenter", killDcId); for (const auto& worker : workers) { - // kill all matching dcId workers, except testers. 
Also preserve a majority of coordinators - if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId && - noSimIsViableKill(coordFaultTolerance, killedCoord, coordAddrs, worker)) { + // kill all matching dcId workers + if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId) { worker.interf.clientInterface.reboot.send(rbReq); } } @@ -217,11 +200,9 @@ struct MachineAttritionWorkload : TestWorkload { } } } - // Pick a machine to kill, ignoring testers and preserving majority of coordinators + // Pick a machine to kill state WorkerDetails targetMachine; - while (!noSimIsViableKill(coordFaultTolerance, killedCoord, coordAddrs, workers.back())) { - deterministicRandom()->randomShuffle(workers); - } + deterministicRandom()->randomShuffle(workers); targetMachine = workers.back(); TraceEvent("Assassination") .detail("TargetMachine", targetMachine.interf.locality.toString()) @@ -229,7 +210,7 @@ struct MachineAttritionWorkload : TestWorkload { .detail("KilledMachines", killedMachines) .detail("MachinesToKill", self->machinesToKill) .detail("MachinesToLeave", self->machinesToLeave) - .detail("Machines", self->machines.size()); + .detail("Machines", workers.size()); targetMachine.interf.clientInterface.reboot.send(rbReq); killedMachines++; workers.pop_back(); From d97ff756386b7f4cb09518b294133f3b2e7ee45c Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 23 Oct 2019 11:29:47 -0700 Subject: [PATCH 064/184] added mode to specifically kill all workers with same machineId --- fdbserver/workloads/MachineAttrition.actor.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 38b54609d3..fd7c5cdcfb 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -64,6 +64,7 @@ struct MachineAttritionWorkload : TestWorkload { double testDuration, 
suspendDuration; bool reboot; bool killDc; + bool killMachine; bool killSelf; bool replacement; bool waitForVersion; @@ -83,6 +84,7 @@ struct MachineAttritionWorkload : TestWorkload { suspendDuration = getOption( options, LiteralStringRef("suspendDuration"), 1.0 ); reboot = getOption( options, LiteralStringRef("reboot"), false ); killDc = getOption( options, LiteralStringRef("killDc"), deterministicRandom()->random01() < 0.25 ); + killMachine = getOption( options, LiteralStringRef("killMachine"), false); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 ); waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false ); @@ -178,6 +180,18 @@ struct MachineAttritionWorkload : TestWorkload { worker.interf.clientInterface.reboot.send(rbReq); } } + } else if (self->killMachine) { + wait(delay(delayBeforeKill)); + // Pick a machine to kill + deterministicRandom()->randomShuffle(workers); + Optional> killMachineId = workers.back().interf.locality.machineId(); + TraceEvent("Assassination").detail("TargetMachine", killMachineId); + for (const auto& worker : workers) { + // kill all matching machine workers + if (worker.interf.locality.machineId().present() && worker.interf.locality.machineId() == killMachineId) { + worker.interf.clientInterface.reboot.send(rbReq); + } + } } else { while (killedMachines < self->machinesToKill && workers.size() > self->machinesToLeave) { TraceEvent("WorkerKillBegin") From fc31c8dafaf9a583fa96a1da3239ad4f45acc093 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 11:55:04 -0700 Subject: [PATCH 065/184] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index dd1524d0c7..620add1a09 100644 --- 
a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Wed, 23 Oct 2019 11:58:59 -0700 Subject: [PATCH 066/184] - Changed SHARD_MAX_BYTES_READ_PRE_KEYSEC to be equivalent to 8MiB/s, which when times the sample expire interval(120 seconds) yields 960MiB/s. A shard having a read rate larger than that will be marked as read-hot. The number 960MiB was chosen to be roughtly twice the size of the max allowed shard size to avoid wrongly marking a shard as read-hot when doing a table scan on it. - Also tuned down the empty key sampling percentage to be 5%. --- fdbserver/Knobs.cpp | 6 +++--- fdbserver/storageserver.actor.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index eb409c147d..1c5657707b 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -129,8 +129,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_SHARD_BYTES, 500000000 ); init( KEY_SERVER_SHARD_BYTES, 500000000 ); bool buggifySmallReadBandwidth = randomize && BUGGIFY; - init( SHARD_MAX_BYTES_READ_PER_KSEC, 100LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; - /* 100*1MB/sec * 1000sec/ksec + init( SHARD_MAX_BYTES_READ_PER_KSEC, 3LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; + /* 8*1MB/sec * 1000sec/ksec Shards with more than this read bandwidth will be considered as a read cache candidate */ init( SHARD_MAX_BYTES_READ_PER_KSEC_JITTER, 0.1 ); @@ -454,7 +454,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); - init( 
BYTES_READ_UNITS_PER_SAMPLE, 10000 ); + init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 342f4d87ad..44de8a8444 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -893,7 +893,7 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { // If the read yields no value, randomly sample the empty read. metrics.bytesReadPerKSecond = v.present() ? (int64_t)(req.key.size() + v.get().size()) - : deterministicRandom()->random01() > 0.5 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; + : deterministicRandom()->random01() > 0.95 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; data->metrics.notify(req.key, metrics); if( req.debugID.present() ) @@ -1336,7 +1336,7 @@ ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version vers StorageMetrics metrics; // Randomly sample an empty read metrics.bytesReadPerKSecond = - deterministicRandom()->random01() > 0.5 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; + deterministicRandom()->random01() > 0.95 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; data->metrics.notify(sel.getKey(), metrics); // FIXME: If range.begin=="" && !forward, return success? 
From 41f0cd624b8627f9d6331b9758999140d800862f Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 2019 13:36:19 -0700 Subject: [PATCH 067/184] FastRestore:Applier:Use shouldCommit to replace the duplicate code --- fdbserver/RestoreApplier.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index c0b81615f6..64cfda039e 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -339,7 +339,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { progress.nextMutation(); // Prepare for the next mutation // commit per transactionBatchSizeThreshold bytes; and commit does not cross version boundary - if (progress.transactionSize >= opConfig.transactionBatchSizeThreshold || progress.startNextVersion || progress.isDone()) { + if (progress.shouldCommit()) { break; // Got enough mutation in the txn } } From eb910b850b2a20e88a449df3b563bce1a1aed49d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 13:48:24 -0700 Subject: [PATCH 068/184] fixed a window build error --- fdbserver/VersionedBTree.actor.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9f5db9a5f7..33c81c6708 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1276,10 +1276,6 @@ public: return closedPromise.getFuture(); } - Future onClose() override { - return closedPromise.getFuture(); - } - StorageBytes getStorageBytes() override { ASSERT(recoverFuture.isReady()); int64_t free; From ba7e499efeeac6bec676d90cf25ff151f5c0f05d Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 2019 13:57:40 -0700 Subject: [PATCH 069/184] FastRestore:AtomicOpTest:Limit 1 actor per client --- fdbserver/RestoreApplier.actor.cpp | 2 +- fdbserver/workloads/AtomicOps.actor.cpp | 6 ++++-- tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt | 4 ++-- 3 
files changed, 7 insertions(+), 5 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 64cfda039e..800ea02079 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -359,7 +359,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { .detail("TxnStatus", "?") .detail("ApplierApplyToDB", self->id()) .detail("TxnId", progress.curTxnId) - .detail("StartIndexInCurrentTxn", progress.curIndexInCurTxn) + .detail("CurrentIndexInCurrentTxn", progress.curIndexInCurTxn) .detail("Version", progress.curItInCurTxn->first) .error(e, true); progress.lastTxnHasError = true; diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index 9188f6d094..14180a3327 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -102,10 +102,12 @@ struct AtomicOpsWorkload : TestWorkload { } virtual Future start( Database const& cx ) { - for(int c=0; cclone(), this, actorCount / transactionsPerSecond ), testDuration, Void()) ); + } + return delay(testDuration); } diff --git a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt index a15eca91fa..1c168afb81 100644 --- a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt +++ b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt @@ -2,8 +2,8 @@ testTitle=BackupAndParallelRestoreWithAtomicOp testName=AtomicOps nodeCount=30000 ; transactionsPerSecond=2500.0 -; transactionsPerSecond=500.0 - transactionsPerSecond=100.0 + transactionsPerSecond=500.0 +; transactionsPerSecond=100.0 ; nodeCount=4 ; transactionsPerSecond=250.0 testDuration=30.0 From 103cc37a35e569b9b35591e65c052fe2ae490f05 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 23 Oct 2019 14:19:17 -0700 Subject: [PATCH 070/184] added datahall kill and option to target a specific datahall/dc/machine id --- fdbserver/worker.actor.cpp | 4 ++- 
.../workloads/MachineAttrition.actor.cpp | 27 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index 70ca357b2c..fcc05bed66 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -964,8 +964,10 @@ ACTOR Future workerServer( when( RebootRequest req = waitNext( interf.clientInterface.reboot.getFuture() ) ) { state RebootRequest rebootReq = req; + // If suspendDuration is INT_MAX, the trace will not be logged if it was inside the next block + // Also a useful trace to have even if suspendDuration is 0 + TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); if(req.waitForDuration) { - TraceEvent("RebootRequestSuspendingProcess").detail("Duration", req.waitForDuration); flushTraceFileVoid(); setProfilingEnabled(0); g_network->stop(); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index fd7c5cdcfb..fdb9ac2ab0 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -65,7 +65,9 @@ struct MachineAttritionWorkload : TestWorkload { bool reboot; bool killDc; bool killMachine; + bool killDatahall; bool killSelf; + std::string targetId; bool replacement; bool waitForVersion; bool allowFaultInjection; @@ -85,7 +87,9 @@ struct MachineAttritionWorkload : TestWorkload { reboot = getOption( options, LiteralStringRef("reboot"), false ); killDc = getOption( options, LiteralStringRef("killDc"), deterministicRandom()->random01() < 0.25 ); killMachine = getOption( options, LiteralStringRef("killMachine"), false); + killDatahall = getOption( options, LiteralStringRef("killDatahall"), false); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); + targetId = getOption( options, LiteralStringRef("targetId"), ""); replacement = getOption( options, LiteralStringRef("replacement"), reboot && 
deterministicRandom()->random01() < 0.5 ); waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false ); allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true ); @@ -172,11 +176,12 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a dcId to kill deterministicRandom()->randomShuffle(workers); - Optional> killDcId = workers.back().interf.locality.dcId(); - TraceEvent("Assassination").detail("TargetDataCenter", killDcId); + Optional> killDcId = self->targetId.empty() ? workers.back().interf.locality.dcId() : self->targetId; + TraceEvent("Assassination").detail("TargetDataCenterId", killDcId); for (const auto& worker : workers) { // kill all matching dcId workers if (worker.interf.locality.dcId().present() && worker.interf.locality.dcId() == killDcId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); worker.interf.clientInterface.reboot.send(rbReq); } } @@ -184,11 +189,25 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a machine to kill deterministicRandom()->randomShuffle(workers); - Optional> killMachineId = workers.back().interf.locality.machineId(); - TraceEvent("Assassination").detail("TargetMachine", killMachineId); + Optional> killMachineId = self->targetId.empty() ? 
workers.back().interf.locality.machineId() : self->targetId; + TraceEvent("Assassination").detail("TargetMachineId", killMachineId); for (const auto& worker : workers) { // kill all matching machine workers if (worker.interf.locality.machineId().present() && worker.interf.locality.machineId() == killMachineId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); + worker.interf.clientInterface.reboot.send(rbReq); + } + } + } else if (self->killDatahall) { + wait(delay(delayBeforeKill)); + // Pick a datahall to kill + deterministicRandom()->randomShuffle(workers); + Optional> killDatahallId = self->targetId.empty() ? workers.back().interf.locality.dataHallId() : self->targetId; + TraceEvent("Assassination").detail("TargetDatahallId", killDatahallId); + for (const auto& worker : workers) { + // kill all matching datahall workers + if (worker.interf.locality.dataHallId().present() && worker.interf.locality.dataHallId() == killDatahallId) { + TraceEvent("SendingRebootRequest").detail("TargetMachine", worker.interf.locality.toString()); worker.interf.clientInterface.reboot.send(rbReq); } } From 7af3239ee794cb8bf2dfa3960ee8fd4e8c777d28 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 2019 14:36:34 -0700 Subject: [PATCH 071/184] FastRestore:AtomicOpTest:Debug:1 key per group for ops keyspace --- tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt index 1c168afb81..39dc51032e 100644 --- a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt +++ b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt @@ -1,6 +1,8 @@ testTitle=BackupAndParallelRestoreWithAtomicOp testName=AtomicOps - nodeCount=30000 +; nodeCount=30000 +; Make ops space only 1 key per group + nodeCount=100 ; transactionsPerSecond=2500.0 
transactionsPerSecond=500.0 ; transactionsPerSecond=100.0 From ab262e5e4dddef08ecbabb5d0f8097447549c9e7 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Wed, 23 Oct 2019 14:55:28 -0700 Subject: [PATCH 072/184] use StringRef over std::string for workload params --- fdbserver/workloads/MachineAttrition.actor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index fdb9ac2ab0..1fc0e34ea2 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -67,7 +67,7 @@ struct MachineAttritionWorkload : TestWorkload { bool killMachine; bool killDatahall; bool killSelf; - std::string targetId; + Standalone targetId; bool replacement; bool waitForVersion; bool allowFaultInjection; @@ -89,7 +89,7 @@ struct MachineAttritionWorkload : TestWorkload { killMachine = getOption( options, LiteralStringRef("killMachine"), false); killDatahall = getOption( options, LiteralStringRef("killDatahall"), false); killSelf = getOption( options, LiteralStringRef("killSelf"), false ); - targetId = getOption( options, LiteralStringRef("targetId"), ""); + targetId = getOption( options, LiteralStringRef("targetId"), LiteralStringRef("")); replacement = getOption( options, LiteralStringRef("replacement"), reboot && deterministicRandom()->random01() < 0.5 ); waitForVersion = getOption( options, LiteralStringRef("waitForVersion"), false ); allowFaultInjection = getOption( options, LiteralStringRef("allowFaultInjection"), true ); @@ -176,7 +176,7 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a dcId to kill deterministicRandom()->randomShuffle(workers); - Optional> killDcId = self->targetId.empty() ? workers.back().interf.locality.dcId() : self->targetId; + Optional> killDcId = self->targetId.toString().empty() ? 
workers.back().interf.locality.dcId() : self->targetId; TraceEvent("Assassination").detail("TargetDataCenterId", killDcId); for (const auto& worker : workers) { // kill all matching dcId workers @@ -189,7 +189,7 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a machine to kill deterministicRandom()->randomShuffle(workers); - Optional> killMachineId = self->targetId.empty() ? workers.back().interf.locality.machineId() : self->targetId; + Optional> killMachineId = self->targetId.toString().empty() ? workers.back().interf.locality.machineId() : self->targetId; TraceEvent("Assassination").detail("TargetMachineId", killMachineId); for (const auto& worker : workers) { // kill all matching machine workers @@ -202,7 +202,7 @@ struct MachineAttritionWorkload : TestWorkload { wait(delay(delayBeforeKill)); // Pick a datahall to kill deterministicRandom()->randomShuffle(workers); - Optional> killDatahallId = self->targetId.empty() ? workers.back().interf.locality.dataHallId() : self->targetId; + Optional> killDatahallId = self->targetId.toString().empty() ? 
workers.back().interf.locality.dataHallId() : self->targetId; TraceEvent("Assassination").detail("TargetDatahallId", killDatahallId); for (const auto& worker : workers) { // kill all matching datahall workers From bae0c907a640792b77c6d10c9497b00dd5948119 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 2019 15:05:03 -0700 Subject: [PATCH 073/184] FastRestore:Convert unnecessary actor function to plain function --- fdbserver/RestoreApplier.actor.cpp | 3 ++- fdbserver/RestoreLoader.actor.cpp | 25 +++++++++++++------------ fdbserver/RestoreRoleCommon.actor.cpp | 4 ++-- fdbserver/RestoreRoleCommon.actor.h | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 800ea02079..fd92e71c3c 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -69,7 +69,8 @@ ACTOR Future restoreApplierCore(RestoreApplierInterface applierInterf, int } when(RestoreVersionBatchRequest req = waitNext(applierInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; - exitRole = handleFinishRestoreRequest(req, self); + handleFinishRestoreRequest(req, self); + exitRole = Void(); } when(wait(exitRole)) { TraceEvent("FastRestore").detail("RestoreApplierCore", "ExitRole").detail("NodeID", self->id()); diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 5be0b73c1d..7e936f0faf 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -39,9 +39,9 @@ void splitMutation(Reference self, MutationRef m, Arena& mvec void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationListMap* mutationMap, bool isSampling = false); -ACTOR Future handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self); -ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, - Reference self); +void handleRestoreSysInfoRequest(RestoreSysInfoRequest 
req, Reference self); +void handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, + Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, bool isSampling = false); ACTOR Future sendMutationsToApplier(Reference self, VersionedMutationsMap* kvOps, @@ -72,12 +72,12 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } when(RestoreSysInfoRequest req = waitNext(loaderInterf.updateRestoreSysInfo.getFuture())) { requestTypeStr = "updateRestoreSysInfo"; - actors.add(handleRestoreSysInfoRequest(req, self)); + handleRestoreSysInfoRequest(req, self); } when(RestoreSetApplierKeyRangeVectorRequest req = waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture())) { requestTypeStr = "setApplierKeyRangeVectorRequest"; - actors.add(handleSetApplierKeyRangeVectorRequest(req, self)); + handleSetApplierKeyRangeVectorRequest(req, self); } when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) { requestTypeStr = "loadFile"; @@ -90,7 +90,8 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; - exitRole = handleFinishRestoreRequest(req, self); + handleFinishRestoreRequest(req, self); + exitRole = Void(); } when(wait(exitRole)) { TraceEvent("FastRestore").detail("RestoreLoaderCore", "ExitRole").detail("NodeID", self->id()); @@ -109,31 +110,31 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } // Assume: Only update the local data if it (applierInterf) has not been set -ACTOR Future handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self) { +void handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self) { TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id()); ASSERT(self.isValid()); // The loader has received the appliers interfaces if 
(!self->appliersInterf.empty()) { req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } self->appliersInterf = req.sysInfo.appliers; req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } -ACTOR Future handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, - Reference self) { +void handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, + Reference self) { // Idempodent operation. OK to re-execute the duplicate cmd if (self->rangeToApplier.empty()) { self->rangeToApplier = req.rangeToApplier; } req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } ACTOR Future _processLoadingParam(LoadingParam param, Reference self) { diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index 6217dc8c85..b6c2e51deb 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -43,7 +43,7 @@ ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id) { return Void(); } -ACTOR Future handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self) { +void handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self) { if (self->versionBatchStart) { self->versionBatchStart = false; } @@ -55,7 +55,7 @@ ACTOR Future handleFinishRestoreRequest(RestoreVersionBatchRequest req, Re req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 86d63bbaa4..de02d4630b 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -55,7 +55,7 @@ typedef std::map>> VersionedMutations ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); -ACTOR Future 
handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self); +void handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. // This struct is mostly copied from StringRefReader. We add a sanity check in this struct. From 1ae02dd1df396dd93da783455e221e3a3827ef39 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 2019 17:21:45 -0700 Subject: [PATCH 074/184] FastRestore:AtomicOp test:Add sanity check for setup step --- fdbserver/workloads/AtomicOps.actor.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index 14180a3327..15e8809f8b 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -123,6 +123,24 @@ struct AtomicOpsWorkload : TestWorkload { Key logKey( int group ) { return StringRef(format("log%08x%08x%08x",group,clientId,opNum++));} ACTOR Future _setup( Database cx, AtomicOpsWorkload* self ) { + // Sanity check if log keyspace has elements + state ReadYourWritesTransaction tr1(cx); + loop { + try { + Key begin(std::string("log")); + Standalone log = wait( tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY) ); + if (!log.empty()) { + TraceEvent(SevError, "AtomicOpSetup").detail("LogKeySpace", "Not empty").detail("Result", log.toString()); + for(auto& kv : log) { + TraceEvent(SevWarn, "AtomicOpSetup").detail("K", kv.key.toString()).detail("V", kv.value.toString()); + } + } + break; + } catch( Error &e ) { + wait( tr1.onError(e) ); + } + } + state int g = 0; for(; g < 100; g++) { state ReadYourWritesTransaction tr(cx); @@ -168,7 +186,6 @@ struct AtomicOpsWorkload : TestWorkload { break; } catch( Error &e ) { wait( tr.onError(e) ); - // self->opNum--; } } } From b1881a7c1c52eef650c55440fd1028cd469ae7c4 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Oct 
2019 20:49:14 -0700 Subject: [PATCH 075/184] FastRestore:Apply clang-format --- fdbserver/RestoreApplier.actor.cpp | 7 ++++--- fdbserver/workloads/AtomicOps.actor.cpp | 24 ++++++++++++++---------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index fd92e71c3c..61e7b1b1d7 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -220,7 +220,8 @@ struct DBApplyProgress { } bool shouldCommit() { - return (!lastTxnHasError && (startNextVersion || transactionSize >= opConfig.transactionBatchSizeThreshold || curItInCurTxn == self->kvOps.end())); + return (!lastTxnHasError && (startNextVersion || transactionSize >= opConfig.transactionBatchSizeThreshold || + curItInCurTxn == self->kvOps.end())); } bool hasError() { return lastTxnHasError; } @@ -320,8 +321,8 @@ ACTOR Future applyToDB(Reference self, Database cx) { .detail("Version", progress.curItInCurTxn->first) .detail("Index", progress.curIndexInCurTxn) .detail("Mutation", m.toString()) - .detail("MutationSize", m.expectedSize()) - .detail("TxnSize", progress.transactionSize); + .detail("MutationSize", m.expectedSize()) + .detail("TxnSize", progress.transactionSize); if (m.type == MutationRef::SetValue) { tr->set(m.param1, m.param2); } else if (m.type == MutationRef::ClearRange) { diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index 15e8809f8b..d090d71249 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -102,12 +102,11 @@ struct AtomicOpsWorkload : TestWorkload { } virtual Future start( Database const& cx ) { - for(int c=0; cclone(), this, actorCount / transactionsPerSecond ), testDuration, Void()) ); + timeout(atomicOpWorker(cx->clone(), this, actorCount / transactionsPerSecond), testDuration, Void())); } - + return delay(testDuration); } @@ -128,16 +127,21 @@ struct AtomicOpsWorkload : 
TestWorkload { loop { try { Key begin(std::string("log")); - Standalone log = wait( tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY) ); + Standalone log = + wait(tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); if (!log.empty()) { - TraceEvent(SevError, "AtomicOpSetup").detail("LogKeySpace", "Not empty").detail("Result", log.toString()); - for(auto& kv : log) { - TraceEvent(SevWarn, "AtomicOpSetup").detail("K", kv.key.toString()).detail("V", kv.value.toString()); + TraceEvent(SevError, "AtomicOpSetup") + .detail("LogKeySpace", "Not empty") + .detail("Result", log.toString()); + for (auto& kv : log) { + TraceEvent(SevWarn, "AtomicOpSetup") + .detail("K", kv.key.toString()) + .detail("V", kv.value.toString()); } } break; - } catch( Error &e ) { - wait( tr1.onError(e) ); + } catch (Error& e) { + wait(tr1.onError(e)); } } From f8e44d2f712952e6b9a1f439db2431dd89fc4bce Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 23:04:39 -0700 Subject: [PATCH 076/184] fix: If a storage server was offline, it would not be checked for being in an undesired dc --- fdbserver/DataDistribution.actor.cpp | 36 +++++++++++++++++++--------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp index 73891b11f1..001afcbb99 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -3175,8 +3175,21 @@ ACTOR Future serverMetricsPolling( TCServerInfo *server) { } } -//Returns the KeyValueStoreType of server if it is different from self->storeType -ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) { +//Returns if the KeyValueStoreType of server is different from self->storeType or the desired datacenter does not match +ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, TCServerInfo *server) { + if ((!self->includedDCs.empty() && + std::find(self->includedDCs.begin(), 
self->includedDCs.end(), server->lastKnownInterface.locality.dcId()) == + self->includedDCs.end()) || + (!self->isValidLocality(self->configuration.storagePolicy, server->lastKnownInterface.locality))) { + TraceEvent("KeyValueStoreTypeChanged", self->distributorId) + .detail("ServerID", server->id) + .detail("StoreType", "?") + .detail("DesiredType", self->configuration.storageServerStoreType.toString()) + .detail("IsValidLocality", self->isValidLocality(self->configuration.storagePolicy, + server->lastKnownInterface.locality)); + return Void(); + } + state KeyValueStoreType type = wait(brokenPromiseToNever(server->lastKnownInterface.getKeyValueStoreType.getReplyWithTaskID(TaskPriority::DataDistribution))); if (type == self->configuration.storageServerStoreType && (self->includedDCs.empty() || @@ -3186,7 +3199,14 @@ ACTOR Future keyValueStoreTypeTracker(DDTeamCollection* self, wait(Future(Never())); } - return type; + TraceEvent("KeyValueStoreTypeChanged", self->distributorId) + .detail("ServerID", server->id) + .detail("StoreType", type.toString()) + .detail("DesiredType", self->configuration.storageServerStoreType.toString()) + .detail("IsValidLocality", self->isValidLocality(self->configuration.storagePolicy, + server->lastKnownInterface.locality)); + + return Void(); } ACTOR Future waitForAllDataRemoved( Database cx, UID serverID, Version addedVersion, DDTeamCollection* teams ) { @@ -3302,7 +3322,7 @@ ACTOR Future storageServerTracker( state Future metricsTracker = serverMetricsPolling( server ); state Future> interfaceChanged = server->onInterfaceChanged; - state Future storeTracker = keyValueStoreTypeTracker( self, server ); + state Future storeTracker = keyValueStoreTypeTracker( self, server ); state bool hasWrongStoreTypeOrDC = false; state int targetTeamNumPerServer = (SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (self->configuration.storageTeamSize + 1)) / 2; @@ -3527,13 +3547,7 @@ ACTOR Future storageServerTracker( when( wait( otherChanges.empty() ? 
Never() : quorum( otherChanges, 1 ) ) ) { TraceEvent("SameAddressChangedStatus", self->distributorId).detail("ServerID", server->id); } - when( KeyValueStoreType type = wait( storeTracker ) ) { - TraceEvent("KeyValueStoreTypeChanged", self->distributorId) - .detail("ServerID", server->id) - .detail("StoreType", type.toString()) - .detail("DesiredType", self->configuration.storageServerStoreType.toString()) - .detail("IsValidLocality", self->isValidLocality(self->configuration.storagePolicy, - server->lastKnownInterface.locality)); + when( wait( storeTracker ) ) { TEST(true); //KeyValueStore type changed storeTracker = Never(); From a7492aab0ada02424f6e43d0bfbc7a947fee82ed Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 23 Oct 2019 23:06:02 -0700 Subject: [PATCH 077/184] fix: poppedVersion can update during a yield, so all work must be done immediately after getMore returns --- fdbserver/LogSystemPeekCursor.actor.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 98ba5a4bb0..4c4409c0c0 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -991,8 +991,16 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { } ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { + if(cursor->version().version >= maxVersion) { + return Void(); + } loop { wait(yield()); + wait(cursor->getMore(taskID)); + self->poppedVersion = std::max(self->poppedVersion, cursor->popped()); + if(self->canDiscardPopped) { + self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped()); + } if(cursor->version().version >= maxVersion) { return Void(); } @@ -1003,11 +1011,6 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe return Void(); } } - wait(cursor->getMore(taskID)); - 
self->poppedVersion = std::max(self->poppedVersion, cursor->popped()); - if(self->canDiscardPopped) { - self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped()); - } } } From 5d7c84b80339e072484eaf29e12c74b0cd4949f3 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 24 Oct 2019 09:45:04 -0700 Subject: [PATCH 078/184] moved shuffle outside of the conditional blocks --- fdbserver/workloads/MachineAttrition.actor.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 1fc0e34ea2..9cd608b0e6 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -172,10 +172,10 @@ struct MachineAttritionWorkload : TestWorkload { workers.push_back(worker); } } + deterministicRandom()->randomShuffle(workers); if (self->killDc) { wait(delay(delayBeforeKill)); // Pick a dcId to kill - deterministicRandom()->randomShuffle(workers); Optional> killDcId = self->targetId.toString().empty() ? workers.back().interf.locality.dcId() : self->targetId; TraceEvent("Assassination").detail("TargetDataCenterId", killDcId); for (const auto& worker : workers) { @@ -188,7 +188,6 @@ struct MachineAttritionWorkload : TestWorkload { } else if (self->killMachine) { wait(delay(delayBeforeKill)); // Pick a machine to kill - deterministicRandom()->randomShuffle(workers); Optional> killMachineId = self->targetId.toString().empty() ? workers.back().interf.locality.machineId() : self->targetId; TraceEvent("Assassination").detail("TargetMachineId", killMachineId); for (const auto& worker : workers) { @@ -201,7 +200,6 @@ struct MachineAttritionWorkload : TestWorkload { } else if (self->killDatahall) { wait(delay(delayBeforeKill)); // Pick a datahall to kill - deterministicRandom()->randomShuffle(workers); Optional> killDatahallId = self->targetId.toString().empty() ? 
workers.back().interf.locality.dataHallId() : self->targetId; TraceEvent("Assassination").detail("TargetDatahallId", killDatahallId); for (const auto& worker : workers) { @@ -235,7 +233,6 @@ struct MachineAttritionWorkload : TestWorkload { } // Pick a machine to kill state WorkerDetails targetMachine; - deterministicRandom()->randomShuffle(workers); targetMachine = workers.back(); TraceEvent("Assassination") .detail("TargetMachine", targetMachine.interf.locality.toString()) From 7579bc7e7e9048bf0c7ed57d519425cb315da454 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 24 Oct 2019 10:09:37 -0700 Subject: [PATCH 079/184] updated release notes --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index f964f8bcf4..6dabb859b9 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -60,6 +60,7 @@ Fixes * Don't track batch priority GRV requests in latency bands. [6.2.7] `(PR #2279) `_. * Transaction log processes used twice their normal memory when switching spill types. [6.2.7] `(PR #2256) `_. * Under certain conditions, cross region replication could stall for 10 minute periods. [6.2.7] `(PR #1818) `_ `(PR #2276) `_. +* When dropping a remote region from the configuration after processes in the region have failed, data distribution would create teams from the dead servers for one minute. [6.2.7] `(PR #2286) `_. 
Status ------ From a290e2cb2b25e1abf11d656afe4ca58c0683a917 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Thu, 24 Oct 2019 11:02:17 -0700 Subject: [PATCH 080/184] Use 8 MiB for real --- fdbserver/Knobs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 1c5657707b..4db024fed4 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -129,7 +129,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( MAX_SHARD_BYTES, 500000000 ); init( KEY_SERVER_SHARD_BYTES, 500000000 ); bool buggifySmallReadBandwidth = randomize && BUGGIFY; - init( SHARD_MAX_BYTES_READ_PER_KSEC, 3LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; + init( SHARD_MAX_BYTES_READ_PER_KSEC, 8LL*1000000*1000 ); if( buggifySmallReadBandwidth ) SHARD_MAX_BYTES_READ_PER_KSEC = 100LL*1000*1000; /* 8*1MB/sec * 1000sec/ksec Shards with more than this read bandwidth will be considered as a read cache candidate */ From 48aa55699a9bef4e342227cf6b07556183f38004 Mon Sep 17 00:00:00 2001 From: Kao Makino Date: Thu, 24 Oct 2019 16:46:25 +0000 Subject: [PATCH 081/184] Cleanup mako c-binding benchmark --- bindings/c/test/mako/mako.c | 790 +++++++++++++++++----------------- bindings/c/test/mako/mako.h | 17 +- bindings/c/test/mako/mako.rst | 33 +- 3 files changed, 434 insertions(+), 406 deletions(-) diff --git a/bindings/c/test/mako/mako.c b/bindings/c/test/mako/mako.c index b365ce3d32..cc8cdc785f 100755 --- a/bindings/c/test/mako/mako.c +++ b/bindings/c/test/mako/mako.c @@ -23,12 +23,17 @@ #include "utils.h" #include "fdbclient/zipf.h" +/* global variables */ +FILE *printme; /* descriptor used for default messages */ +FILE *annoyme; /* descriptor used for annoying messages */ +FILE *debugme; /* descriptor used for debug messages */ + #define check_fdb_error(_e) \ do { \ if (_e) { \ fprintf(stderr, "ERROR: Failed at %s:%d (%s)\n", __FILE__, __LINE__, \ fdb_get_error(_e)); \ - goto 
FDB_FAIL; \ + goto failExit; \ } \ } while (0) @@ -37,10 +42,47 @@ if ((fdb_future_block_until_ready(_f)) != 0) { \ fprintf(stderr, "ERROR: fdb_future_block_until_ready failed at %s:%d\n", \ __FILE__, __LINE__); \ - goto FDB_FAIL; \ + goto failExit; \ } \ } while (0) +#define fdb_wait_and_handle_error(_func, _f, _t) \ + do { \ + int err = wait_future(_f); \ + if (err) { \ + int err2; \ + if ((err != 1020 /* not_committed */) && \ + (err != 1021 /* commit_unknown_result */)) { \ + fprintf(stderr, "ERROR: Error %s (%d) occured at %s\n", \ + #_func, err, fdb_get_error(err)); \ + } else { \ + fprintf(annoyme, "ERROR: Error %s (%d) occured at %s\n", \ + #_func, err, fdb_get_error(err)); \ + } \ + fdb_future_destroy(_f); \ + _f = fdb_transaction_on_error(_t, err); \ + /* this will return the original error for non-retryable errors */ \ + err2 = wait_future(_f); \ + fdb_future_destroy(_f); \ + if (err2) { \ + /* unretryable error */ \ + fprintf(stderr, \ + "ERROR: fdb_transaction_on_error returned %d at %s:%d\n", \ + err2, __FILE__, __LINE__); \ + fdb_transaction_reset(_t); \ + /* TODO: if we adda retry limit in the future, \ + * handle the conflict stats properly. 
\ + */ \ + return FDB_ERROR_ABORT; \ + } \ + if (err == 1020 /* not_committed */) { \ + return FDB_ERROR_CONFLICT; \ + } \ + return FDB_ERROR_RETRY; \ + } \ + } while (0) + + fdb_error_t wait_future(FDBFuture *f) { fdb_error_t err; @@ -52,47 +94,17 @@ fdb_error_t wait_future(FDBFuture *f) { } -int commit_transaction(FDBTransaction *transaction, mako_stats_t *stats) { +int commit_transaction(FDBTransaction *transaction) { FDBFuture *f; - fdb_error_t err = 0; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_commit(transaction); - err = wait_future(f); - fdb_future_destroy(f); - if (stats) { - if (err == 1020 /* not_committed */) - stats->conflicts++; - else { - stats->errors[OP_COMMIT]++; - } - } - - if (err) { - fprintf(stderr, "ERROR: Error %d occured at fdb_transaction_commit\n", - err); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - fprintf(stderr, - "ERROR: fdb_transaction_on_error returned %d at %s:%d\n", - err, __FILE__, __LINE__); - break; - } - } else { - if (stats) - stats->ops[OP_COMMIT]++; - break; - } - } while (err && retry--); - - return err; + f = fdb_transaction_commit(transaction); + fdb_wait_and_handle_error(commit_transaction, f, transaction); + + return FDB_SUCCESS; } -void update_op_stats(struct timespec *start, struct timespec *end, int op, + +void update_op_lat_stats(struct timespec *start, struct timespec *end, int op, mako_stats_t *stats) { uint64_t latencyus; @@ -109,13 +121,12 @@ void update_op_stats(struct timespec *start, struct timespec *end, int op, } } + /* FDB network thread */ void *fdb_network_thread(void *args) { fdb_error_t err; - if (((mako_args_t *)args)->verbose == VERBOSE_DEBUG) { - printf("DEBUG: fdb_network_thread started\n"); - } + fprintf(debugme, "DEBUG: fdb_network_thread started\n"); err = fdb_run_network(); if (err) { @@ -125,6 +136,7 @@ void *fdb_network_thread(void *args) { return 0; } + /* cleanup database */ int 
cleanup(FDBTransaction *transaction, mako_args_t *args) { struct timespec timer_start, timer_end; @@ -138,24 +150,23 @@ int cleanup(FDBTransaction *transaction, mako_args_t *args) { clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_start); fdb_transaction_clear_range(transaction, (uint8_t *)beginstr, 5, (uint8_t *)endstr, 5); - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if (commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; fdb_transaction_reset(transaction); clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_end); - if (args->verbose >= VERBOSE_DEFAULT) { - printf("INFO: Clear range: %6.3f sec\n", - ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + - timer_end.tv_nsec - timer_start.tv_nsec) / - 1000000000); - } + fprintf(printme, "INFO: Clear range: %6.3f sec\n", + ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + + timer_end.tv_nsec - timer_start.tv_nsec) / + 1000000000); return 0; -FDB_FAIL: +failExit: fprintf(stderr, "ERROR: FDB failure in cleanup()\n"); return -1; } + /* populate database */ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, int thread_id, int thread_tps, mako_stats_t *stats) { @@ -221,12 +232,12 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, /* commit every 100 inserts (default) */ if (i % args->txnspec.ops[OP_INSERT][OP_COUNT] == 0) { - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if (commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; /* xact latency stats */ clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); stats->ops[OP_COMMIT]++; clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); @@ -237,29 +248,27 @@ int populate(FDBTransaction *transaction, mako_args_t *args, int worker_id, } } - if (commit_transaction(transaction, NULL)) - goto FDB_FAIL; + if 
(commit_transaction(transaction) != FDB_SUCCESS) + goto failExit; /* xact latency stats */ clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, stats); clock_gettime(CLOCK_MONOTONIC, &timer_end); stats->xacts++; - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: Populated %d rows (%d-%d): %6.3f sec\n", end - begin, begin, - end, - ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + - timer_end.tv_nsec - timer_start.tv_nsec) / - 1000000000); - } + fprintf(debugme, "DEBUG: Populated %d rows (%d-%d): %6.3f sec\n", end - begin, begin, + end, + ((timer_end.tv_sec - timer_start.tv_sec) * 1000000000.0 + + timer_end.tv_nsec - timer_start.tv_nsec) / + 1000000000); free(keystr); free(valstr); return 0; -FDB_FAIL: +failExit: if (keystr) free(keystr); if (valstr) @@ -268,50 +277,40 @@ FDB_FAIL: return -1; } -int64_t run_op_getreadversion(FDBTransaction *transaction) { - int64_t rv = 0; + +int64_t run_op_getreadversion(FDBTransaction *transaction, int64_t *rv) { FDBFuture *f; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_get_read_version(transaction); - err = wait_future(f); + *rv = 0; - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get_read_version: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get_read_version(transaction); + fdb_wait_and_handle_error(fdb_transaction_get_read_version, f, transaction); #if FDB_API_VERSION < 620 - err = fdb_future_get_version(f, &rv); + err = fdb_future_get_version(f, rv); #else - err = fdb_future_get_int64(f, &rv); + err = fdb_future_get_int64(f, rv); #endif + fdb_future_destroy(f); if (err) { #if 
FDB_API_VERSION < 620 fprintf(stderr, "ERROR: fdb_future_get_version: %s\n", fdb_get_error(err)); #else fprintf(stderr, "ERROR: fdb_future_get_int64: %s\n", fdb_get_error(err)); #endif + return FDB_ERROR_RETRY; } - fdb_future_destroy(f); - return rv; + + /* fail if rv not properly set */ + if (!*rv) { + return FDB_ERROR_RETRY; + } + return FDB_SUCCESS; } + int run_op_get(FDBTransaction *transaction, char *keystr, char *valstr, int snapshot) { FDBFuture *f; @@ -319,41 +318,23 @@ int run_op_get(FDBTransaction *transaction, char *keystr, char *valstr, char *val; int vallen; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; - - do { - f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), - snapshot); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), + snapshot); + fdb_wait_and_handle_error(fdb_transaction_get, f, transaction); + err = fdb_future_get_value(f, &out_present, (const uint8_t **)&val, &vallen); fdb_future_destroy(f); if (err || !out_present) { /* error or value not present */ - return -1; + return FDB_ERROR_RETRY; } strncpy(valstr, val, vallen); valstr[vallen] = '\0'; - return 0; + return FDB_SUCCESS; } + int run_op_getrange(FDBTransaction *transaction, char *keystr, char *keystr2, char *valstr, int snapshot, int reverse) { FDBFuture *f; @@ -361,111 +342,79 @@ int run_op_getrange(FDBTransaction *transaction, char *keystr, char *keystr2, FDBKeyValue const *out_kv; int out_count; int out_more; - int retry = DEFAULT_RETRY_COUNT; - do { - f = fdb_transaction_get_range( - transaction, - FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((uint8_t *)keystr, strlen(keystr)), - 
FDB_KEYSEL_LAST_LESS_OR_EQUAL((uint8_t *)keystr2, strlen(keystr2)) + 1, - 0 /* limit */, 0 /* target_bytes */, - FDB_STREAMING_MODE_WANT_ALL /* FDBStreamingMode */, 0 /* iteration */, - snapshot, reverse /* reverse */); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get_range: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get_range( + transaction, + FDB_KEYSEL_FIRST_GREATER_OR_EQUAL((uint8_t *)keystr, strlen(keystr)), + FDB_KEYSEL_LAST_LESS_OR_EQUAL((uint8_t *)keystr2, strlen(keystr2)) + 1, + 0 /* limit */, 0 /* target_bytes */, + FDB_STREAMING_MODE_WANT_ALL /* FDBStreamingMode */, 0 /* iteration */, + snapshot, reverse /* reverse */); + fdb_wait_and_handle_error(fdb_transaction_get_range, f, transaction); err = fdb_future_get_keyvalue_array(f, &out_kv, &out_count, &out_more); if (err) { fprintf(stderr, "ERROR: fdb_future_get_keyvalue_array: %s\n", fdb_get_error(err)); fdb_future_destroy(f); - return -1; + return FDB_ERROR_RETRY; } fdb_future_destroy(f); - return 0; + return FDB_SUCCESS; } + +/* Update -- GET and SET the same key */ int run_op_update(FDBTransaction *transaction, char *keystr, char *valstr) { FDBFuture *f; int out_present; char *val; int vallen; fdb_error_t err; - int retry = DEFAULT_RETRY_COUNT; /* GET first */ - do { - f = fdb_transaction_get(transaction, (uint8_t *)keystr, strlen(keystr), 0); - err = wait_future(f); - - if (err) { - fdb_future_destroy(f); - f = fdb_transaction_on_error(transaction, err); - err = wait_future(f); - fdb_future_destroy(f); - if (err) { - /* not retryable */ - break; - } - } - } while (err && retry--); - - if (err) { - fprintf(stderr, "ERROR: fdb_transaction_get: %s\n", fdb_get_error(err)); - return -1; - } + f = fdb_transaction_get(transaction, 
(uint8_t *)keystr, strlen(keystr), 0); + fdb_wait_and_handle_error(fdb_transaction_get, f, transaction); err = fdb_future_get_value(f, &out_present, (const uint8_t **)&val, &vallen); fdb_future_destroy(f); if (err || !out_present) { /* error or value not present */ - return -1; + return FDB_ERROR_RETRY; } /* Update Value (SET) */ fdb_transaction_set(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)valstr, strlen(valstr)); - return 0; + return FDB_SUCCESS; } + int run_op_insert(FDBTransaction *transaction, char *keystr, char *valstr) { fdb_transaction_set(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)valstr, strlen(valstr)); - return 0; + return FDB_SUCCESS; } + int run_op_clear(FDBTransaction *transaction, char *keystr) { fdb_transaction_clear(transaction, (uint8_t *)keystr, strlen(keystr)); - return 0; + return FDB_SUCCESS; } + int run_op_clearrange(FDBTransaction *transaction, char *keystr, char *keystr2) { fdb_transaction_clear_range(transaction, (uint8_t *)keystr, strlen(keystr), (uint8_t *)keystr2, strlen(keystr2)); - return 0; + return FDB_SUCCESS; } + /* run one transaction */ -int run_transaction(FDBTransaction *transaction, mako_args_t *args, - mako_stats_t *stats, char *keystr, char *keystr2, - char *valstr) { +int run_one_transaction(FDBTransaction *transaction, mako_args_t *args, + mako_stats_t *stats, char *keystr, char *keystr2, + char *valstr) { int i; int count; int rc; @@ -478,172 +427,228 @@ int run_transaction(FDBTransaction *transaction, mako_args_t *args, int randstrlen; int rangei; - /* transaction */ - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); - for (i = 0; i < MAX_OP; i++) { + /* make sure that the transaction object is clean */ + fdb_transaction_reset(transaction); + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_start); + + retryTxn: + for (i = 0; i < MAX_OP; i++) { + if ((args->txnspec.ops[i][OP_COUNT] > 0) && (i != OP_COMMIT)) { for (count = 0; count < args->txnspec.ops[i][OP_COUNT]; count++) { + + /* 
note: for simplicity, always generate a new key(s) even when retrying */ - /* pick a random key(s) */ - if (args->zipf) { - keynum = zipfian_next(); - } else { - keynum = urand(0, args->rows - 1); - } - genkey(keystr, keynum, args->rows, args->key_length + 1); + /* pick a random key(s) */ + if (args->zipf) { + keynum = zipfian_next(); + } else { + keynum = urand(0, args->rows - 1); + } + genkey(keystr, keynum, args->rows, args->key_length + 1); + + /* range */ + if (args->txnspec.ops[i][OP_RANGE] > 0) { + keyend = keynum + args->txnspec.ops[i][OP_RANGE] - 1; /* inclusive */ + if (keyend > args->rows - 1) { + keyend = args->rows - 1; + } + genkey(keystr2, keyend, args->rows, args->key_length + 1); + } + + if (stats->xacts % args->sampling == 0) { + /* per op latency */ + clock_gettime(CLOCK_MONOTONIC, &timer_start); + } + + switch (i) { + case OP_GETREADVERSION: + rc = run_op_getreadversion(transaction, &readversion); + break; + case OP_GET: + rc = run_op_get(transaction, keystr, valstr, 0); + break; + case OP_GETRANGE: + rc = run_op_getrange(transaction, keystr, keystr2, valstr, 0, + args->txnspec.ops[i][OP_REVERSE]); + break; + case OP_SGET: + rc = run_op_get(transaction, keystr, valstr, 1); + break; + case OP_SGETRANGE: + rc = run_op_getrange(transaction, keystr, keystr2, valstr, 1, + args->txnspec.ops[i][OP_REVERSE]); + break; + case OP_UPDATE: + randstr(valstr, args->value_length + 1); + rc = run_op_update(transaction, keystr, valstr); + docommit = 1; + break; + case OP_INSERT: + randstr(keystr + KEYPREFIXLEN, + args->key_length - KEYPREFIXLEN + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + rc = run_op_insert(transaction, keystr, valstr); + docommit = 1; + break; + case OP_INSERTRANGE: + randstrlen = args->key_length - KEYPREFIXLEN - + digits(args->txnspec.ops[i][OP_RANGE]); + randstr(keystr + KEYPREFIXLEN, randstrlen + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + for (rangei = 0; rangei < 
args->txnspec.ops[i][OP_RANGE]; rangei++) { + sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", + digits(args->txnspec.ops[i][OP_RANGE]), rangei); + rc = run_op_insert(transaction, keystr, valstr); + if (rc != FDB_SUCCESS) + break; + } + docommit = 1; + break; + case OP_CLEAR: + rc = run_op_clear(transaction, keystr); + docommit = 1; + break; + case OP_SETCLEAR: + randstr(keystr + KEYPREFIXLEN, + args->key_length - KEYPREFIXLEN + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + rc = run_op_insert(transaction, keystr, valstr); + if (rc == FDB_SUCCESS) { + /* commit insert so mutation goes to storage */ + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + stats->ops[OP_COMMIT]++; + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } + fdb_transaction_reset(transaction); + rc = run_op_clear(transaction, keystr); + } + docommit = 1; + break; + case OP_CLEARRANGE: + rc = run_op_clearrange(transaction, keystr, keystr2); + docommit = 1; + break; + case OP_SETCLEARRANGE: + randstrlen = args->key_length - KEYPREFIXLEN - + digits(args->txnspec.ops[i][OP_RANGE]); + randstr(keystr + KEYPREFIXLEN, + randstrlen + 1); /* make it (almost) unique */ + randstr(valstr, args->value_length + 1); + for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { + sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", + digits(args->txnspec.ops[i][OP_RANGE]), rangei); + if (rangei == 0) { + strcpy(keystr2, keystr); + keystr2[strlen(keystr)] = '\0'; + } + rc = run_op_insert(transaction, keystr, valstr); + /* rollback not necessary, move on */ + if (rc == FDB_ERROR_RETRY) { + goto retryTxn; + } else if (rc == FDB_ERROR_ABORT) { + return rc; /* 
abort */ + } + } + /* commit insert so mutation goes to storage */ + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + stats->ops[OP_COMMIT]++; + clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } + fdb_transaction_reset(transaction); + rc = run_op_clearrange(transaction, keystr2, keystr); + docommit = 1; + break; + default: + fprintf(stderr, "ERROR: Unknown Operation %d\n", i); + break; + } - /* range */ - if (args->txnspec.ops[i][OP_RANGE] > 0) { - keyend = keynum + args->txnspec.ops[i][OP_RANGE] - 1; /* inclusive */ - if (keyend > args->rows - 1) { - keyend = args->rows - 1; - } - genkey(keystr2, keyend, args->rows, args->key_length + 1); - } + if (stats->xacts % args->sampling == 0) { + clock_gettime(CLOCK_MONOTONIC, &timer_end); + if (rc == FDB_SUCCESS) { + /* per op latency, record successful transactions */ + update_op_lat_stats(&timer_start, &timer_end, i, stats); + } + } - if (stats->xacts % args->sampling == 0) { - /* per op latency */ - clock_gettime(CLOCK_MONOTONIC, &timer_start); - } - - switch (i) { - case OP_GETREADVERSION: - readversion = run_op_getreadversion(transaction); - if (!readversion) { - rc = -1; - } - break; - case OP_GET: - rc = run_op_get(transaction, keystr, valstr, 0); - break; - case OP_GETRANGE: - rc = run_op_getrange(transaction, keystr, keystr2, valstr, 0, - args->txnspec.ops[i][OP_REVERSE]); - break; - case OP_SGET: - rc = run_op_get(transaction, keystr, valstr, 1); - break; - case OP_SGETRANGE: - rc = run_op_getrange(transaction, keystr, keystr2, valstr, 1, - args->txnspec.ops[i][OP_REVERSE]); - break; - case OP_UPDATE: - randstr(valstr, args->value_length + 1); - rc = run_op_update(transaction, keystr, valstr); - 
docommit = 1; - break; - case OP_INSERT: - randstr(keystr + KEYPREFIXLEN, args->key_length - KEYPREFIXLEN + - 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - rc = run_op_insert(transaction, keystr, valstr); - docommit = 1; - break; - case OP_INSERTRANGE: - randstrlen = args->key_length - KEYPREFIXLEN - - digits(args->txnspec.ops[i][OP_RANGE]); - randstr(keystr + KEYPREFIXLEN, - randstrlen + 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { - sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", - digits(args->txnspec.ops[i][OP_RANGE]), rangei); - rc = run_op_insert(transaction, keystr, valstr); - if (rc != 0) - break; - } - docommit = 1; - break; - case OP_CLEAR: - rc = run_op_clear(transaction, keystr); - docommit = 1; - break; - case OP_SETCLEAR: - randstr(keystr + KEYPREFIXLEN, args->key_length - KEYPREFIXLEN + - 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - rc = run_op_insert(transaction, keystr, valstr); - if (rc == 0) { - /* commit insert so mutation goes to storage */ - if (commit_transaction(transaction, stats) == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, - OP_COMMIT, stats); - } - fdb_transaction_reset(transaction); - rc = run_op_clear(transaction, keystr); - } - docommit = 1; - break; - case OP_CLEARRANGE: - rc = run_op_clearrange(transaction, keystr, keystr2); - docommit = 1; - break; - case OP_SETCLEARRANGE: - randstrlen = args->key_length - KEYPREFIXLEN - - digits(args->txnspec.ops[i][OP_RANGE]); - randstr(keystr + KEYPREFIXLEN, - randstrlen + 1); /* make it (almost) unique */ - randstr(valstr, args->value_length + 1); - for (rangei = 0; rangei < args->txnspec.ops[i][OP_RANGE]; rangei++) { - sprintf(keystr + KEYPREFIXLEN + randstrlen, "%0.*d", - digits(args->txnspec.ops[i][OP_RANGE]), rangei); - if (rangei == 0) { 
- strcpy(keystr2, keystr); - keystr2[strlen(keystr)] = '\0'; - } - rc = run_op_insert(transaction, keystr, valstr); - if (rc != 0) { - /* rollback not necessary, transaction will be reset */ - break; - } - } - /* commit inserts so mutation goes to storage */ - if (commit_transaction(transaction, stats) == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, - OP_COMMIT, stats); - } - fdb_transaction_reset(transaction); - rc = run_op_clearrange(transaction, keystr2, keystr); - docommit = 1; - break; - default: - fprintf(stderr, "ERROR: Unknown Operation %d\n", i); - break; - } - - if (stats->xacts % args->sampling == 0) { - clock_gettime(CLOCK_MONOTONIC, &timer_end); - if (rc == 0) { - /* per op latency */ - update_op_stats(&timer_start, &timer_end, i, stats); - } - } - - /* check rc */ - if (rc != 0) { - stats->errors[i]++; - } else { - stats->ops[i]++; - } + /* check rc and update stats */ + if (rc == FDB_SUCCESS) { + stats->ops[i]++; + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; + } } } } + + /* commit only successful transaction */ if (docommit | args->commit_get) { - if (commit_transaction(transaction, stats) == 0) { + rc = commit_transaction(transaction); + if (rc == FDB_SUCCESS) { + /* success */ + stats->ops[OP_COMMIT]++; clock_gettime(CLOCK_MONOTONIC, &timer_per_xact_end); - update_op_stats(&timer_per_xact_start, &timer_per_xact_end, OP_COMMIT, - stats); + update_op_lat_stats(&timer_per_xact_start, &timer_per_xact_end, + OP_COMMIT, stats); + } else { + /* error */ + if (rc == FDB_ERROR_CONFLICT) { + stats->conflicts++; + } else { + stats->errors[OP_COMMIT]++; + } + if (rc == FDB_ERROR_ABORT) { + return rc; /* abort */ + } + goto retryTxn; } } + stats->xacts++; - fdb_transaction_reset(transaction); return 0; } + int 
run_workload(FDBTransaction *transaction, mako_args_t *args, int thread_tps, volatile double *throttle_factor, int thread_iters, volatile int *signal, mako_stats_t *stats) { @@ -677,6 +682,7 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, clock_gettime(CLOCK_MONOTONIC_COARSE, &timer_prev); + /* main transaction loop */ while (1) { if ((thread_tps > 0) && (xacts >= current_tps)) { @@ -699,17 +705,19 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, } } - rc = run_transaction(transaction, args, stats, keystr, keystr2, valstr); + rc = run_one_transaction(transaction, args, stats, keystr, keystr2, valstr); if (rc) { - /* should never get here */ - fprintf(stderr, "ERROR: run_transaction failed (%d)\n", rc); + /* FIXME: run_one_transaction should return something meaningful */ + fprintf(annoyme, "ERROR: run_one_transaction failed (%d)\n", rc); } if (thread_iters > 0) { if (thread_iters == xacts) { + /* xact limit reached */ break; } } else if (*signal == SIGNAL_RED) { + /* signal turned red, target duration reached */ break; } xacts++; @@ -721,6 +729,7 @@ int run_workload(FDBTransaction *transaction, mako_args_t *args, return rc; } + /* mako worker thread */ void *worker_thread(void *thread_args) { int worker_id = ((thread_args_t *)thread_args)->process->worker_id; @@ -749,11 +758,9 @@ void *worker_thread(void *thread_args) { stats->latency_us_total[op] = 0; } - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: worker_id:%d (%d) thread_id:%d (%d) (tid:%d)\n", worker_id, - args->num_processes, thread_id, args->num_threads, - (unsigned int)pthread_self()); - } + fprintf(debugme, "DEBUG: worker_id:%d (%d) thread_id:%d (%d) (tid:%d)\n", worker_id, + args->num_processes, thread_id, args->num_threads, + (unsigned int)pthread_self()); if (args->tpsmax) { thread_tps = compute_thread_tps(args->tpsmax, worker_id, thread_id, @@ -801,11 +808,12 @@ void *worker_thread(void *thread_args) { } /* fall through */ -FDB_FAIL: +failExit: 
fdb_transaction_destroy(transaction); pthread_exit(0); } + /* mako worker process */ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { int i; @@ -824,23 +832,16 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { process.args = args; process.shm = (mako_shmhdr_t *)shm; - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: worker %d started\n", worker_id); - } + fprintf(debugme, "DEBUG: worker %d started\n", worker_id); /* Everything starts from here */ - /* Let's use the maximum API version */ - // fprintf(stderr, "fdb_get_max_api_version: %d\n", - // fdb_get_max_api_version()); - err = fdb_select_api_version(fdb_get_max_api_version()); + err = fdb_select_api_version(args->api_version); check_fdb_error(err); /* enable flatbuffers if specified */ if (args->flatbuffers) { #ifdef FDB_NET_OPTION_USE_FLATBUFFERS - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Using flatbuffers\n"); - } + fprintf(debugme, "DEBUG: Using flatbuffers\n"); err = fdb_network_set_option(FDB_NET_OPTION_USE_FLATBUFFERS, (uint8_t *)&args->flatbuffers, sizeof(uint8_t)); @@ -851,20 +852,16 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { fdb_get_error(err)); } #else - if (args->verbose >= VERBOSE_DEFAULT) { - printf("INFO: flatbuffers is not supported in FDB API version %d\n", - FDB_API_VERSION); - } + fprintf(printme, "INFO: flatbuffers is not supported in FDB API version %d\n", + FDB_API_VERSION); #endif } /* enable tracing if specified */ if (args->trace) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Enable Tracing (%s)\n", (args->tracepath[0] == '\0') - ? "current directory" - : args->tracepath); - } + fprintf(debugme, "DEBUG: Enable Tracing (%s)\n", (args->tracepath[0] == '\0') + ? 
"current directory" + : args->tracepath); err = fdb_network_set_option(FDB_NET_OPTION_TRACE_ENABLE, (uint8_t *)args->tracepath, strlen(args->tracepath)); @@ -881,9 +878,7 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { char delim[] = ", "; char *knob = strtok(args->knobs, delim); while (knob != NULL) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: Setting client knobs: %s\n", knob); - } + fprintf(debugme, "DEBUG: Setting client knobs: %s\n", knob); err = fdb_network_set_option(FDB_NET_OPTION_KNOB, (uint8_t *)knob, strlen(knob)); if (err) { @@ -895,16 +890,12 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { } /* Network thread must be setup before doing anything */ - if (args->verbose == VERBOSE_DEBUG) { - printf("DEBUG: fdb_setup_network\n"); - } + fprintf(debugme, "DEBUG: fdb_setup_network\n"); err = fdb_setup_network(); check_fdb_error(err); /* Each worker process will have its own network thread */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: creating network thread\n"); - } + fprintf(debugme, "DEBUG: creating network thread\n"); rc = pthread_create(&network_thread, NULL, fdb_network_thread, (void *)args); if (rc != 0) { fprintf(stderr, "ERROR: Cannot create a network thread\n"); @@ -935,13 +926,11 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { fdb_create_database(args->cluster_file, &process.database); #endif - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: creating %d worker threads\n", args->num_threads); - } + fprintf(debugme, "DEBUG: creating %d worker threads\n", args->num_threads); worker_threads = (pthread_t *)calloc(sizeof(pthread_t), args->num_threads); if (!worker_threads) { fprintf(stderr, "ERROR: cannot allocate worker_threads\n"); - goto EXIT; + goto failExit; } /* spawn worker threads */ @@ -949,7 +938,7 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { (thread_args_t 
*)calloc(sizeof(thread_args_t), args->num_threads); if (!thread_args) { fprintf(stderr, "ERROR: cannot allocate thread_args\n"); - goto EXIT; + goto failExit; } for (i = 0; i < args->num_threads; i++) { @@ -967,16 +956,14 @@ int worker_process_main(mako_args_t *args, int worker_id, mako_shmhdr_t *shm) { /* wait for everyone to finish */ for (i = 0; i < args->num_threads; i++) { - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: worker_thread %d joining\n", i); - } + fprintf(debugme, "DEBUG: worker_thread %d joining\n", i); rc = pthread_join(worker_threads[i], NULL); if (rc != 0) { fprintf(stderr, "ERROR: threads %d failed to join\n", i); } } -EXIT: +failExit: if (worker_threads) free(worker_threads); if (thread_args) @@ -989,18 +976,12 @@ EXIT: #endif /* stop the network thread */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: fdb_stop_network\n"); - } + fprintf(debugme, "DEBUG: fdb_stop_network\n"); err = fdb_stop_network(); check_fdb_error(err); -FDB_FAIL: - /* wait for the network thread to join */ - if (args->verbose >= VERBOSE_DEBUG) { - printf("DEBUG: network_thread joining\n"); - } + fprintf(debugme, "DEBUG: network_thread joining\n"); rc = pthread_join(network_thread, NULL); if (rc != 0) { fprintf(stderr, "ERROR: network thread failed to join\n"); @@ -1009,30 +990,32 @@ FDB_FAIL: return 0; } + /* initialize the parameters with default values */ int init_args(mako_args_t *args) { int i; if (!args) return -1; memset(args, 0, sizeof(mako_args_t)); /* zero-out everything */ + args->api_version = fdb_get_max_api_version(); args->json = 0; args->num_processes = 1; args->num_threads = 1; args->mode = MODE_INVALID; - args->rows = 10000; - args->seconds = 0; + args->rows = 100000; + args->seconds = 30; args->iteration = 0; args->tpsmax = 0; args->tpsmin = -1; args->tpsinterval = 10; args->tpschange = TPS_SIN; args->sampling = 1000; - args->key_length = 16; + args->key_length = 32; args->value_length = 16; args->zipf = 0; args->commit_get = 0; 
args->verbose = 1; - args->flatbuffers = 0; + args->flatbuffers = 0; /* internal */ args->knobs[0] = '\0'; args->trace = 0; args->tracepath[0] = '\0'; @@ -1042,6 +1025,7 @@ int init_args(mako_args_t *args) { return 0; } + /* parse transaction specification */ int parse_transaction(mako_args_t *args, char *optarg) { char *ptr = optarg; @@ -1099,9 +1083,7 @@ int parse_transaction(mako_args_t *args, char *optarg) { op = OP_SETCLEAR; ptr += 2; } else { - if (args->verbose == VERBOSE_DEBUG) { - printf("Error: Invalid transaction spec: %s\n", ptr); - } + fprintf(debugme, "Error: Invalid transaction spec: %s\n", ptr); error = 1; break; } @@ -1155,7 +1137,7 @@ int parse_transaction(mako_args_t *args, char *optarg) { if (args->verbose == VERBOSE_DEBUG) { for (op = 0; op < MAX_OP; op++) { - printf("DEBUG: OP: %d: %d: %d\n", op, args->txnspec.ops[op][0], + fprintf(debugme, "DEBUG: OP: %d: %d: %d\n", op, args->txnspec.ops[op][0], args->txnspec.ops[op][1]); } } @@ -1163,11 +1145,13 @@ int parse_transaction(mako_args_t *args, char *optarg) { return 0; } + void usage() { printf("Usage:\n"); printf("%-24s%s\n", "-h, --help", "Print this message"); printf("%-24s%s\n", " --version", "Print FDB version"); printf("%-24s%s\n", "-v, --verbose", "Specify verbosity"); + printf("%-24s%s\n", "-a, --api_version=API_VERSION", "Specify API_VERSION to use"); printf("%-24s%s\n", "-c, --cluster=FILE", "Specify FDB cluster file"); printf("%-24s%s\n", "-p, --procs=PROCS", "Specify number of worker processes"); @@ -1200,15 +1184,17 @@ void usage() { printf("%-24s%s\n", " --flatbuffers", "Use flatbuffers"); } + /* parse benchmark paramters */ int parse_args(int argc, char *argv[], mako_args_t *args) { int rc; int c; int idx; while (1) { - const char *short_options = "c:p:t:r:s:i:x:v:m:hjz"; + const char *short_options = "a:c:p:t:r:s:i:x:v:m:hjz"; static struct option long_options[] = { /* name, has_arg, flag, val */ + {"api_version", required_argument, NULL, 'a'}, {"cluster", required_argument, NULL, 
'c'}, {"procs", required_argument, NULL, 'p'}, {"threads", required_argument, NULL, 't'}, @@ -1246,6 +1232,9 @@ int parse_args(int argc, char *argv[], mako_args_t *args) { case 'h': usage(); return -1; + case 'a': + args->api_version = atoi(optarg); + break; case 'c': strcpy(args->cluster_file, optarg); break; @@ -1340,9 +1329,27 @@ int parse_args(int argc, char *argv[], mako_args_t *args) { if ((args->tpsmin == -1) || (args->tpsmin > args->tpsmax)) { args->tpsmin = args->tpsmax; } + + if (args->verbose >= VERBOSE_DEFAULT) { + printme = stdout; + } else { + printme = fopen("/dev/null", "w"); + } + if (args->verbose >= VERBOSE_ANNOYING) { + annoyme = stdout; + } else { + annoyme = fopen("/dev/null", "w"); + } + if (args->verbose >= VERBOSE_DEBUG) { + debugme = stdout; + } else { + debugme = fopen("/dev/null", "w"); + } + return 0; } + int validate_args(mako_args_t *args) { if (args->mode == MODE_INVALID) { fprintf(stderr, "ERROR: --mode has to be set\n"); @@ -1380,6 +1387,7 @@ int validate_args(mako_args_t *args) { return 0; } + /* stats output formatting */ #define STR2(x) #x #define STR(x) STR2(x) @@ -1446,6 +1454,7 @@ void print_stats(mako_args_t *args, mako_stats_t *stats, struct timespec *now, return; } + void print_stats_header(mako_args_t *args) { int op; int i; @@ -1518,6 +1527,7 @@ void print_stats_header(mako_args_t *args) { printf("\n"); } + void print_report(mako_args_t *args, mako_stats_t *stats, struct timespec *timer_now, struct timespec *timer_start) { int i, j, op; @@ -1654,6 +1664,7 @@ void print_report(mako_args_t *args, mako_stats_t *stats, printf("\n"); } + int stats_process_main(mako_args_t *args, mako_stats_t *stats, volatile double *throttle_factor, volatile int *signal) { struct timespec timer_start, timer_prev, timer_now; @@ -1723,6 +1734,7 @@ int stats_process_main(mako_args_t *args, mako_stats_t *stats, return 0; } + int main(int argc, char *argv[]) { int rc; mako_args_t args; @@ -1779,7 +1791,7 @@ int main(int argc, char *argv[]) { if 
(ftruncate(shmfd, shmsize) < 0) { fprintf(stderr, "ERROR: ftruncate (fd:%d size:%llu) failed\n", shmfd, (unsigned long long)shmsize); - goto EXIT; + goto failExit; } /* map it */ @@ -1788,7 +1800,7 @@ int main(int argc, char *argv[]) { if (shm == MAP_FAILED) { fprintf(stderr, "ERROR: mmap (fd:%d size:%llu) failed\n", shmfd, (unsigned long long)shmsize); - goto EXIT; + goto failExit; } stats = (mako_stats_t *)((void *)shm + sizeof(mako_shmhdr_t)); @@ -1806,7 +1818,7 @@ int main(int argc, char *argv[]) { if (!worker_pids) { fprintf(stderr, "ERROR: cannot allocate worker_pids (%d processes)\n", args.num_processes); - goto EXIT; + goto failExit; } /* forking (num_process + 1) children */ @@ -1920,7 +1932,7 @@ int main(int argc, char *argv[]) { worker_pids[args.num_processes]); } -EXIT: +failExit: if (worker_pids) free(worker_pids); diff --git a/bindings/c/test/mako/mako.h b/bindings/c/test/mako/mako.h index 334a8774f8..d924f8a648 100755 --- a/bindings/c/test/mako/mako.h +++ b/bindings/c/test/mako/mako.h @@ -17,8 +17,6 @@ #include #endif -#define DEFAULT_RETRY_COUNT 3 - #define VERBOSE_NONE 0 #define VERBOSE_DEFAULT 1 #define VERBOSE_ANNOYING 2 @@ -29,9 +27,11 @@ #define MODE_BUILD 1 #define MODE_RUN 2 -/* we set mako_txn_t and mako_args_t only once in the master process, - * and won't be touched by child processes. - */ +#define FDB_SUCCESS 0 +#define FDB_ERROR_RETRY -1 +#define FDB_ERROR_ABORT -2 +#define FDB_ERROR_CONFLICT -3 + /* transaction specification */ enum Operations { @@ -55,7 +55,7 @@ enum Operations { #define OP_RANGE 1 #define OP_REVERSE 2 -/* for arguments */ +/* for long arguments */ enum Arguments { ARG_KEYLEN, ARG_VALLEN, @@ -82,6 +82,10 @@ enum TPSChangeTypes { #define KEYPREFIX "mako" #define KEYPREFIXLEN 4 +/* we set mako_txnspec_t and mako_args_t only once in the master process, + * and won't be touched by child processes. 
+ */ + typedef struct { /* for each operation, it stores "count", "range" and "reverse" */ int ops[MAX_OP][3]; @@ -91,6 +95,7 @@ typedef struct { /* benchmark parameters */ typedef struct { + int api_version; int json; int num_processes; int num_threads; diff --git a/bindings/c/test/mako/mako.rst b/bindings/c/test/mako/mako.rst index 218642b7b3..05dcb525fc 100644 --- a/bindings/c/test/mako/mako.rst +++ b/bindings/c/test/mako/mako.rst @@ -38,6 +38,9 @@ Arguments | - ``build``: Populate data | - ``run``: Run the benchmark +- | ``-a | --api_version `` + | FDB API version to use (Default: Latest) + - | ``-c | --cluster `` | FDB cluster file (Required) @@ -48,7 +51,7 @@ Arguments | Number of threads per worker process (Default: 1) - | ``-r | --rows `` - | Number of rows populated (Default: 10000) + | Number of rows populated (Default: 100000) - | ``-s | --seconds `` | Test duration in seconds (Default: 30) @@ -58,12 +61,23 @@ Arguments | Specify the number of operations to be executed. | This option cannot be set with ``--seconds``. -- | ``--tps `` - | Target total transaction-per-second (TPS) of all worker processes/threads +- | ``--tps|--tpsmax `` + | Target total transaction-per-second (TPS) of all worker processes/threads. + | When --tpsmin is also specified, this defines the upper-bound TPS. 
| (Default: Unset / Unthrottled) +- | ``--tpsmin `` + | Target total lower-bound TPS of all worker processes/threads + | (Default: Unset / Unthrottled) + +- | ``--tpsinterval `` + | Time period TPS oscillates between --tpsmax and --tpsmin (Default: 10) + +- | ``--tpschange `` + | Shape of the TPS change (Default: sin) + - | ``--keylen `` - | Key string length in bytes (Default and Minimum: 16) + | Key string length in bytes (Default and Minimum: 32) - | ``--vallen `` | Value string length in bytes (Default and Minimum: 16) @@ -75,22 +89,19 @@ Arguments | Generate a skewed workload based on Zipf distribution (Default: Unset = Uniform) - | ``--sampling `` - | Sampling rate (1 sample / ops) for latency stats + | Sampling rate (1 sample / ops) for latency stats (Default: 1000) - | ``--trace`` - | Enable tracing. The trace file will be created in the current directory. + | Enable tracing. The trace file will be created in the current directory. (Default: Unset) - | ``--tracepath `` | Enable tracing and set the trace file path. 
- | ``--knobs `` - | Set client knobs - -- | ``--flatbuffers`` - | Enable flatbuffers + | Set client knobs (comma-separated) - | ``--commitget`` - | Force commit for read-only transactions + | Force commit for read-only transactions (Default: Unset) - | ``-v | --verbose `` | Set verbose level (Default: 1) From 85977fb8d57c00c77ee9d20f462da6c731cc0253 Mon Sep 17 00:00:00 2001 From: mpilman Date: Fri, 27 Sep 2019 11:28:15 -0700 Subject: [PATCH 082/184] Use O_DIRECT with EIO --- fdbrpc/AsyncFileEIO.actor.h | 3 +++ fdbrpc/Net2FileSystem.cpp | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index f786266888..05e732964e 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -246,6 +246,9 @@ private: if( flags & OPEN_READONLY ) oflags |= O_RDONLY; if( flags & OPEN_READWRITE ) oflags |= O_RDWR; if( flags & OPEN_ATOMIC_WRITE_AND_CREATE ) oflags |= O_TRUNC; +#if defined(__linux__) + if ( flags & OPEN_UNBUFFERED ) oflags |= O_DIRECT; +#endif return oflags; } diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 31ce9f6095..48267acb63 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -59,9 +59,9 @@ Future< Reference > Net2FileSystem::open( std::string filename Future> f; #ifdef __linux__ // In the vast majority of cases, we wish to use Kernel AIO. However, some systems - // dont properly support don’t properly support kernel async I/O without O_DIRECT - // or AIO at all. In such cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to - // EIO instead of Kernel AIO. + // don’t properly support kernel async I/O without O_DIRECT or AIO at all. In such + // cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to EIO instead + // of Kernel AIO. 
if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) && !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO) f = AsyncFileKAIO::open(filename, flags, mode, NULL); From f41f19b5f675dd27c67a470a8686d5f6300e34d0 Mon Sep 17 00:00:00 2001 From: mpilman Date: Tue, 15 Oct 2019 10:22:18 -0700 Subject: [PATCH 083/184] Introduced knob to set eio parallelism --- fdbrpc/AsyncFileEIO.actor.h | 3 ++- flow/Knobs.cpp | 3 +++ flow/Knobs.h | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index 05e732964e..cc6755fe63 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -45,7 +45,8 @@ class AsyncFileEIO : public IAsyncFile, public ReferenceCounted { public: static void init() { - if (eio_init( &eio_want_poll, NULL )) { + eio_set_max_parallel(FLOW_KNOBS->EIO_MAX_PARALLELISM); + if (eio_init( &eio_want_poll, NULL )) { TraceEvent("EioInitError").detail("ErrorNo", errno); throw platform_error(); } diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 5578d3a62a..9149285246 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -85,6 +85,9 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( CACHE_EVICTION_POLICY, "random" ); init( PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION, 0.1 ); if( randomize && BUGGIFY ) PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION = 0.0; else if( randomize && BUGGIFY ) PAGE_CACHE_TRUNCATE_LOOKUP_FRACTION = 1.0; + //AsyncFileEIO + init( EIO_MAX_PARALLELISM, 4 ); + //AsyncFileKAIO init( MAX_OUTSTANDING, 64 ); init( MIN_SUBMIT, 10 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index 4865f8f7ab..eb79e95663 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -105,6 +105,9 @@ public: double TOO_MANY_CONNECTIONS_CLOSED_RESET_DELAY; int TOO_MANY_CONNECTIONS_CLOSED_TIMEOUT; + //AsyncFileEIO + int EIO_MAX_PARALLELISM; + //AsyncFileKAIO int MAX_OUTSTANDING; int MIN_SUBMIT; From 7ad0e20e4857d242e0f11a8ec3b9b4c6637fdb69 Mon Sep 17 00:00:00 2001 From: mpilman Date: Tue, 15 Oct 2019 
11:16:37 -0700 Subject: [PATCH 084/184] Added knob to disable O_DIRECT --- fdbrpc/AsyncFileEIO.actor.h | 2 +- fdbrpc/Net2FileSystem.cpp | 2 +- flow/Knobs.cpp | 1 + flow/Knobs.h | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index cc6755fe63..b4791a1a43 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -248,7 +248,7 @@ private: if( flags & OPEN_READWRITE ) oflags |= O_RDWR; if( flags & OPEN_ATOMIC_WRITE_AND_CREATE ) oflags |= O_TRUNC; #if defined(__linux__) - if ( flags & OPEN_UNBUFFERED ) oflags |= O_DIRECT; + if ( flags & OPEN_UNBUFFERED && !FLOW_KNOBS->DISABLE_ODIRECT ) oflags |= O_DIRECT; #endif return oflags; } diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 48267acb63..867bcf6799 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -63,7 +63,7 @@ Future< Reference > Net2FileSystem::open( std::string filename // cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to EIO instead // of Kernel AIO. 
if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) && - !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO) + !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO && !FLOW_KNOBS->DISABLE_ODIRECT) f = AsyncFileKAIO::open(filename, flags, mode, NULL); else #endif diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 9149285246..3911480daf 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -87,6 +87,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //AsyncFileEIO init( EIO_MAX_PARALLELISM, 4 ); + init( DISABLE_ODIRECT, 0 ); //AsyncFileKAIO init( MAX_OUTSTANDING, 64 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index eb79e95663..d7f49bd4c9 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -107,6 +107,7 @@ public: //AsyncFileEIO int EIO_MAX_PARALLELISM; + int DISABLE_ODIRECT; //AsyncFileKAIO int MAX_OUTSTANDING; From f23392ec5a32469bcd9b61ee92a403e6516c116b Mon Sep 17 00:00:00 2001 From: mpilman Date: Thu, 24 Oct 2019 11:39:55 -0700 Subject: [PATCH 085/184] Don't use O_DIRECT in EIO by default --- fdbrpc/AsyncFileEIO.actor.h | 2 +- flow/Knobs.cpp | 3 ++- flow/Knobs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbrpc/AsyncFileEIO.actor.h b/fdbrpc/AsyncFileEIO.actor.h index b4791a1a43..f3450af847 100644 --- a/fdbrpc/AsyncFileEIO.actor.h +++ b/fdbrpc/AsyncFileEIO.actor.h @@ -248,7 +248,7 @@ private: if( flags & OPEN_READWRITE ) oflags |= O_RDWR; if( flags & OPEN_ATOMIC_WRITE_AND_CREATE ) oflags |= O_TRUNC; #if defined(__linux__) - if ( flags & OPEN_UNBUFFERED && !FLOW_KNOBS->DISABLE_ODIRECT ) oflags |= O_DIRECT; + if ( flags & OPEN_UNBUFFERED && FLOW_KNOBS->EIO_USE_ODIRECT ) oflags |= O_DIRECT; #endif return oflags; } diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 3911480daf..37b6843ea4 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -87,7 +87,8 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //AsyncFileEIO init( EIO_MAX_PARALLELISM, 4 ); - init( DISABLE_ODIRECT, 0 ); + init( EIO_USE_ODIRECT, 0 ); + init( DISABLE_ODIRECT, 
0 ); //AsyncFileKAIO init( MAX_OUTSTANDING, 64 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index d7f49bd4c9..c993004af2 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -107,6 +107,7 @@ public: //AsyncFileEIO int EIO_MAX_PARALLELISM; + int EIO_USE_ODIRECT; int DISABLE_ODIRECT; //AsyncFileKAIO From 325a8e421308599d2f994890f4d07e345ef38005 Mon Sep 17 00:00:00 2001 From: mpilman Date: Thu, 24 Oct 2019 11:44:03 -0700 Subject: [PATCH 086/184] remove confusing USE_ODIRECT knob --- fdbrpc/Net2FileSystem.cpp | 2 +- flow/Knobs.cpp | 1 - flow/Knobs.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 867bcf6799..48267acb63 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -63,7 +63,7 @@ Future< Reference > Net2FileSystem::open( std::string filename // cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to EIO instead // of Kernel AIO. if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) && - !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO && !FLOW_KNOBS->DISABLE_ODIRECT) + !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO) f = AsyncFileKAIO::open(filename, flags, mode, NULL); else #endif diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 37b6843ea4..4549761093 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -88,7 +88,6 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { //AsyncFileEIO init( EIO_MAX_PARALLELISM, 4 ); init( EIO_USE_ODIRECT, 0 ); - init( DISABLE_ODIRECT, 0 ); //AsyncFileKAIO init( MAX_OUTSTANDING, 64 ); diff --git a/flow/Knobs.h b/flow/Knobs.h index c993004af2..7875df9503 100644 --- a/flow/Knobs.h +++ b/flow/Knobs.h @@ -108,7 +108,6 @@ public: //AsyncFileEIO int EIO_MAX_PARALLELISM; int EIO_USE_ODIRECT; - int DISABLE_ODIRECT; //AsyncFileKAIO int MAX_OUTSTANDING; From 92ce9ef5dca937f28f988ef19163fc7b098e19ab Mon Sep 17 00:00:00 2001 From: mpilman Date: Thu, 24 Oct 2019 11:45:32 -0700 Subject: [PATCH 087/184] updated comment --- 
fdbrpc/Net2FileSystem.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fdbrpc/Net2FileSystem.cpp b/fdbrpc/Net2FileSystem.cpp index 48267acb63..ea5e3e3539 100644 --- a/fdbrpc/Net2FileSystem.cpp +++ b/fdbrpc/Net2FileSystem.cpp @@ -61,7 +61,8 @@ Future< Reference > Net2FileSystem::open( std::string filename // In the vast majority of cases, we wish to use Kernel AIO. However, some systems // don’t properly support kernel async I/O without O_DIRECT or AIO at all. In such // cases, DISABLE_POSIX_KERNEL_AIO knob can be enabled to fallback to EIO instead - // of Kernel AIO. + // of Kernel AIO. And EIO_USE_ODIRECT can be used to turn on or off O_DIRECT within + // EIO. if ((flags & IAsyncFile::OPEN_UNBUFFERED) && !(flags & IAsyncFile::OPEN_NO_AIO) && !FLOW_KNOBS->DISABLE_POSIX_KERNEL_AIO) f = AsyncFileKAIO::open(filename, flags, mode, NULL); From 60d26ff5d7b6b8db3bd7a745937950738b9afd8a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 24 Oct 2019 12:47:51 -0700 Subject: [PATCH 088/184] FastRestore:Resolve review comments --- fdbserver/RestoreLoader.actor.cpp | 8 ++++---- fdbserver/RestoreRoleCommon.actor.cpp | 5 +---- fdbserver/RestoreRoleCommon.actor.h | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 7e936f0faf..ba5ab54adf 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -39,8 +39,8 @@ void splitMutation(Reference self, MutationRef m, Arena& mvec void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationListMap* mutationMap, bool isSampling = false); -void handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self); -void handleSetApplierKeyRangeVectorRequest(RestoreSetApplierKeyRangeVectorRequest req, +void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self); +void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVectorRequest& req, 
Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, bool isSampling = false); @@ -110,7 +110,7 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } // Assume: Only update the local data if it (applierInterf) has not been set -void handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self) { +void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self) { TraceEvent("FastRestore").detail("HandleRestoreSysInfoRequest", self->id()); ASSERT(self.isValid()); @@ -126,7 +126,7 @@ void handleRestoreSysInfoRequest(RestoreSysInfoRequest req, Reference self) { // Idempodent operation. OK to re-execute the duplicate cmd if (self->rangeToApplier.empty()) { diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index b6c2e51deb..c2ca3f1b4e 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -39,11 +39,10 @@ struct RestoreWorkerData; ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id) { wait(delayJittered(5.0)); // Random jitter reduces heat beat monitor's pressure req.reply.send(RestoreCommonReply(id)); - return Void(); } -void handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self) { +void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self) { if (self->versionBatchStart) { self->versionBatchStart = false; } @@ -54,7 +53,6 @@ void handleFinishRestoreRequest(RestoreVersionBatchRequest req, Referenceid()); req.reply.send(RestoreCommonReply(self->id())); - return; } @@ -66,7 +64,6 @@ ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, .detail("Node", self->id()); req.reply.send(RestoreCommonReply(self->id())); - return Void(); } diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index de02d4630b..3015fef333 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ 
b/fdbserver/RestoreRoleCommon.actor.h @@ -55,7 +55,7 @@ typedef std::map>> VersionedMutations ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); -void handleFinishRestoreRequest(RestoreVersionBatchRequest req, Reference self); +void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. // This struct is mostly copied from StringRefReader. We add a sanity check in this struct. From f70000184e8ffa9c2beb2c06deb68409d6313d75 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Thu, 24 Oct 2019 13:05:23 -0700 Subject: [PATCH 089/184] Log the number of samples captured for the read bandwidth to verify the assumption. --- fdbserver/StorageMetrics.actor.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h index 63e7a8f2d4..02988f3a25 100644 --- a/fdbserver/StorageMetrics.actor.h +++ b/fdbserver/StorageMetrics.actor.h @@ -221,9 +221,13 @@ struct StorageServerMetrics { notifyMetrics.bytesPerKSecond = bandwidthSample.addAndExpire( key, metrics.bytesPerKSecond, expire ) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; if (metrics.iosPerKSecond) notifyMetrics.iosPerKSecond = iopsSample.addAndExpire( key, metrics.iosPerKSecond, expire ) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (metrics.bytesReadPerKSecond) + if (metrics.bytesReadPerKSecond) { notifyMetrics.bytesReadPerKSecond = bytesReadSample.addAndExpire(key, metrics.bytesReadPerKSecond, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; + if (deterministicRandom()->random01() < 0.01) { + TraceEvent("BytesReadSampleCountX100").detail("SampleCount", bytesReadSample.queue.size()); + } + } if (!notifyMetrics.allZero()) { auto& v = waitMetricsMap[key]; for(int i=0; i Date: 
Thu, 24 Oct 2019 13:06:50 -0700 Subject: [PATCH 090/184] FastRestore:Convert handleInitVersionBatchRequest to plain func --- fdbserver/RestoreApplier.actor.cpp | 2 +- fdbserver/RestoreLoader.actor.cpp | 3 +-- fdbserver/RestoreRoleCommon.actor.cpp | 4 ++-- fdbserver/RestoreRoleCommon.actor.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 61e7b1b1d7..ffd1ddf84b 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -65,7 +65,7 @@ ACTOR Future restoreApplierCore(RestoreApplierInterface applierInterf, int } when(RestoreVersionBatchRequest req = waitNext(applierInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - actors.add(handleInitVersionBatchRequest(req, self)); + handleInitVersionBatchRequest(req, self); } when(RestoreVersionBatchRequest req = waitNext(applierInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index ba5ab54adf..291c346bf5 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -86,7 +86,7 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - actors.add(handleInitVersionBatchRequest(req, self)); + handleInitVersionBatchRequest(req, self); } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; @@ -133,7 +133,6 @@ void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVector self->rangeToApplier = req.rangeToApplier; } req.reply.send(RestoreCommonReply(self->id())); - return; } diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index c2ca3f1b4e..aaf8c7fc4c 100644 --- 
a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -56,7 +56,7 @@ void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference return; } -ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { +void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { self->resetPerVersionBatch(); TraceEvent("FastRestore") .detail("InitVersionBatch", req.batchID) @@ -64,7 +64,7 @@ ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, .detail("Node", self->id()); req.reply.send(RestoreCommonReply(self->id())); - return Void(); + return; } //-------Helper functions diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 3015fef333..6b4b84ec22 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -54,7 +54,7 @@ struct RestoreSimpleRequest; typedef std::map>> VersionedMutationsMap; ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); -ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); +void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. 
From 7903b47b8240ff966def73d869ea3a5e9255281e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 24 Oct 2019 13:09:24 -0700 Subject: [PATCH 091/184] FastRestore:Remove unnecessary return --- fdbserver/RestoreLoader.actor.cpp | 4 ---- fdbserver/RestoreRoleCommon.actor.cpp | 2 -- 2 files changed, 6 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 291c346bf5..4263cad3d4 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -123,7 +123,6 @@ void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, ReferenceappliersInterf = req.sysInfo.appliers; req.reply.send(RestoreCommonReply(self->id())); - return; } void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVectorRequest& req, @@ -133,7 +132,6 @@ void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVector self->rangeToApplier = req.rangeToApplier; } req.reply.send(RestoreCommonReply(self->id())); - return; } ACTOR Future _processLoadingParam(LoadingParam param, Reference self) { @@ -345,8 +343,6 @@ void splitMutation(Reference self, MutationRef m, Arena& mvec mvector.push_back_deep(mvector_arena, curm); nodeIDs.push_back(nodeIDs_arena, itApplier->second); } - - return; } // key_input format: diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index aaf8c7fc4c..5feac650a8 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -53,7 +53,6 @@ void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference .detail("Node", self->id()); req.reply.send(RestoreCommonReply(self->id())); - return; } void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { @@ -64,7 +63,6 @@ void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Referenceid()); req.reply.send(RestoreCommonReply(self->id())); - return; } //-------Helper functions From 
b74e5b15433c12a1cd3f134876339d950fd3c50b Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 24 Oct 2019 13:10:59 -0700 Subject: [PATCH 092/184] added sample file for attrition test outside of simulation --- tests/CMakeLists.txt | 1 + tests/SampleNoSimAttrition.txt | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tests/SampleNoSimAttrition.txt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1981d554dd..296d437306 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -67,6 +67,7 @@ add_fdb_test(TEST_FILES RedwoodCorrectnessBTree.txt IGNORE) add_fdb_test(TEST_FILES fast/RedwoodCorrectnessBTree.txt IGNORE) add_fdb_test(TEST_FILES RedwoodCorrectness.txt IGNORE) add_fdb_test(TEST_FILES RedwoodPerfTests.txt IGNORE) +add_fdb_test(TEST_FILES SampleNoSimAttrition.txt IGNORE) add_fdb_test(TEST_FILES SimpleExternalTest.txt) add_fdb_test(TEST_FILES SlowTask.txt IGNORE) add_fdb_test(TEST_FILES SpecificUnitTest.txt IGNORE) diff --git a/tests/SampleNoSimAttrition.txt b/tests/SampleNoSimAttrition.txt new file mode 100644 index 0000000000..597c8c18b1 --- /dev/null +++ b/tests/SampleNoSimAttrition.txt @@ -0,0 +1,19 @@ +testTitle=Temp + testName=Cycle + transactionsPerSecond=2500.0 + testDuration=10.0 + expectedRate=0 + + testName=Attrition + killDc=true + targetId=2 + reboot=true + testDuration=10.0 + suspendDuration=5.0 + + testName=Attrition + killMachine=true + targetId=1 + reboot=true + testDuration=10.0 + suspendDuration=2.0 \ No newline at end of file From 2f34ee684f06523e89e6f88038dde8da797e6000 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Thu, 24 Oct 2019 13:21:28 -0700 Subject: [PATCH 093/184] fixed indentation issues --- tests/SampleNoSimAttrition.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/SampleNoSimAttrition.txt b/tests/SampleNoSimAttrition.txt index 597c8c18b1..dbebe495dd 100644 --- a/tests/SampleNoSimAttrition.txt +++ b/tests/SampleNoSimAttrition.txt @@ -5,15 +5,15 @@ 
testTitle=Temp expectedRate=0 testName=Attrition - killDc=true - targetId=2 + killDc=true + targetId=2 reboot=true testDuration=10.0 - suspendDuration=5.0 + suspendDuration=5.0 - testName=Attrition - killMachine=true - targetId=1 + testName=Attrition + killMachine=true + targetId=1 reboot=true testDuration=10.0 - suspendDuration=2.0 \ No newline at end of file + suspendDuration=2.0 \ No newline at end of file From 2383c291232084f5e610012eb49bb2925bbb4f2e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 24 Oct 2019 13:54:44 -0700 Subject: [PATCH 094/184] FastRestore:Use reference for handleInitVersionBatchRequest func --- fdbserver/RestoreRoleCommon.actor.cpp | 2 +- fdbserver/RestoreRoleCommon.actor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index 5feac650a8..eb0f8ecc1b 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -55,7 +55,7 @@ void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference req.reply.send(RestoreCommonReply(self->id())); } -void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { +void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Reference self) { self->resetPerVersionBatch(); TraceEvent("FastRestore") .detail("InitVersionBatch", req.batchID) diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 6b4b84ec22..98a567cffd 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -54,7 +54,7 @@ struct RestoreSimpleRequest; typedef std::map>> VersionedMutationsMap; ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); -void handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); +void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Reference self); void handleFinishRestoreRequest(const 
RestoreVersionBatchRequest& req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. From acbfc70373c856c1ca6c4eea641e7dce5c293b42 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 24 Oct 2019 17:02:56 -0700 Subject: [PATCH 095/184] update versions target to 6.2.8 --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index 562119bab1..99a6f62e05 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 6.2.7 + 6.2.8 6.2 From 9682528372214a8d3b9c87bbd099b6a5a2c4e8e0 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 24 Oct 2019 17:02:56 -0700 Subject: [PATCH 096/184] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 620add1a09..72aa8d3851 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Thu, 24 Oct 2019 17:05:45 -0700 Subject: [PATCH 097/184] update cmake to 6.2.8 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d648cf38a..311b32c3e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ # limitations under the License. cmake_minimum_required(VERSION 3.12) project(foundationdb - VERSION 6.2.7 + VERSION 6.2.8 DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." 
HOMEPAGE_URL "http://www.foundationdb.org/" LANGUAGES C CXX ASM) From ec0789f2e7ea3a98e95926bcbb3f428e68439a43 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 25 Sep 2019 22:10:02 -0700 Subject: [PATCH 098/184] Build in Debug mode by default for OPEN_FOR_IDE build --- CMakeLists.txt | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ca7d2842d..6a4c3bfdf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,18 +29,23 @@ if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}") message(FATAL_ERROR "In-source builds are forbidden") endif() +set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)") + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified") - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" - "MinSizeRel" "RelWithDebInfo") + if (OPEN_FOR_IDE) + message(STATUS "Defaulting build type to 'Debug' for OPEN_FOR_IDE") + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build" FORCE) + else() + message(STATUS "Setting build type to 'Release' as none was specified") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build" FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" + "MinSizeRel" "RelWithDebInfo") + endif() endif() set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin) set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib) -set(OPEN_FOR_IDE OFF CACHE BOOL "Open this in an IDE (won't compile/link)") - ################################################################################ # Packages used for bindings ################################################################################ From d4de608bb6988cda817f0a1af136cadcc6c90d60 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 25 Sep 2019 23:19:42 -0700 Subject: [PATCH 
099/184] Fix OPEN_FOR_IDE build --- fdbbackup/backup.actor.cpp | 6 +++--- fdbcli/fdbcli.actor.cpp | 4 ++-- fdbclient/SystemData.h | 2 +- fdbserver/CMakeLists.txt | 2 +- fdbserver/DataDistribution.actor.cpp | 1 - fdbserver/FDBExecHelper.actor.cpp | 2 +- fdbserver/MemoryPager.actor.cpp | 4 ++-- fdbserver/RestoreApplier.actor.h | 4 ++-- fdbserver/RestoreCommon.actor.cpp | 4 ++-- fdbserver/RestoreLoader.actor.h | 4 ++-- fdbserver/RestoreMaster.actor.cpp | 4 ++-- fdbserver/RestoreRoleCommon.actor.h | 4 ++-- fdbserver/RestoreWorker.actor.h | 4 ++-- ...terface.h => RestoreWorkerInterface.actor.h} | 17 +++++++++++------ fdbserver/SimulatedCluster.actor.cpp | 2 -- fdbserver/fdbserver.actor.cpp | 2 +- fdbserver/fdbserver.vcxproj.filters | 2 +- ...ackupAndParallelRestoreCorrectness.actor.cpp | 17 ++--------------- fdbserver/workloads/ConfigureDatabase.actor.cpp | 1 - fdbserver/workloads/MachineAttrition.actor.cpp | 6 +++--- fdbserver/workloads/Mako.actor.cpp | 2 +- fdbserver/workloads/ParallelRestore.actor.cpp | 2 +- fdbserver/workloads/SnapTest.actor.cpp | 3 +-- 23 files changed, 43 insertions(+), 56 deletions(-) rename fdbserver/{RestoreWorkerInterface.h => RestoreWorkerInterface.actor.h} (96%) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 1a78d1f807..ebcb1ed1b8 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -3948,7 +3948,7 @@ ACTOR static Future _fastRestore(Database cx, Key tagName, Key url, boo ACTOR Future fastRestore(Database cx, Standalone tagName, Standalone url, bool waitForComplete, long targetVersion, bool verbose, Standalone range, Standalone addPrefix, Standalone removePrefix) { - Version targetVersion = + Version result = wait(_fastRestore(cx, tagName, url, waitForComplete, targetVersion, verbose, range, addPrefix, removePrefix)); - return targetVersion; -} \ No newline at end of file + return result; +} diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index e077bb8a2e..7bf4ab54ab 100644 
--- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -3502,7 +3502,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { printf("Data distribution is turned off.\n"); } else if (tokencmp(tokens[1], "disable")) { if (tokencmp(tokens[2], "ssfailure")) { - bool _ = wait(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0))); + wait(success(makeInterruptable(setHealthyZone(db, ignoreSSFailuresZoneString, 0)))); printf("Data distribution is disabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, true))); @@ -3514,7 +3514,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { } } else if (tokencmp(tokens[1], "enable")) { if (tokencmp(tokens[2], "ssfailure")) { - bool _ = wait(makeInterruptable(clearHealthyZone(db, false, true))); + wait(success(makeInterruptable(clearHealthyZone(db, false, true)))); printf("Data distribution is enabled for storage server failures.\n"); } else if (tokencmp(tokens[2], "rebalance")) { wait(makeInterruptable(setDDIgnoreRebalanceSwitch(db, false))); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 066c3e5dc1..a80eaf5283 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -26,7 +26,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" struct RestoreLoaderInterface; struct RestoreApplierInterface; diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 6e69968ed4..3def051534 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -76,7 +76,7 @@ set(FDBSERVER_SRCS RestoreLoader.actor.cpp RestoreWorker.actor.h RestoreWorker.actor.cpp - RestoreWorkerInterface.h + RestoreWorkerInterface.actor.h Resolver.actor.cpp ResolverInterface.h ServerDBInfo.h diff --git a/fdbserver/DataDistribution.actor.cpp 
b/fdbserver/DataDistribution.actor.cpp index 4a71fbb5e2..a65493e4f3 100644 --- a/fdbserver/DataDistribution.actor.cpp +++ b/fdbserver/DataDistribution.actor.cpp @@ -4986,7 +4986,6 @@ TEST_CASE("/DataDistribution/AddTeamsBestOf/NotEnoughServers") { state int desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * processSize; state int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * processSize; state int teamSize = 3; - state int targetTeamsPerServer = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (teamSize + 1) / 2; state DDTeamCollection* collection = testTeamCollection(teamSize, policy, processSize); collection->addTeam(std::set({ UID(1, 0), UID(2, 0), UID(3, 0) }), true); diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index 3daa798036..f9608acefc 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -142,7 +142,7 @@ ACTOR Future spawnProcess(std::string binPath, std::vector par #endif ACTOR Future execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role) { - state Standalone uidStr = snapUID.toString(); + state Standalone uidStr = Standalone(snapUID.toString()); state int err = 0; state Future cmdErr; state double maxWaitTime = SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT; diff --git a/fdbserver/MemoryPager.actor.cpp b/fdbserver/MemoryPager.actor.cpp index 9e6474dd01..656e3f3a0a 100644 --- a/fdbserver/MemoryPager.actor.cpp +++ b/fdbserver/MemoryPager.actor.cpp @@ -354,7 +354,7 @@ void writePage(IPager *pager, Reference page, LogicalPageID pageID, Versi ACTOR Future commit(IPager *pager) { static int commitNum = 1; - state int myCommit = commitNum++; + state [[maybe_unused]] int myCommit = commitNum++; debug_printf("Commit%d\n", myCommit); wait(pager->commit()); @@ -364,7 +364,7 @@ ACTOR Future commit(IPager *pager) { ACTOR Future read(IPager *pager, LogicalPageID pageID, Version version, Version expectedVersion=-1) { static int readNum = 1; - state int myRead = 
readNum++; + state [[maybe_unused]] int myRead = readNum++; state Reference readSnapshot = pager->getReadSnapshot(version); debug_printf("Read%d\n", myRead); Reference readPage = wait(readSnapshot->getPhysicalPage(pageID, true)); diff --git a/fdbserver/RestoreApplier.actor.h b/fdbserver/RestoreApplier.actor.h index 37f9b78b08..0fa0efc785 100644 --- a/fdbserver/RestoreApplier.actor.h +++ b/fdbserver/RestoreApplier.actor.h @@ -34,7 +34,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreRoleCommon.actor.h" @@ -128,4 +128,4 @@ struct RestoreApplierData : RestoreRoleData, public ReferenceCounted restoreApplierCore(RestoreApplierInterface applierInterf, int nodeIndex, Database cx); #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git a/fdbserver/RestoreCommon.actor.cpp b/fdbserver/RestoreCommon.actor.cpp index ac6e638f4c..ca2da8901c 100644 --- a/fdbserver/RestoreCommon.actor.cpp +++ b/fdbserver/RestoreCommon.actor.cpp @@ -32,6 +32,7 @@ #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/MutationList.h" #include "fdbclient/BackupContainer.h" +#include "flow/actorcompiler.h" // This must be the last #include. // Split RestoreConfigFR defined in FileBackupAgent.actor.cpp to declaration in Restore.actor.h and implementation in // RestoreCommon.actor.cpp @@ -268,7 +269,6 @@ ACTOR Future RestoreConfigFR::getFullStatus_impl(Reference progress = restore->getProgress(tr); // restore might no longer be valid after the first wait so make sure it is not needed anymore. 
- state UID uid = restore->getUid(); wait(success(ranges) && success(addPrefix) && success(removePrefix) && success(url) && success(restoreVersion) && success(progress)); @@ -433,4 +433,4 @@ ACTOR Future>> decodeLogFileBlock(Reference restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx); #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index e9ed9bd593..16fd3e4182 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -193,7 +193,7 @@ ACTOR Future startProcessRestoreRequests(Reference self for (restoreIndex = 0; restoreIndex < restoreRequests.size(); restoreIndex++) { RestoreRequest& request = restoreRequests[restoreIndex]; TraceEvent("FastRestore").detail("RestoreRequestInfo", request.toString()); - Version ver = wait(processRestoreRequest(self, cx, request)); + wait(success(processRestoreRequest(self, cx, request))); } } catch (Error& e) { TraceEvent(SevError, "FastRestoreFailed").detail("RestoreRequest", restoreRequests[restoreIndex].toString()); @@ -514,4 +514,4 @@ ACTOR static Future notifyRestoreCompleted(Reference se TraceEvent("FastRestore").detail("RestoreMaster", "RestoreCompleted"); return Void(); -} \ No newline at end of file +} diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 98a567cffd..f4c58c5528 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -35,7 +35,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "flow/actorcompiler.h" // has to be last include @@ -135,4 +135,4 @@ public: }; #include "flow/unactorcompiler.h" -#endif \ No newline at end of file +#endif diff --git 
a/fdbserver/RestoreWorker.actor.h b/fdbserver/RestoreWorker.actor.h index b17fe984c1..615ce18e39 100644 --- a/fdbserver/RestoreWorker.actor.h +++ b/fdbserver/RestoreWorker.actor.h @@ -34,7 +34,7 @@ #include #include -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreCommon.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" @@ -70,4 +70,4 @@ struct RestoreWorkerData : NonCopyable, public ReferenceCounted #include "flow/Stats.h" @@ -35,6 +38,7 @@ #include "fdbserver/CoordinationInterface.h" #include "fdbserver/Knobs.h" #include "fdbserver/RestoreUtil.h" +#include "flow/actorcompiler.h" // This must be the last #include. class RestoreConfigFR; @@ -467,7 +471,8 @@ struct RestoreRequest { std::string getRoleStr(RestoreRole role); ////--- Interface functions -Future _restoreWorker(Database const& cx, LocalityData const& locality); -Future restoreWorker(Reference const& ccf, LocalityData const& locality); +ACTOR Future _restoreWorker(Database cx, LocalityData locality); +ACTOR Future restoreWorker(Reference ccf, LocalityData locality); -#endif \ No newline at end of file +#include "flow/unactorcompiler.h" +#endif diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index efd80242f4..4c56421b1f 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1395,8 +1395,6 @@ ACTOR void setupAndRun(std::string dataFolder, const char *testFile, bool reboot state int extraDB = 0; state int minimumReplication = 0; state int minimumRegions = 0; - state float timeout = 5400; // old default is 5400 seconds - state float buggify_timeout = 36000.0; // old default is 36000 seconds checkExtraDB(testFile, extraDB, minimumReplication, minimumRegions); // TODO (IPv6) Use IPv6? 
diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 0f1b533bc0..5439bdf11b 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -34,7 +34,7 @@ #include "fdbclient/FailureMonitorClient.h" #include "fdbserver/CoordinationInterface.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/MoveKeys.actor.h" diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 653b3324ff..c215e3a9c2 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -330,7 +330,7 @@ - + diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 8266883298..6928f68e7b 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. // A workload which test the correctness of backup and restore process @@ -251,23 +251,19 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { state int retryCount = 0; loop { try { - tr.reset(); - state Version v = wait(tr.getReadVersion()); state Standalone data = wait( tr.getRange(firstGreaterOrEqual(doubleToTestKey(0.0, keyPrefix)), firstGreaterOrEqual(doubleToTestKey(1.0, keyPrefix)), std::numeric_limits::max())); printf("dump DB, at %s. 
retryCount:%d Data size:%d, rangeResultInfo:%s\n", when.c_str(), retryCount, data.size(), data.contents().toString().c_str()); dumpDBKVs(data, self); - break; + return Void(); } catch (Error& e) { retryCount++; TraceEvent(retryCount > 20 ? SevWarnAlways : SevWarn, "dumpDBError").error(e); wait(tr.onError(e)); } } - - return Void(); } virtual std::string description() { return "BackupAndParallelRestoreCorrectness"; } @@ -755,15 +751,6 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { state int64_t taskCount = wait(backupAgent.getTaskCount(tr)); state int waitCycles = 0; - if ((taskCount) && (0)) { - TraceEvent("BARW_EndingNonzeroTaskCount", randomID) - .detail("BackupTag", printable(self->backupTag)) - .detail("TaskCount", taskCount) - .detail("WaitCycles", waitCycles); - printf("EndingNonZeroTasks: %ld\n", (long)taskCount); - wait(TaskBucket::debugPrintRange(cx, LiteralStringRef("\xff"), StringRef())); - } - loop { waitCycles++; diff --git a/fdbserver/workloads/ConfigureDatabase.actor.cpp b/fdbserver/workloads/ConfigureDatabase.actor.cpp index eec11a54d4..1dcaf853f7 100644 --- a/fdbserver/workloads/ConfigureDatabase.actor.cpp +++ b/fdbserver/workloads/ConfigureDatabase.actor.cpp @@ -267,7 +267,6 @@ struct ConfigureDatabaseWorkload : TestWorkload { ACTOR Future singleDB( ConfigureDatabaseWorkload *self, Database cx ) { state Transaction tr; - state int i; loop { if(g_simulator.speedUpSimulation) { return Void(); diff --git a/fdbserver/workloads/MachineAttrition.actor.cpp b/fdbserver/workloads/MachineAttrition.actor.cpp index 9cd608b0e6..b9bef9f5c6 100644 --- a/fdbserver/workloads/MachineAttrition.actor.cpp +++ b/fdbserver/workloads/MachineAttrition.actor.cpp @@ -40,7 +40,7 @@ static std::set const& normalAttritionErrors() { ACTOR Future ignoreSSFailuresForDuration(Database cx, double duration) { // duration doesn't matter since this won't timeout TraceEvent("IgnoreSSFailureStart"); - bool _ = wait(setHealthyZone(cx, ignoreSSFailuresZoneString, 
0)); + wait(success(setHealthyZone(cx, ignoreSSFailuresZoneString, 0))); TraceEvent("IgnoreSSFailureWait"); wait(delay(duration)); TraceEvent("IgnoreSSFailureClear"); @@ -306,8 +306,8 @@ struct MachineAttritionWorkload : TestWorkload { state LocalityData targetMachine = self->machines.back(); if(BUGGIFY_WITH_PROB(0.01)) { TEST(true); //Marked a zone for maintenance before killing it - bool _ = - wait(setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20)); + wait(success( + setHealthyZone(cx, targetMachine.zoneId().get(), deterministicRandom()->random01() * 20))); } else if (BUGGIFY_WITH_PROB(0.005)) { TEST(true); // Disable DD for all storage server failures self->ignoreSSFailures = diff --git a/fdbserver/workloads/Mako.actor.cpp b/fdbserver/workloads/Mako.actor.cpp index c8482a5402..044ee49cbe 100644 --- a/fdbserver/workloads/Mako.actor.cpp +++ b/fdbserver/workloads/Mako.actor.cpp @@ -427,7 +427,7 @@ struct MakoWorkload : TestWorkload { ACTOR template static Future logLatency(Future f, ContinuousSample* opLatencies){ state double opBegin = now(); - T value = wait(f); + wait(success(f)); opLatencies->addSample(now() - opBegin); return Void(); } diff --git a/fdbserver/workloads/ParallelRestore.actor.cpp b/fdbserver/workloads/ParallelRestore.actor.cpp index d9f24c212c..5148476298 100644 --- a/fdbserver/workloads/ParallelRestore.actor.cpp +++ b/fdbserver/workloads/ParallelRestore.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.h" +#include "fdbserver/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
// A workload which test the correctness of backup and restore process diff --git a/fdbserver/workloads/SnapTest.actor.cpp b/fdbserver/workloads/SnapTest.actor.cpp index aaed65ce11..78cd7580ae 100644 --- a/fdbserver/workloads/SnapTest.actor.cpp +++ b/fdbserver/workloads/SnapTest.actor.cpp @@ -159,7 +159,6 @@ public: // workload functions keys.push_back(deterministicRandom()->randomInt64(0, INT64_MAX - 2)); } - state int retry = 0; tr.reset(); loop { try { @@ -190,6 +189,7 @@ public: // workload functions ACTOR Future _start(Database cx, SnapTestWorkload* self) { state Transaction tr(cx); + state bool snapFailed = false; if (self->testID == 0) { // create even keys before the snapshot @@ -202,7 +202,6 @@ public: // workload functions wait(delay(toDelay)); state int retry = 0; - state bool snapFailed = false; loop { self->snapUID = deterministicRandom()->randomUniqueID(); try { From daeb0e9ed622935b767d4f25872b811ca64db121 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 25 Sep 2019 23:53:06 -0700 Subject: [PATCH 100/184] Attempt to fix Makefile --- fdbserver/fdbserver.vcxproj | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 783bcb160c..58adb8f6f3 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -220,6 +220,9 @@ false + + false + false From 3f62d2b506cbff5856fa7df931f89f7d718ddd0a Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 00:29:16 -0700 Subject: [PATCH 101/184] Fix actual build --- fdbserver/RestoreWorkerInterface.actor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/RestoreWorkerInterface.actor.h b/fdbserver/RestoreWorkerInterface.actor.h index 01c33fc1a2..805b2b0a1c 100644 --- a/fdbserver/RestoreWorkerInterface.actor.h +++ b/fdbserver/RestoreWorkerInterface.actor.h @@ -22,8 +22,8 @@ // which are RestoreMaster, RestoreLoader, and RestoreApplier #pragma once -#if defined(NO_INTELLISENSE) && 
!defined(FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H) - #define FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H +#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_G_H) + #define FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_G_H #include "fdbserver/RestoreWorkerInterface.actor.g.h" #elif !defined(FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H) #define FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H From de8921b6602c5ee0f3d3ee604122c81210bc076d Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 00:18:37 -0700 Subject: [PATCH 102/184] Move RestoreWorkerInterface to fdbclient --- fdbclient/CMakeLists.txt | 1 + .../RestoreWorkerInterface.actor.h | 10 +++++----- fdbclient/SystemData.h | 2 +- fdbclient/fdbclient.vcxproj | 3 +++ fdbserver/CMakeLists.txt | 1 - fdbserver/RestoreApplier.actor.h | 2 +- fdbserver/RestoreLoader.actor.h | 2 +- fdbserver/RestoreRoleCommon.actor.h | 2 +- fdbserver/RestoreWorker.actor.h | 2 +- fdbserver/fdbserver.actor.cpp | 2 +- fdbserver/fdbserver.vcxproj.filters | 1 - .../BackupAndParallelRestoreCorrectness.actor.cpp | 2 +- fdbserver/workloads/ParallelRestore.actor.cpp | 2 +- 13 files changed, 17 insertions(+), 15 deletions(-) rename {fdbserver => fdbclient}/RestoreWorkerInterface.actor.h (98%) diff --git a/fdbclient/CMakeLists.txt b/fdbclient/CMakeLists.txt index d47bdb8334..da58789a11 100644 --- a/fdbclient/CMakeLists.txt +++ b/fdbclient/CMakeLists.txt @@ -48,6 +48,7 @@ set(FDBCLIENT_SRCS Notified.h ReadYourWrites.actor.cpp ReadYourWrites.h + RestoreWorkerInterface.actor.h RunTransaction.actor.h RYWIterator.cpp RYWIterator.h diff --git a/fdbserver/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h similarity index 98% rename from fdbserver/RestoreWorkerInterface.actor.h rename to fdbclient/RestoreWorkerInterface.actor.h index 805b2b0a1c..15f89a5a80 100644 --- a/fdbserver/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -22,11 +22,11 @@ // which are RestoreMaster, 
RestoreLoader, and RestoreApplier #pragma once -#if defined(NO_INTELLISENSE) && !defined(FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_G_H) - #define FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_G_H - #include "fdbserver/RestoreWorkerInterface.actor.g.h" -#elif !defined(FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H) - #define FDBSERVER_RESTORE_WORKER_INTERFACE_ACTOR_H +#if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H) + #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H + #include "fdbclient/RestoreWorkerInterface.actor.g.h" +#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H) + #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H #include #include "flow/Stats.h" diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index a80eaf5283..35e6e8ca30 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -26,7 +26,7 @@ #include "fdbclient/FDBTypes.h" #include "fdbclient/StorageServerInterface.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" struct RestoreLoaderInterface; struct RestoreApplierInterface; diff --git a/fdbclient/fdbclient.vcxproj b/fdbclient/fdbclient.vcxproj index be793d900d..974aa896a8 100644 --- a/fdbclient/fdbclient.vcxproj +++ b/fdbclient/fdbclient.vcxproj @@ -89,6 +89,9 @@ + + false + diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 3def051534..11f3d1f203 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -76,7 +76,6 @@ set(FDBSERVER_SRCS RestoreLoader.actor.cpp RestoreWorker.actor.h RestoreWorker.actor.cpp - RestoreWorkerInterface.actor.h Resolver.actor.cpp ResolverInterface.h ServerDBInfo.h diff --git a/fdbserver/RestoreApplier.actor.h b/fdbserver/RestoreApplier.actor.h index 0fa0efc785..038d3c3d4a 100644 --- a/fdbserver/RestoreApplier.actor.h +++ b/fdbserver/RestoreApplier.actor.h @@ -34,7 +34,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include 
"fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreRoleCommon.actor.h" diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index b893eecba7..0c1f6023b2 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -34,7 +34,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbserver/CoordinationInterface.h" #include "fdbrpc/Locality.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreCommon.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index f4c58c5528..b47a68998e 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -35,7 +35,7 @@ #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "flow/actorcompiler.h" // has to be last include diff --git a/fdbserver/RestoreWorker.actor.h b/fdbserver/RestoreWorker.actor.h index 615ce18e39..7b26899ab9 100644 --- a/fdbserver/RestoreWorker.actor.h +++ b/fdbserver/RestoreWorker.actor.h @@ -34,7 +34,7 @@ #include #include -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/RestoreUtil.h" #include "fdbserver/RestoreCommon.actor.h" #include "fdbserver/RestoreRoleCommon.actor.h" diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index 5439bdf11b..cac1789297 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -34,7 +34,7 @@ #include "fdbclient/FailureMonitorClient.h" 
#include "fdbserver/CoordinationInterface.h" #include "fdbserver/WorkerInterface.actor.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "fdbserver/ClusterRecruitmentInterface.h" #include "fdbserver/ServerDBInfo.h" #include "fdbserver/MoveKeys.actor.h" diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index c215e3a9c2..348278eea7 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -330,7 +330,6 @@ - diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 6928f68e7b..0047633a13 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. // A workload which test the correctness of backup and restore process diff --git a/fdbserver/workloads/ParallelRestore.actor.cpp b/fdbserver/workloads/ParallelRestore.actor.cpp index 5148476298..aac39b592d 100644 --- a/fdbserver/workloads/ParallelRestore.actor.cpp +++ b/fdbserver/workloads/ParallelRestore.actor.cpp @@ -23,7 +23,7 @@ #include "fdbclient/BackupContainer.h" #include "fdbserver/workloads/workloads.actor.h" #include "fdbserver/workloads/BulkSetup.actor.h" -#include "fdbserver/RestoreWorkerInterface.actor.h" +#include "fdbclient/RestoreWorkerInterface.actor.h" #include "flow/actorcompiler.h" // This must be the last #include. 
// A workload which test the correctness of backup and restore process From aed9dfd1481c6dfb05743a90f764820f5817f7aa Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 00:39:31 -0700 Subject: [PATCH 103/184] Fix flow header guard --- fdbclient/RestoreWorkerInterface.actor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index 15f89a5a80..d5155c3168 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -25,8 +25,8 @@ #if defined(NO_INTELLISENSE) && !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H) #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H #include "fdbclient/RestoreWorkerInterface.actor.g.h" -#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H) - #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_G_H +#elif !defined(FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H) + #define FDBCLIENT_RESTORE_WORKER_INTERFACE_ACTOR_H #include #include "flow/Stats.h" From a3d9e549eef13e5b38f1d92b3c6910238bb18bcf Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 00:42:21 -0700 Subject: [PATCH 104/184] Remove rule from vcxproj --- fdbserver/fdbserver.vcxproj | 3 --- 1 file changed, 3 deletions(-) diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 58adb8f6f3..783bcb160c 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -220,9 +220,6 @@ false - - false - false From b893374c6851ef139ab4653e6d049c593a8d5c01 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 09:34:01 -0700 Subject: [PATCH 105/184] Add -Wno-attributes for gcc --- cmake/ConfigureCompiler.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/ConfigureCompiler.cmake b/cmake/ConfigureCompiler.cmake index 04cbd377db..47f95b6d22 100644 --- a/cmake/ConfigureCompiler.cmake +++ b/cmake/ConfigureCompiler.cmake @@ -217,7 +217,13 @@ 
else() else() add_compile_options(-Werror) endif() - add_compile_options($<$:-Wno-pragmas>) + if (GCC) + add_compile_options(-Wno-pragmas) + + # Otherwise `state [[maybe_unused]] int x;` will issue a warning. + # https://stackoverflow.com/questions/50646334/maybe-unused-on-member-variable-gcc-warns-incorrectly-that-attribute-is + add_compile_options(-Wno-attributes) + endif() add_compile_options(-Wno-error=format -Wunused-variable -Wno-deprecated From a00f04eb203ddb3da4ddc7e0f478b4d7be27848a Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 26 Sep 2019 09:43:51 -0700 Subject: [PATCH 106/184] Fix gcc with Make --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 875ca76593..79f2cb05ec 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,8 @@ ifeq ($(PLATFORM),Linux) ifneq '' '$(findstring clang++,$(CXX))' CXXFLAGS += -Wno-undefined-var-template -Wno-unknown-warning-option -Wno-unused-command-line-argument -Wno-register -Wno-logical-op-parentheses + else + CXXFLAGS += -Wno-attributes endif CXXFLAGS += -std=c++17 From e4acd2e318db3490940e6abf0d7fac50ea8f0cfe Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 2 Oct 2019 11:57:30 -0700 Subject: [PATCH 107/184] Disable TLS temporarily for OPEN_FOR_IDE build --- flow/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 84184156c3..233e4e369f 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -92,7 +92,8 @@ target_link_libraries(flow PUBLIC boost_target Threads::Threads ${CMAKE_DL_LIBS} if(USE_VALGRIND) target_link_libraries(flow PUBLIC Valgrind) endif() -if(NOT WITH_TLS) +# TODO(atn34) Re-enable TLS for OPEN_FOR_IDE build once #2201 is resolved +if(NOT WITH_TLS OR OPEN_FOR_IDE) target_compile_definitions(flow PUBLIC TLS_DISABLED) else() target_link_libraries(flow PUBLIC FDBLibTLS) From 1827e77f2ed146d382cfc2b400911eb1d42680fc Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Thu, 24 Oct 2019 
15:56:22 -0700 Subject: [PATCH 108/184] Update fdbserver/FDBExecHelper.actor.cpp Co-Authored-By: Jingyu Zhou --- fdbserver/FDBExecHelper.actor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fdbserver/FDBExecHelper.actor.cpp b/fdbserver/FDBExecHelper.actor.cpp index f9608acefc..d435320989 100644 --- a/fdbserver/FDBExecHelper.actor.cpp +++ b/fdbserver/FDBExecHelper.actor.cpp @@ -142,7 +142,7 @@ ACTOR Future spawnProcess(std::string binPath, std::vector par #endif ACTOR Future execHelper(ExecCmdValueString* execArg, UID snapUID, std::string folder, std::string role) { - state Standalone uidStr = Standalone(snapUID.toString()); + state Standalone uidStr(snapUID.toString()); state int err = 0; state Future cmdErr; state double maxWaitTime = SERVER_KNOBS->SNAP_CREATE_MAX_TIMEOUT; From 0953bf376d1449e710662e427dfe7423e936dad1 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 25 Oct 2019 13:38:04 -0700 Subject: [PATCH 109/184] fixed Javadoc headings --- bindings/java/src/main/com/apple/foundationdb/FDB.java | 6 +++--- .../java/src/main/com/apple/foundationdb/tuple/Tuple.java | 4 ++-- bindings/java/src/main/overview.html.in | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bindings/java/src/main/com/apple/foundationdb/FDB.java b/bindings/java/src/main/com/apple/foundationdb/FDB.java index e20fa90432..621417256d 100644 --- a/bindings/java/src/main/com/apple/foundationdb/FDB.java +++ b/bindings/java/src/main/com/apple/foundationdb/FDB.java @@ -30,7 +30,7 @@ import java.util.concurrent.atomic.AtomicInteger; /** * The starting point for accessing FoundationDB. *
- *

Setting API version

+ *

Setting API version

* The FoundationDB API is accessed with a call to {@link #selectAPIVersion(int)}. * This call is required before using any other part of the API. The call allows * an error to be thrown at this point to prevent client code from accessing a later library @@ -49,11 +49,11 @@ import java.util.concurrent.atomic.AtomicInteger; * being used to connect to the cluster. In particular, you should not advance * the API version of your application after upgrading your client until the * cluster has also been upgraded.
- *

Getting a database

+ *

Getting a database

* Once the API version has been set, the easiest way to get a {@link Database} object to use is * to call {@link #open}. *
- *

Client networking

+ *

Client networking

* The network is started either implicitly with a call to a variant of {@link #open()} * or started explicitly with a call to {@link #startNetwork()}. *
diff --git a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java index e5556faaa6..70dde8d2b5 100644 --- a/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java +++ b/bindings/java/src/main/com/apple/foundationdb/tuple/Tuple.java @@ -39,7 +39,7 @@ import com.apple.foundationdb.Range; * the same order in which they would sort in FoundationDB. {@code Tuple}s sort * first by the first element, then by the second, etc. This makes the tuple layer * ideal for building a variety of higher-level data models.
- *

Types

+ *

Types

* A {@code Tuple} can * contain byte arrays ({@code byte[]}), {@link String}s, {@link Number}s, {@link UUID}s, * {@code boolean}s, {@link List}s, {@link Versionstamp}s, other {@code Tuple}s, and {@code null}. @@ -50,7 +50,7 @@ import com.apple.foundationdb.Range; * a {@code long} integral value, so the range will be constrained to * [{@code -2^63}, {@code 2^63-1}]. Note that for numbers outside this range the way that Java * truncates integral values may yield unexpected results.
- *

{@code null} values

+ *

{@code null} values

* The FoundationDB tuple specification has a special type-code for {@code None}; {@code nil}; or, * as Java would understand it, {@code null}. * The behavior of the layer in the presence of {@code null} varies by type with the intention diff --git a/bindings/java/src/main/overview.html.in b/bindings/java/src/main/overview.html.in index d594b769e3..648a4e3478 100644 --- a/bindings/java/src/main/overview.html.in +++ b/bindings/java/src/main/overview.html.in @@ -2,7 +2,7 @@ This documents the client API for using FoundationDB from Java.

-

Installation

+

Installation

FoundationDB's Java bindings rely on native libraries that are installed as part of the FoundationDB client binaries installation (see @@ -10,7 +10,7 @@ Installing FoundationDB client binaries). The JAR can be downloaded from our website and then added to your classpath.

-

Getting started

+

Getting started

To start using FoundationDB from Java, create an instance of the {@link com.apple.foundationdb.FDB FoundationDB API interface} with the version of the API that you want to use (this release of the FoundationDB Java API supports versions between {@code 510} and {@code 620}). @@ -50,7 +50,7 @@ public class Example { } } -

FoundationDB {@link com.apple.foundationdb.tuple Tuple API}

+

FoundationDB {@link com.apple.foundationdb.tuple Tuple API}

The {@link com.apple.foundationdb.tuple Tuple API} is provided with the core Java API for FoundationDB. This layer is provided in some form in all official language bindings. It enables cross-language support for storing and retrieving typed data from the @@ -60,7 +60,7 @@ binary data that FoundationDB supports. And, just as importantly, data packed in and general Tuple documentation for information about how Tuples sort and can be used to efficiently model data.
-

FoundationDB {@link com.apple.foundationdb.directory Directory API}

+

FoundationDB {@link com.apple.foundationdb.directory Directory API}

The {@link com.apple.foundationdb.directory Directory API} is provided with the core Java API for FoundationDB. This layer is provided in some form in all official language bindings. The FoundationDB API provides directories as a tool for From 2ee1782c19bf21b51a5c26bc218798e6f5ef5b66 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Fri, 25 Oct 2019 14:52:06 -0700 Subject: [PATCH 110/184] Bug fixes in Redwood. BTree height was not being reset when a new empty root is written. IKeyValueStore wrapper was not obeying the row limit in a reverse range query. Added yields to and delays to break up tasks and set IO priorities. --- fdbserver/VersionedBTree.actor.cpp | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 9f5db9a5f7..10f9636178 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1092,6 +1092,10 @@ public: // If the user chosen physical page size is larger, then there will be a gap of unused space after // between the end of page 1 and the start of page 2. 
ACTOR static Future> readHeaderPage(COWPager *self, PhysicalPageID pageID) { + if(g_network->getCurrentTask() > TaskPriority::DiskRead) { + wait(delay(0, TaskPriority::DiskRead)); + } + state Reference page(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)); int readBytes = wait(self->pageFile->read(page->mutate(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); debug_printf("COWPager(%s) header op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); @@ -1100,6 +1104,10 @@ public: } ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { + if(g_network->getCurrentTask() > TaskPriority::DiskRead) { + wait(delay(0, TaskPriority::DiskRead)); + } + state Reference page = self->newPageBuffer(); debug_printf("COWPager(%s) op=read_physical_start %s\n", self->filename.c_str(), toString(pageID).c_str()); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); @@ -1200,11 +1208,17 @@ public: debug_printf("COWPager(%s) Syncing\n", self->filename.c_str()); // Sync everything except the header + if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + wait(delay(0, TaskPriority::DiskWrite)); + } wait(self->pageFile->sync()); debug_printf("COWPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); // Update header on disk and sync again. 
wait(self->writeHeaderPage(0, self->headerPage)); + if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { + wait(delay(0, TaskPriority::DiskWrite)); + } wait(self->pageFile->sync()); debug_printf("COWPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); @@ -2275,10 +2289,10 @@ struct BTreePage { } }; -static void makeEmptyPage(Reference page, uint8_t newFlags) { +static void makeEmptyRoot(Reference page) { BTreePage *btpage = (BTreePage *)page->begin(); btpage->formatVersion = BTreePage::FORMAT_VERSION; - btpage->flags = newFlags; + btpage->flags = BTreePage::IS_LEAF; btpage->height = 1; btpage->kvBytes = 0; btpage->itemCount = 0; @@ -2641,7 +2655,7 @@ public: self->m_header.height = 1; ++latest; Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, BTreePage::IS_LEAF); + makeEmptyRoot(page); self->m_pager->updatePage(id, page); self->m_pager->setCommitVersion(latest); @@ -3232,6 +3246,7 @@ private: childPageID.push_back(records.arena(), id); } } + wait(yield()); // Update activity counts ++counts.pageWrites; @@ -3331,7 +3346,7 @@ private: debug_printf("readPage() op=readForDeferredClear %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); } - wait(delay(0, TaskPriority::DiskRead)); + wait(yield()); state Reference page; @@ -3815,7 +3830,8 @@ private: debug_printf("Writing new empty root.\n"); LogicalPageID newRootID = wait(self->m_pager->newPageID()); Reference page = self->m_pager->newPageBuffer(); - makeEmptyPage(page, BTreePage::IS_LEAF); + makeEmptyRoot(page); + self->m_header.height = 1; self->m_pager->updatePage(newRootID, page); rootPageID = BTreePageID((LogicalPageID *)&newRootID, 1); } @@ -4513,7 +4529,7 @@ public: KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); result.push_back(result.arena(), kv); - if(--rowLimit == 0 || accumulatedBytes >= byteLimit) { + if(++rowLimit == 0 || 
accumulatedBytes >= byteLimit) { break; } wait(cur->prev(true)); From b7b5d2ead35ef43db89d85a6d9702d00291f320d Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Sat, 26 Oct 2019 14:29:05 -0700 Subject: [PATCH 111/184] Remove several nonsensical const uses These seem to be all the ones that clang's -Wignored-qualifiers complains about --- fdbclient/FDBTypes.h | 2 +- fdbclient/FileBackupAgent.actor.cpp | 4 ++-- fdbclient/SystemData.cpp | 2 +- fdbclient/SystemData.h | 2 +- fdbserver/RestoreCommon.actor.cpp | 4 ++-- fdbserver/RestoreRoleCommon.actor.h | 8 ++++---- flow/Arena.h | 2 +- flow/ObjectSerializerTraits.h | 6 +++--- flow/flat_buffers.h | 2 +- flow/flow.h | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 76c74c41b9..a83c56a7d8 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -93,7 +93,7 @@ struct struct_like_traits : std::true_type { } template - static const void assign(Member& m, const Type& t, Context&) { + static void assign(Member& m, const Type& t, Context&) { if constexpr (i == 0) { m.id = t; } else { diff --git a/fdbclient/FileBackupAgent.actor.cpp b/fdbclient/FileBackupAgent.actor.cpp index 84efc5013b..4dd057a48e 100644 --- a/fdbclient/FileBackupAgent.actor.cpp +++ b/fdbclient/FileBackupAgent.actor.cpp @@ -572,8 +572,8 @@ namespace fileBackup { // Functions for consuming big endian (network byte order) integers. // Consumes a big endian number, swaps it to little endian, and returns it. 
- const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} - const uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume< int32_t>());} + uint32_t consumeNetworkUInt32() { return bigEndian32( consume());} bool eof() { return rptr == end; } diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 8d80d50f3e..5f1b4b03d7 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -678,7 +678,7 @@ const Value restoreRequestTriggerValue(UID randomID, int const numRequests) { wr << randomID; return wr.toValue(); } -const int decodeRestoreRequestTriggerValue(ValueRef const& value) { +int decodeRestoreRequestTriggerValue(ValueRef const& value) { int s; UID randomID; BinaryReader reader(value, IncludeVersion()); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index 35e6e8ca30..dd40289902 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -302,7 +302,7 @@ const Key restoreWorkerKeyFor(UID const& workerID); const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server); RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value); const Value restoreRequestTriggerValue(UID randomUID, int const numRequests); -const int decodeRestoreRequestTriggerValue(ValueRef const& value); +int decodeRestoreRequestTriggerValue(ValueRef const& value); const Value restoreRequestDoneVersionValue(Version readVersion); Version decodeRestoreRequestDoneVersionValue(ValueRef const& value); const Key restoreRequestKeyFor(int const& index); diff --git a/fdbserver/RestoreCommon.actor.cpp b/fdbserver/RestoreCommon.actor.cpp index ca2da8901c..d8689d136f 100644 --- a/fdbserver/RestoreCommon.actor.cpp +++ b/fdbserver/RestoreCommon.actor.cpp @@ -322,8 +322,8 @@ struct StringRefReader { // Functions for consuming big endian (network byte order) integers. 
// Consumes a big endian number, swaps it to little endian, and returns it. - const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } - const uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } + uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } bool eof() { return rptr == end; } diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index b47a68998e..81120d87b7 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -90,12 +90,12 @@ struct StringRefReaderMX { // Functions for consuming big endian (network byte oselfer) integers. // Consumes a big endian number, swaps it to little endian, and returns it. - const int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } - const uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } + int32_t consumeNetworkInt32() { return (int32_t)bigEndian32((uint32_t)consume()); } + uint32_t consumeNetworkUInt32() { return bigEndian32(consume()); } // Convert big Endian value (e.g., encoded in log file) into a littleEndian uint64_t value. 
- const int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume()); } - const uint64_t consumeNetworkUInt64() { return bigEndian64(consume()); } + int64_t consumeNetworkInt64() { return (int64_t)bigEndian64((uint32_t)consume()); } + uint64_t consumeNetworkUInt64() { return bigEndian64(consume()); } bool eof() { return rptr == end; } diff --git a/flow/Arena.h b/flow/Arena.h index 3af189c8b4..4d8b5aa914 100644 --- a/flow/Arena.h +++ b/flow/Arena.h @@ -468,7 +468,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& member, const U& t, Context&) { + static void assign(Member& member, const U& t, Context&) { member = t; } }; diff --git a/flow/ObjectSerializerTraits.h b/flow/ObjectSerializerTraits.h index 2f560f441c..dc3dd8c9ae 100644 --- a/flow/ObjectSerializerTraits.h +++ b/flow/ObjectSerializerTraits.h @@ -133,7 +133,7 @@ struct union_like_traits : std::false_type { static const index_t& get(const Member&, Context&); template - static const void assign(Member&, const Alternative&, Context&); + static void assign(Member&, const Alternative&, Context&); template static void done(Member&, Context&); @@ -150,7 +150,7 @@ struct struct_like_traits : std::false_type { static const index_t& get(const Member&, Context&); template - static const void assign(Member&, const index_t&, Context&); + static void assign(Member&, const index_t&, Context&); template static void done(Member&, Context&); @@ -175,7 +175,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& member, const Alternative& a, Context&) { + static void assign(Member& member, const Alternative& a, Context&) { static_assert(std::is_same_v, Alternative>); member = a; } diff --git a/flow/flat_buffers.h b/flow/flat_buffers.h index 4794773a85..33e1cbedc9 100644 --- a/flow/flat_buffers.h +++ b/flow/flat_buffers.h @@ -73,7 +73,7 @@ struct struct_like_traits> : std::true_type { } template - static const void 
assign(Member& m, const Type& t, Context&) { + static void assign(Member& m, const Type& t, Context&) { std::get(m) = t; } }; diff --git a/flow/flow.h b/flow/flow.h index ecf25397d8..67e8bf6706 100644 --- a/flow/flow.h +++ b/flow/flow.h @@ -225,7 +225,7 @@ struct union_like_traits> : std::true_type { } template - static const void assign(Member& m, const Alternative& a, Context&) { + static void assign(Member& m, const Alternative& a, Context&) { if constexpr (i == 0) { m = a; } else { From 0d993522d3e054de4a3dfe0f457736394a4f27b2 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 28 Oct 2019 04:00:37 -0700 Subject: [PATCH 112/184] CommitSubtree() will now return an empty page set even for the tree root because commit_impl() handles this correctly. Improved commitSubtree() debug output related to which mutations are relevant to a subtree. Added random setting of range clear boundaries after clear() in Redwood correctness to make sure mutation buffer logic handles this correctly. B+Tree's dbEnd mutation is represented as a clear to prevent unnecessary rightmost subtree traversal during commit. --- fdbserver/IPager.h | 3 ++ fdbserver/VersionedBTree.actor.cpp | 86 +++++++++++++++++------------- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 35549ac096..25def8487d 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -209,6 +209,9 @@ public: virtual StorageBytes getStorageBytes() = 0; + // Count of pages in use by the pager client + virtual int64_t getUserPageCount() = 0; + // Future returned is ready when pager has been initialized from disk and is ready for reads and writes. // It is invalid to call most other functions until init() is ready. // TODO: Document further. 
diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 10f9636178..b0d7e40c0e 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1310,7 +1310,7 @@ public: } // Get the number of pages in use but not by the pager itself. - int64_t getUserPageCount() { + int64_t getUserPageCount() override { int userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries; debug_printf("COWPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 "\n", filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries); return userPages; @@ -2716,10 +2716,13 @@ public: // When starting a new mutation buffer its start version must be greater than the last write version ASSERT(v > m_writeVersion); m_pBuffer = &m_mutationBuffers[v]; + // Create range representing the entire keyspace. This reduces edge cases to applying mutations // because now all existing keys are within some range in the mutation map. - (*m_pBuffer)[dbBegin.key]; - (*m_pBuffer)[dbEnd.key]; + (*m_pBuffer)[dbBegin.key] = RangeMutation(); + // Setting the dbEnd key to be cleared prevents having to treat a range clear to dbEnd as a special + // case in order to avoid traversing down the rightmost edge of the tree. + (*m_pBuffer)[dbEnd.key].startKeyMutations[0] = SingleKeyMutation(); } else { // It's OK to set the write version to the same version repeatedly so long as m_pBuffer is not null @@ -2750,14 +2753,19 @@ public: self->setWriteVersion(self->getLatestVersion() + 1); } + // The lazy delete queue should now be empty and contain only the new page to start writing to + // on the next commit. 
LazyDeleteQueueT::QueueState s = self->m_lazyDeleteQueue.getState(); ASSERT(s.numEntries == 0); ASSERT(s.numPages == 1); - debug_printf("rootPageCount %d\n", self->m_header.root.count); + // The btree should now be a single non-oversized root page. ASSERT(self->m_header.height == 1); - // All that should be in use now is the root page and the lazy delete queue empty page. - ASSERT(((COWPager *)self->m_pager)->getUserPageCount() == self->m_header.root.count + 1); + ASSERT(self->m_header.root.count == 1); + + // From the pager's perspective the only pages that should be in use are the btree root and + // the previously mentioned lazy delete queue page. + ASSERT(self->m_pager->getUserPageCount() == 2); return Void(); } @@ -3033,22 +3041,6 @@ private: LazyDeleteQueueT m_lazyDeleteQueue; int m_maxPartSize; - void printMutationBuffer(MutationBufferT::const_iterator begin, MutationBufferT::const_iterator end) const { -#if REDWOOD_DEBUG - debug_printf("-------------------------------------\n"); - debug_printf("BUFFER\n"); - while(begin != end) { - debug_printf("'%s': %s\n", printable(begin->first).c_str(), begin->second.toString().c_str()); - ++begin; - } - debug_printf("-------------------------------------\n"); -#endif - } - - void printMutationBuffer(MutationBufferT *buf) const { - return printMutationBuffer(buf->begin(), buf->end()); - } - // Find or create a mutation buffer boundary for bound and return an iterator to it MutationBufferT::iterator insertMutationBoundary(Key boundary) { ASSERT(m_pBuffer != nullptr); @@ -3413,7 +3405,16 @@ private: state MutationBufferT::const_iterator iMutationBoundaryEnd = mutationBuffer->lower_bound(upperBound->key); if(REDWOOD_DEBUG) { - self->printMutationBuffer(iMutationBoundary, iMutationBoundaryEnd); + debug_printf("%s ---------MUTATION BUFFER SLICE ---------------------\n", context.c_str()); + auto begin = iMutationBoundary; + while(1) { + debug_printf("%s Mutation: '%s': %s\n", context.c_str(), printable(begin->first).c_str(), 
begin->second.toString().c_str()); + if(begin == iMutationBoundaryEnd) { + break; + } + ++begin; + } + debug_printf("%s -------------------------------------\n", context.c_str()); } // If the boundary range iterators are the same then upperbound and lowerbound have the same key. @@ -3437,6 +3438,8 @@ private: return results; } + // If one mutation range covers the entire subtree, then check if the entire subtree is modified, + // unmodified, or possibly/partially modified. MutationBufferT::const_iterator iMutationBoundaryNext = iMutationBoundary; ++iMutationBoundaryNext; // If one mutation range covers the entire page @@ -3479,20 +3482,13 @@ private: cursor.moveFirst(); state Version writeVersion; - state bool isRoot = (rootID == self->m_header.root.get()); // Leaf Page if(page->flags & BTreePage::IS_LEAF) { ASSERT(isLeaf); state Standalone> merged; - debug_printf("%s MERGING EXISTING DATA WITH MUTATIONS:\n", context.c_str()); - if(REDWOOD_DEBUG) { - self->printMutationBuffer(iMutationBoundary, iMutationBoundaryEnd); - } - - // It's a given that the mutation map is not empty so it's safe to do this - Key mutationRangeStart = iMutationBoundary->first; + debug_printf("%s Leaf page, merging changes.\n", context.c_str()); // If replacement pages are written they will be at the minimum version seen in the mutations for this leaf Version minVersion = invalidVersion; @@ -3635,7 +3631,7 @@ private: writeVersion = self->singleVersion ? 
self->getLastCommittedVersion() + 1 : minVersion; // If everything in the page was deleted then this page should be deleted as of the new version // Note that if a single range clear covered the entire page then we should not get this far - if(merged.empty() && !isRoot) { + if(merged.empty()) { debug_printf("%s All leaf page contents were cleared, returning %s\n", context.c_str(), toString(results).c_str()); self->freeBtreePage(rootID, writeVersion); return results; @@ -3812,10 +3808,6 @@ private: state Version latestVersion = self->m_pager->getLatestVersion(); debug_printf("%s: pager latestVersion %" PRId64 "\n", self->m_name.c_str(), latestVersion); - if(REDWOOD_DEBUG) { - self->printMutationBuffer(mutations); - } - state Standalone rootPageID = self->m_header.root.get(); state RedwoodRecordRef lowerBound = dbBegin.withPageID(rootPageID); Standalone versionedRoots = wait(commitSubtree(self, mutations, self->m_pager->getReadSnapshot(latestVersion), rootPageID, self->m_header.height == 1, &lowerBound, &dbEnd, &lowerBound, &dbEnd)); @@ -4368,12 +4360,12 @@ private: return Void(); } + debug_printf("readFullKVPair: Split, first record %s\n", rec.toString().c_str()); + // Split value, need to coalesce split value parts into a buffer in arena, // after which cur1 will point to the first part and kv.key will reference its key ASSERT(rec.chunk.start + rec.value.get().size() == rec.chunk.total); - debug_printf("readFullKVPair: Split, totalsize %d %s\n", rec.chunk.total, self->toString().c_str()); - // Allocate space for the entire value in the same arena as the key state int bytesLeft = rec.chunk.total; state StringRef dst = makeString(bytesLeft, self->m_arena); @@ -5401,6 +5393,7 @@ TEST_CASE("!/redwood/correctness/btree") { state int maxCommitSize = shortTest ? 1000 : randomSize(std::min((maxKeySize + maxValueSize) * 20000, 10e6)); state int mutationBytesTarget = shortTest ? 
5000 : randomSize(std::min(maxCommitSize * 100, 100e6)); state double clearProbability = deterministicRandom()->random01() * .1; + state double clearPostSetProbability = deterministicRandom()->random01() * .1; state double coldStartProbability = deterministicRandom()->random01(); state double advanceOldVersionProbability = deterministicRandom()->random01(); state double maxWallClockDuration = 60; @@ -5415,6 +5408,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("maxCommitSize: %d\n", maxCommitSize); printf("mutationBytesTarget: %d\n", mutationBytesTarget); printf("clearProbability: %f\n", clearProbability); + printf("clearPostSetProbability: %f\n", clearPostSetProbability); printf("coldStartProbability: %f\n", coldStartProbability); printf("advanceOldVersionProbability: %f\n", advanceOldVersionProbability); printf("\n"); @@ -5518,6 +5512,22 @@ TEST_CASE("!/redwood/correctness/btree") { } btree->clear(range); + + // Sometimes set the range start after the clear + if(deterministicRandom()->random01() < clearPostSetProbability) { + KeyValue kv = randomKV(0, maxValueSize); + kv.key = range.begin; + btree->set(kv); + written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); + } + + // Sometimes set the range end after the clear + if(deterministicRandom()->random01() < clearPostSetProbability) { + KeyValue kv = randomKV(0, maxValueSize); + kv.key = range.end; + btree->set(kv); + written[std::make_pair(kv.key.toString(), version)] = kv.value.toString(); + } } else { // Set a key From 40d53e23f5cdb1650b82eac1232f88df5e6b82dc Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 28 Oct 2019 16:05:11 -0700 Subject: [PATCH 113/184] Optimization, only the first btree mutation boundary for a subtree needs to be compared to the subtree's lower bound. Also removed a check for a condition which is no longer possible due to other changes. 
--- fdbserver/VersionedBTree.actor.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index b0d7e40c0e..c890bca3fb 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3495,24 +3495,22 @@ private: int changes = 0; // Now, process each mutation range and merge changes with existing data. + bool firstMutationBoundary = true; while(iMutationBoundary != iMutationBoundaryEnd) { debug_printf("%s New mutation boundary: '%s': %s\n", context.c_str(), printable(iMutationBoundary->first).c_str(), iMutationBoundary->second.toString().c_str()); SingleKeyMutationsByVersion::const_iterator iMutations; - // If the mutation boundary key is less than the lower bound key then skip startKeyMutations for - // this bounary, we're only processing this mutation range here to apply any clears to existing data. - if(iMutationBoundary->first < lowerBound->key) { + // For the first mutation boundary only, if the boundary key is less than the lower bound for the page + // then skip startKeyMutations for this boundary, we're only processing this mutation range here to apply + // a possible clear to existing data. + if(firstMutationBoundary && iMutationBoundary->first < lowerBound->key) { iMutations = iMutationBoundary->second.startKeyMutations.end(); } - // If the mutation boundary key is the same as the page lowerBound key then start reading single - // key mutations at the first version greater than the lowerBound key's version. 
- else if(!self->singleVersion && iMutationBoundary->first == lowerBound->key) { - iMutations = iMutationBoundary->second.startKeyMutations.upper_bound(lowerBound->version); - } else { iMutations = iMutationBoundary->second.startKeyMutations.begin(); } + firstMutationBoundary = false; SingleKeyMutationsByVersion::const_iterator iMutationsEnd = iMutationBoundary->second.startKeyMutations.end(); From 9c0d671d071bca85a83fede74f4571bf0a505c65 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 29 Oct 2019 01:31:59 -0700 Subject: [PATCH 114/184] Two bug fixes in Redwood related to split KV pairs and one was masking the other. The first bug resulted in an incomplete erasure of fragments for a split KV pair and the second bug would generate an unnecessary explicit null record for the same key which would cause reads to correctly see the key as missing. Redwood correctness test now clears the tree and verifies expected resulting pager footprint, which succeeds due to the bug fixes. --- fdbserver/VersionedBTree.actor.cpp | 124 +++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 25 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index c890bca3fb..23da056d72 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2950,6 +2950,18 @@ private: // A clear range version, if cleared, for the range starting immediately AFTER the start key Optional rangeClearVersion; + bool keyCleared() const { + return startKeyMutations.size() == 1 && startKeyMutations.begin()->second.isClear(); + } + + bool keyChanged() const { + return !startKeyMutations.empty(); + } + + bool rangeCleared() const { + return rangeClearVersion.present(); + } + // Returns true if this RangeMutation doesn't actually mutate anything bool noChanges() const { return !rangeClearVersion.present() && startKeyMutations.empty(); @@ -3417,10 +3429,15 @@ private: debug_printf("%s -------------------------------------\n", 
context.c_str()); } - // If the boundary range iterators are the same then upperbound and lowerbound have the same key. - // If the key is being mutated, them remove this subtree. + // iMutationBoundary is greatest boundary <= lowerBound->key + // iMutationBoundaryEnd is least boundary >= upperBound->key + + // If the boundary range iterators are the same then this subtree only has one unique key, which is the same key as the boundary + // record the iterators are pointing to. There are only two outcomes possible: Clearing the subtree or leaving it alone. + // If there are any changes to the one key then the entire subtree should be deleted as the changes for the key + // do not go into this subtree. if(iMutationBoundary == iMutationBoundaryEnd) { - if(!iMutationBoundary->second.startKeyMutations.empty()) { + if(iMutationBoundary->second.keyChanged()) { debug_printf("%s lower and upper bound key/version match and key is modified so deleting page, returning %s\n", context.c_str(), toString(results).c_str()); Version firstKeyChangeVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : iMutationBoundary->second.startKeyMutations.begin()->first; if(isLeaf) { @@ -3442,25 +3459,60 @@ // unmodified, or possibly/partially modified. MutationBufferT::const_iterator iMutationBoundaryNext = iMutationBoundary; ++iMutationBoundaryNext; - // If one mutation range covers the entire page if(iMutationBoundaryNext == iMutationBoundaryEnd) { - // If there are no changes in the range (no clear, no boundary key mutations) - // OR there are changes but for a key that is less than the page lower boundary and therefore not part of this page - if(iMutationBoundary->second.noChanges() || - ( !iMutationBoundary->second.rangeClearVersion.present() && iMutationBoundary->first < lowerBound->key) - ) { + // Cleared means the entire range covering the subtree was cleared.
It is assumed true + // if the range starting after the lower mutation boundary was cleared, and then proven false + // below if possible. + bool cleared = iMutationBoundary->second.rangeCleared(); + // Unchanged means the entire range covering the subtree was unchanged, it is assumed to be the + // opposite of cleared() and then proven false below if possible. + bool unchanged = !cleared; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + + // If the lower mutation boundary key is the same as the subtree lower bound then whether or not + // that key is being changed or cleared affects this subtree. + if(iMutationBoundary->first == lowerBound->key) { + // If subtree will be cleared (so far) but the lower boundary key is not cleared then the subtree is not cleared + if(cleared && !iMutationBoundary->second.keyCleared()) { + cleared = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + // If the subtree looked unchanged (so far) but the lower boundary is changed then the subtree is changed + if(unchanged && iMutationBoundary->second.keyChanged()) { + unchanged = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + } + + // If the higher mutation boundary key is the same as the subtree upper bound key then whether + // or not it is being changed or cleared affects this subtree. + if((cleared || unchanged) && iMutationBoundaryEnd->first == upperBound->key) { + // If the key is being changed then the records in this subtree with the same key must be removed + // so the subtree is definitely not unchanged, though it may be cleared to achieve the same effect.
+ if(iMutationBoundaryEnd->second.keyChanged()) { + unchanged = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + else { + // If the key is not being changed then the records in this subtree can't be removed so the + // subtree is not being cleared. + cleared = false; + debug_printf("%s cleared=%d unchanged=%d\n", context.c_str(), cleared, unchanged); + } + } + + // The subtree cannot be both cleared and unchanged. + ASSERT(!(cleared && unchanged)); + + // If no changes in subtree + if(unchanged) { results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); debug_printf("%s no changes on this subtree, returning %s\n", context.c_str(), toString(results).c_str()); return results; } - // If the range is cleared and there either no sets or the sets aren't relevant to this subtree then delete it - // The last if subexpression is checking that either the next key in the mutation buffer is being changed or - // the upper bound key of this page isn't the same. - if(iMutationBoundary->second.rangeClearVersion.present() - && (iMutationBoundary->second.startKeyMutations.empty() || iMutationBoundary->first < lowerBound->key) - && (!iMutationBoundaryEnd->second.startKeyMutations.empty() || upperBound->key != iMutationBoundaryEnd->first) - ) { + // If subtree is cleared + if(cleared) { debug_printf("%s %s cleared, deleting it, returning %s\n", context.c_str(), isLeaf ? "Page" : "Subtree", toString(results).c_str()); Version clearVersion = self->singleVersion ? self->getLastCommittedVersion() + 1 : iMutationBoundary->second.rangeClearVersion.get(); if(isLeaf) { @@ -3492,7 +3544,6 @@ private: // If replacement pages are written they will be at the minimum version seen in the mutations for this leaf Version minVersion = invalidVersion; - int changes = 0; // Now, process each mutation range and merge changes with existing data. 
bool firstMutationBoundary = true; @@ -3515,11 +3566,13 @@ private: SingleKeyMutationsByVersion::const_iterator iMutationsEnd = iMutationBoundary->second.startKeyMutations.end(); // Iterate over old versions of the mutation boundary key, outputting if necessary + bool boundaryKeyWritten = false; while(cursor.valid() && cursor.get().key == iMutationBoundary->first) { // If not in single version mode or there were no changes to the key if(!self->singleVersion || iMutationBoundary->second.noChanges()) { merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + boundaryKeyWritten = true; } else { ASSERT(self->singleVersion); @@ -3534,16 +3587,26 @@ private: while(iMutations != iMutationsEnd) { const SingleKeyMutation &m = iMutations->second; if(m.isClear() || m.value.size() <= self->m_maxPartSize) { - if(iMutations->first < minVersion || minVersion == invalidVersion) - minVersion = iMutations->first; - ++changes; - merged.push_back(merged.arena(), m.toRecord(iMutationBoundary->first, iMutations->first)); - debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + // If the boundary key was not yet written to the merged list then clears can be skipped. + // Note that in a more complex scenario where there are multiple sibling pages for the same key, with different + // versions and/or part numbers, this is still a valid thing to do. This is because a changing boundary + // key (set or clear) will result in any instances (different versions, split parts) of this key + // on sibling pages to the left of this page to be removed, so an explicit clear need only be stored + // if a record with the mutation boundary key was already written to this page. 
+ if(!boundaryKeyWritten && iMutations->second.isClear()) { + debug_printf("%s Skipped %s [mutation, unnecessary boundary key clear]\n", context.c_str(), m.toRecord(iMutationBoundary->first, iMutations->first).toString().c_str()); + } + else { + merged.push_back(merged.arena(), m.toRecord(iMutationBoundary->first, iMutations->first)); + debug_printf("%s Added non-split %s [mutation, boundary start]\n", context.c_str(), merged.back().toString().c_str()); + if(iMutations->first < minVersion || minVersion == invalidVersion) + minVersion = iMutations->first; + boundaryKeyWritten = true; + } } else { if(iMutations->first < minVersion || minVersion == invalidVersion) minVersion = iMutations->first; - ++changes; int bytesLeft = m.value.size(); int start = 0; RedwoodRecordRef whole(iMutationBoundary->first, iMutations->first, m.value); @@ -3555,6 +3618,7 @@ private: start += partSize; debug_printf("%s Added split %s [mutation, boundary start] bytesLeft %d\n", context.c_str(), merged.back().toString().c_str(), bytesLeft); } + boundaryKeyWritten = true; } ++iMutations; } @@ -3595,7 +3659,6 @@ private: Version clearVersion = clearRangeVersion.get(); if(clearVersion < minVersion || minVersion == invalidVersion) minVersion = clearVersion; - ++changes; merged.push_back(merged.arena(), RedwoodRecordRef(cursor.get().key, clearVersion)); debug_printf("%s Added %s [existing, middle clear]\n", context.c_str(), merged.back().toString().c_str()); } @@ -3608,7 +3671,17 @@ private: } // Write any remaining existing keys, which are not subject to clears as they are beyond the cleared range. + bool upperMutationBoundaryKeyChanged = iMutationBoundaryEnd->second.keyChanged(); while(cursor.valid()) { + // If the upper mutation boundary is being changed and the cursor's key matches it then stop because none of the earlier + // versions or fragments of that key should be written. 
+ if(upperMutationBoundaryKeyChanged && cursor.get().key == iMutationBoundaryEnd->first) { + debug_printf("%s Skipped %s and beyond [existing, matches changed upper mutation boundary]\n", context.c_str(), cursor.get().toString().c_str()); + Version changedVersion = iMutationBoundaryEnd->second.startKeyMutations.begin()->first; + if(changedVersion < minVersion || minVersion == invalidVersion) + minVersion = changedVersion; + break; + } merged.push_back(merged.arena(), cursor.get()); debug_printf("%s Added %s [existing, tail]\n", context.c_str(), merged.back().toString().c_str()); cursor.moveNext(); @@ -3620,7 +3693,6 @@ private: if(minVersion == invalidVersion) { results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(results).c_str()); - ASSERT(changes == 0); return results; } @@ -5642,6 +5714,8 @@ TEST_CASE("!/redwood/correctness/btree") { if(errorCount != 0) throw internal_error(); + wait(btree->destroyAndCheckSanity()); + Future closedFuture = btree->onClosed(); btree->close(); wait(closedFuture); From 6c28da9093bf17e4f325d44b353492dc3de81004 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Tue, 29 Oct 2019 13:26:43 -0700 Subject: [PATCH 115/184] Clean up some memory after network thread exits --- flow/IRandom.h | 1 + flow/Net2.actor.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/flow/IRandom.h b/flow/IRandom.h index 24a2449a4c..cc1dfd7f24 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -90,6 +90,7 @@ namespace std { class IRandom { public: + virtual ~IRandom() = default; virtual double random01() = 0; // return random value in [0, 1] virtual int randomInt(int min, int maxPlusOne) = 0; virtual int64_t randomInt64(int64_t min, int64_t maxPlusOne) = 0; diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 92aef230ea..08dccfbb35 100644 --- 
a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -732,6 +732,10 @@ void Net2::run() { #ifdef WIN32 timeEndPeriod(1); #endif + + // clean up memory + delete this; + thread_network = nullptr; } void Net2::trackMinPriority( TaskPriority minTaskID, double now ) { From 199a34b827b2369db795cb936cf70be90e4661c5 Mon Sep 17 00:00:00 2001 From: Xin Dong Date: Wed, 30 Oct 2019 10:04:19 -0700 Subject: [PATCH 116/184] Defined a minimum read cost (a penalty) for empty read or read size smaller than it. Fixed several review comments. --- fdbserver/Knobs.cpp | 3 ++- fdbserver/Knobs.h | 1 + fdbserver/StorageMetrics.actor.h | 6 +----- fdbserver/storageserver.actor.cpp | 16 ++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index 4db024fed4..6b062e36ed 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -454,7 +454,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( SPLIT_JITTER_AMOUNT, 0.05 ); if( randomize && BUGGIFY ) SPLIT_JITTER_AMOUNT = 0.2; init( IOPS_UNITS_PER_SAMPLE, 10000 * 1000 / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 100 ); init( BANDWIDTH_UNITS_PER_SAMPLE, SHARD_MIN_BYTES_PER_KSEC / STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS / 25 ); - init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); + init( BYTES_READ_UNITS_PER_SAMPLE, 100000 ); // 100K bytes + init( EMPTY_READ_PENALTY, 20 ); // 20 bytes //Storage Server init( STORAGE_LOGGING_DELAY, 5.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index c2f194e4b9..4e0b1895cc 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -392,6 +392,7 @@ public: int64_t IOPS_UNITS_PER_SAMPLE; int64_t BANDWIDTH_UNITS_PER_SAMPLE; int64_t BYTES_READ_UNITS_PER_SAMPLE; + int64_t EMPTY_READ_PENALTY; //Storage Server double STORAGE_LOGGING_DELAY; diff --git a/fdbserver/StorageMetrics.actor.h b/fdbserver/StorageMetrics.actor.h index 02988f3a25..63e7a8f2d4 100644 --- a/fdbserver/StorageMetrics.actor.h +++ 
b/fdbserver/StorageMetrics.actor.h @@ -221,13 +221,9 @@ struct StorageServerMetrics { notifyMetrics.bytesPerKSecond = bandwidthSample.addAndExpire( key, metrics.bytesPerKSecond, expire ) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; if (metrics.iosPerKSecond) notifyMetrics.iosPerKSecond = iopsSample.addAndExpire( key, metrics.iosPerKSecond, expire ) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (metrics.bytesReadPerKSecond) { + if (metrics.bytesReadPerKSecond) notifyMetrics.bytesReadPerKSecond = bytesReadSample.addAndExpire(key, metrics.bytesReadPerKSecond, expire) * SERVER_KNOBS->STORAGE_METRICS_AVERAGE_INTERVAL_PER_KSECONDS; - if (deterministicRandom()->random01() < 0.01) { - TraceEvent("BytesReadSampleCountX100").detail("SampleCount", bytesReadSample.queue.size()); - } - } if (!notifyMetrics.allZero()) { auto& v = waitMetricsMap[key]; for(int i=0; iversionLag; }); specialCounter(cc, "LocalRate", [self]{ return self->currentRate() * 100; }); + specialCounter(cc, "BytesReadSampleCount", [self]() { return self->metrics.bytesReadSample.queue.size(); }); + specialCounter(cc, "FetchKeysFetchActive", [self](){ return self->fetchKeysParallelismLock.activePermits(); }); specialCounter(cc, "FetchKeysWaiting", [self](){ return self->fetchKeysParallelismLock.waiters(); }); @@ -892,8 +894,8 @@ ACTOR Future getValueQ( StorageServer* data, GetValueRequest req ) { StorageMetrics metrics; // If the read yields no value, randomly sample the empty read. metrics.bytesReadPerKSecond = - v.present() ? (int64_t)(req.key.size() + v.get().size()) - : deterministicRandom()->random01() > 0.95 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; + v.present() ? 
std::max((int64_t)(req.key.size() + v.get().size()), SERVER_KNOBS->EMPTY_READ_PENALTY) + : SERVER_KNOBS->EMPTY_READ_PENALTY; data->metrics.notify(req.key, metrics); if( req.debugID.present() ) @@ -1272,7 +1274,7 @@ ACTOR Future readRange( StorageServer* data, Version version, result.more = limit == 0 || *pLimitBytes<=0; // FIXME: Does this have to be exact? result.version = version; StorageMetrics metrics; - metrics.bytesReadPerKSecond = readSize; + metrics.bytesReadPerKSecond = std::max(readSize, SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(limit >= 0 ? range.begin : range.end, metrics); return result; } @@ -1328,15 +1330,13 @@ ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version vers *pOffset = 0; StorageMetrics metrics; - metrics.bytesReadPerKSecond = (int64_t)rep.data[index].key.size(); + metrics.bytesReadPerKSecond = std::max((int64_t)rep.data[index].key.size(), SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(sel.getKey(), metrics); return rep.data[ index ].key; } else { StorageMetrics metrics; - // Randomly sample an empty read - metrics.bytesReadPerKSecond = - deterministicRandom()->random01() > 0.95 ? SERVER_KNOBS->BYTES_READ_UNITS_PER_SAMPLE : 0; + metrics.bytesReadPerKSecond = SERVER_KNOBS->EMPTY_READ_PENALTY; data->metrics.notify(sel.getKey(), metrics); // FIXME: If range.begin=="" && !forward, return success? @@ -1468,7 +1468,7 @@ ACTOR Future getKeyValues( StorageServer* data, GetKeyValuesRequest req ) for (int i = 0; i < r.data.size(); i++) { StorageMetrics m; - m.bytesReadPerKSecond = r.data[i].expectedSize(); + m.bytesReadPerKSecond = std::max((int64_t)r.data[i].expectedSize(), SERVER_KNOBS->EMPTY_READ_PENALTY); data->metrics.notify(r.data[i].key, m); } From f175ed30b3ca9fecaacc187b52e4b76f8e6ec598 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Thu, 31 Oct 2019 09:52:21 -0700 Subject: [PATCH 117/184] Cleanup the fdbbackup cleanup command output. Add cleanup to the usage output printed for fdbbackup. 
--- fdbbackup/backup.actor.cpp | 7 ++++++- fdbclient/BackupAgentBase.actor.cpp | 32 ++++++++++++++++------------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 9e4a109648..b57c26ddfd 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -905,7 +905,7 @@ void printBackupContainerInfo() { static void printBackupUsage(bool devhelp) { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list) [OPTIONS]\n\n", exeBackup.toString().c_str()); + printf("Usage: %s (start | status | abort | wait | discontinue | pause | resume | expire | delete | describe | list | cleanup) [OPTIONS]\n\n", exeBackup.toString().c_str()); printf(" -C CONNFILE The path of a file containing the connection string for the\n" " FoundationDB cluster. The default is first the value of the\n" " FDB_CLUSTER_FILE environment variable, then `./fdb.cluster',\n" @@ -956,6 +956,11 @@ static void printBackupUsage(bool devhelp) { printf(" --trace_format FORMAT\n" " Select the format of the trace files. xml (the default) and json are supported.\n" " Has no effect unless --log is specified.\n"); + printf(" --max_cleanup_seconds SECONDS\n" + " Specifies the amount of time a backup or DR needs to be stale before cleanup will\n" + " remove mutations for it. 
By default this is set to one hour.\n"); + printf(" --delete_data\n" + " This flag will cause cleanup to remove mutations for the most stale backup or DR.\n"); #ifndef TLS_DISABLED printf(TLS_HELP); #endif diff --git a/fdbclient/BackupAgentBase.actor.cpp b/fdbclient/BackupAgentBase.actor.cpp index 5627a1a349..6a02bac4b3 100644 --- a/fdbclient/BackupAgentBase.actor.cpp +++ b/fdbclient/BackupAgentBase.actor.cpp @@ -862,29 +862,33 @@ ACTOR Future cleanupLogMutations(Database cx, Value destUidValue, bool del wait(success(foundDRKey) && success(foundBackupKey)); if(foundDRKey.get().present() && foundBackupKey.get().present()) { - printf("WARNING: Found a tag which looks like both a backup and a DR. This tag was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("WARNING: Found a tag that looks like both a backup and a DR. This tag is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else if(foundDRKey.get().present() && !foundBackupKey.get().present()) { - printf("Found a DR which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("Found a DR that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else if(!foundDRKey.get().present() && foundBackupKey.get().present()) { - printf("Found a Backup which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("Found a Backup that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else { - printf("WARNING: Found a unknown tag which was %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("WARNING: Found an unknown tag that is %.4f hours behind.\n", (readVer - currVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } loggedLogUids.insert(currLogUid); } } - if( readVer - 
minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && deleteData && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get()) ) { - removingLogUid = minVersionLogUid; - wait(eraseLogData(tr, minVersionLogUid, destUidValue)); - wait(tr->commit()); - printf("\nSuccessfully removed the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); - } else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) { - printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n"); - } else if( deleteData ) { - printf("\nWARNING: Did not delete data because the tag was not at least %.4f hours behind. Change `--min_cleanup_seconds' to adjust this threshold.\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0); + if(deleteData) { + if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND && (!removingLogUid.present() || minVersionLogUid == removingLogUid.get())) { + removingLogUid = minVersionLogUid; + wait(eraseLogData(tr, minVersionLogUid, destUidValue)); + wait(tr->commit()); + printf("\nSuccessfully removed the tag that was %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + } else if(removingLogUid.present() && minVersionLogUid != removingLogUid.get()) { + printf("\nWARNING: The oldest tag was possibly removed, run again without `--delete_data' to check.\n\n"); + } else { + printf("\nWARNING: Did not delete data because the tag is not at least %.4f hours behind. 
Change `--min_cleanup_seconds' to adjust this threshold.\n\n", CLIENT_KNOBS->MIN_CLEANUP_SECONDS/3600.0); + } + } else if(readVer - minVersion > CLIENT_KNOBS->MIN_CLEANUP_SECONDS*CLIENT_KNOBS->CORE_VERSIONSPERSECOND) { + printf("\nPassing `--delete_data' would delete the tag that is %.4f hours behind.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } else { - printf("\nPassing `--delete_data' would delete the tag which was %.4f hours behind.\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); + printf("\nPassing `--delete_data' would not delete the tag that is %.4f hours behind. Change `--min_cleanup_seconds' to adjust the cleanup threshold.\n\n", (readVer - minVersion)/(3600.0*CLIENT_KNOBS->CORE_VERSIONSPERSECOND)); } return Void(); From 8f0348d5e02a325b98a4e1d8fdb0bc91f844fa7e Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 31 Oct 2019 16:38:33 -0700 Subject: [PATCH 118/184] fix: merges which cross over systemKeys.begin did not properly decrement the systemSizeEstimate --- documentation/sphinx/source/release-notes.rst | 1 - fdbserver/DataDistributionTracker.actor.cpp | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 6dabb859b9..18027022ff 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -133,7 +133,6 @@ Fixes only impacting 6.2.0+ * The cluster controller would saturate its CPU for a few seconds when sending configuration information to all of the worker processes. [6.2.4] `(PR #2086) `_. * The data distributor would build all possible team combinations if it was tracking an unhealthy server with less than 10 teams. [6.2.4] `(PR #2099) `_. * The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) `_. 
-* The cluster controller could crash if a coordinator was unreachable when compiling cluster status. [6.2.4] `(PR #2065) `_. * A storage server could crash if it took longer than 10 minutes to fetch a key range from another server. [6.2.5] `(PR #2170) `_. * Excluding or including servers would restart the data distributor. [6.2.5] `(PR #2170) `_. * The data distributor could read invalid memory when estimating database size. [6.2.6] `(PR #2225) `_. diff --git a/fdbserver/DataDistributionTracker.actor.cpp b/fdbserver/DataDistributionTracker.actor.cpp index 2a785a2882..90756a2063 100644 --- a/fdbserver/DataDistributionTracker.actor.cpp +++ b/fdbserver/DataDistributionTracker.actor.cpp @@ -402,6 +402,7 @@ Future shardMerger( bool forwardComplete = false; KeyRangeRef merged; StorageMetrics endingStats = shardSize->get().get(); + int64_t systemBytes = keys.begin >= systemKeys.begin ? shardSize->get().get().bytes : 0; loop { Optional newMetrics; @@ -439,6 +440,9 @@ Future shardMerger( merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end ); endingStats += newMetrics.get(); + if((forwardComplete ? 
prevIter->range().begin : nextIter->range().begin) >= systemKeys.begin) { + systemBytes += newMetrics.get().bytes; + } shardsMerged++; auto shardBounds = getShardSizeBounds( merged, maxShardSize ); @@ -457,6 +461,9 @@ Future shardMerger( // If going forward, remove most recently added range endingStats -= newMetrics.get(); + if(nextIter->range().begin >= systemKeys.begin) { + systemBytes -= newMetrics.get().bytes; + } shardsMerged--; --nextIter; merged = KeyRangeRef( prevIter->range().begin, nextIter->range().end ); @@ -473,6 +480,9 @@ Future shardMerger( .detail("EndingSize", endingStats.bytes) .detail("BatchedMerges", shardsMerged); + if(mergeRange.begin < systemKeys.begin) { + self->systemSizeEstimate -= systemBytes; + } restartShardTrackers( self, mergeRange, endingStats ); self->shardsAffectedByTeamFailure->defineShard( mergeRange ); self->output.send( RelocateShard( mergeRange, SERVER_KNOBS->PRIORITY_MERGE_SHARD ) ); From 7f75eca7cbf661e26b63449010ed3c25d8ac4bc6 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Thu, 31 Oct 2019 17:06:58 -0700 Subject: [PATCH 119/184] updated release notes --- documentation/sphinx/source/release-notes.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 18027022ff..98e18f76fc 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -2,6 +2,14 @@ Release Notes ############# +6.2.8 +===== + +Fixes +----- + +* The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) `_. 
+ 6.2.7 ===== From 00b3c8f48a68f59de9116605c180e42ca1c5dbad Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 1 Nov 2019 11:05:31 -0700 Subject: [PATCH 120/184] Revert "Clean up some memory after network thread exits" --- flow/IRandom.h | 1 - flow/Net2.actor.cpp | 4 ---- 2 files changed, 5 deletions(-) diff --git a/flow/IRandom.h b/flow/IRandom.h index cc1dfd7f24..24a2449a4c 100644 --- a/flow/IRandom.h +++ b/flow/IRandom.h @@ -90,7 +90,6 @@ namespace std { class IRandom { public: - virtual ~IRandom() = default; virtual double random01() = 0; // return random value in [0, 1] virtual int randomInt(int min, int maxPlusOne) = 0; virtual int64_t randomInt64(int64_t min, int64_t maxPlusOne) = 0; diff --git a/flow/Net2.actor.cpp b/flow/Net2.actor.cpp index 08dccfbb35..92aef230ea 100644 --- a/flow/Net2.actor.cpp +++ b/flow/Net2.actor.cpp @@ -732,10 +732,6 @@ void Net2::run() { #ifdef WIN32 timeEndPeriod(1); #endif - - // clean up memory - delete this; - thread_network = nullptr; } void Net2::trackMinPriority( TaskPriority minTaskID, double now ) { From 85c315f6848d44fd596d5c580b834f347376c57d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 1 Nov 2019 14:02:44 -0700 Subject: [PATCH 121/184] Fix: parallelPeekMore was not enabled when peeking from log routers --- fdbserver/LogRouter.actor.cpp | 3 +++ fdbserver/OldTLogServer_6_0.actor.cpp | 10 +++++----- fdbserver/TLogServer.actor.cpp | 10 +++++----- fdbserver/TagPartitionedLogSystem.actor.cpp | 12 ++++++------ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 840227759c..53fa69b163 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -339,6 +339,9 @@ ACTOR Future logRouterPeekMessages( LogRouterData* self, TLogPeekRequest r try { peekId = req.sequence.get().first; sequence = req.sequence.get().second; + if (sequence >= SERVER_KNOBS->PARALLEL_GET_MORE_REQUESTS && self->peekTracker.find(peekId) == 
self->peekTracker.end()) { + throw timed_out(); + } auto& trackerData = self->peekTracker[peekId]; if (sequence == 0 && trackerData.sequence_version.find(0) == trackerData.sequence_version.end()) { trackerData.sequence_version[0].send(std::make_pair(req.begin, req.onlySpilled)); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 12a5bd6d94..10626eb241 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -1732,7 +1732,7 @@ void removeLog( TLogData* self, Reference logData ) { } } -ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) { +ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted ) { state Future dbInfoChange = Void(); state Reference r; state Version tagAt = beginVersion; @@ -1746,7 +1746,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st } when( wait( dbInfoChange ) ) { if( logData->logSystem->get() ) { - r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore ); + r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true ); } else { r = Reference(); } @@ -1883,7 +1883,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt if(!logData->isPrimary) { std::vector tags; tags.push_back(logData->remoteTag); - logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true, true) ); + logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? 
logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true) ); } try { @@ -2247,10 +2247,10 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit logData->logRouterPopToVersion = req.recoverAt; std::vector tags; tags.push_back(logData->remoteTag); - wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed); + wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed); } else if(!req.recoverTags.empty()) { ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion); - wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed); + wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed); } pulledRecoveryVersions = true; logData->knownCommittedVersion = req.recoverAt; diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index b5578bedd7..a4c85f6ead 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -2130,7 +2130,7 @@ void removeLog( TLogData* self, Reference logData ) { } } -ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted, bool parallelGetMore ) { +ACTOR Future pullAsyncData( TLogData* self, Reference logData, std::vector tags, Version beginVersion, Optional endVersion, bool poppedIsKnownCommitted ) { state Future dbInfoChange = Void(); state Reference r; state Version tagAt = beginVersion; @@ -2148,7 +2148,7 @@ ACTOR Future pullAsyncData( TLogData* self, Reference logData, st } when( wait( dbInfoChange ) ) { if( logData->logSystem->get() ) { - r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, parallelGetMore ); + r = logData->logSystem->get()->peek( logData->logId, tagAt, endVersion, tags, true ); 
} else { r = Reference(); } @@ -2285,7 +2285,7 @@ ACTOR Future tLogCore( TLogData* self, Reference logData, TLogInt if(!logData->isPrimary) { std::vector tags; tags.push_back(logData->remoteTag); - logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true, true) ); + logData->addActor.send( pullAsyncData(self, logData, tags, pulledRecoveryVersions ? logData->recoveredAt + 1 : logData->unrecoveredBefore, Optional(), true) ); } try { @@ -2678,10 +2678,10 @@ ACTOR Future tLogStart( TLogData* self, InitializeTLogRequest req, Localit logData->logRouterPopToVersion = req.recoverAt; std::vector tags; tags.push_back(logData->remoteTag); - wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true, false) || logData->removed); + wait(pullAsyncData(self, logData, tags, logData->unrecoveredBefore, req.recoverAt, true) || logData->removed); } else if(!req.recoverTags.empty()) { ASSERT(logData->unrecoveredBefore > req.knownCommittedVersion); - wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false, true) || logData->removed); + wait(pullAsyncData(self, logData, req.recoverTags, req.knownCommittedVersion + 1, req.recoverAt, false) || logData->removed); } pulledRecoveryVersions = true; logData->knownCommittedVersion = req.recoverAt; diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 9aa91105e8..35616454d8 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -553,21 +553,21 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(begin >= lastBegin) { 
TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - return Reference( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), false, std::vector(), Reference(), 0 ) ); + return Reference( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), parallelGetMore, std::vector(), Reference(), 0 ) ); } else { std::vector< Reference > cursors; std::vector< LogMessageVersion > epochEnds; TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - cursors.emplace_back(new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), false, std::vector(), Reference(), 0 ) ); + cursors.emplace_back(new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), parallelGetMore, std::vector(), Reference(), 0 ) ); int i = 0; while(begin < lastBegin) { if(i == oldLogData.size()) { TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size()); - return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } int bestOldSet = -1; @@ -584,14 +584,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, 
false ) ); + return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(thisBegin < lastBegin) { TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) .detail("LastBegin", lastBegin).detail("ThisBegin", thisBegin).detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); cursors.emplace_back(new ILogSystem::MergedPeekCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag, - thisBegin, lastBegin, false, std::vector(), Reference(), 0)); + thisBegin, lastBegin, parallelGetMore, std::vector(), Reference(), 0)); epochEnds.emplace_back(lastBegin); lastBegin = thisBegin; } From f4143c4f50efde7ddb49c50196872dea00740187 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Fri, 1 Nov 2019 14:07:01 -0700 Subject: [PATCH 122/184] updated release notes --- documentation/sphinx/source/release-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index 98e18f76fc..a761cd2389 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -8,6 +8,7 @@ Release Notes Fixes ----- +* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. `(PR #2307) `_. * The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) `_. 6.2.7 From 8f84fbc4b981275e435eb81a70b8f8be2b2afa56 Mon Sep 17 00:00:00 2001 From: tclinken Date: Sun, 3 Nov 2019 16:13:32 -0800 Subject: [PATCH 123/184] Only print 'waiting for DD to end...' 
if test actually waits --- fdbserver/tester.actor.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fdbserver/tester.actor.cpp b/fdbserver/tester.actor.cpp index 973b5f10c2..b88bcfc475 100644 --- a/fdbserver/tester.actor.cpp +++ b/fdbserver/tester.actor.cpp @@ -1093,11 +1093,12 @@ ACTOR Future runTests( Reference runTests( Reference Date: Sun, 3 Nov 2019 17:16:21 -0800 Subject: [PATCH 124/184] FastRestore:ApplyToDB:BugFix:Serialize integer as bigEndian to ensure lexico order --- fdbclient/SystemData.cpp | 10 ++++++++- fdbclient/SystemData.h | 1 + fdbserver/RestoreApplier.actor.cpp | 34 ++++++++++++++++++++++++++---- 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 5f1b4b03d7..8db79b42ce 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -643,11 +643,19 @@ const KeyRef restoreApplierTxnValue = LiteralStringRef("1"); // restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once const Key restoreApplierKeyFor(UID const& applierID, Version version) { BinaryWriter wr(Unversioned()); - wr.serializeBytes(restoreWorkersKeys.begin); + wr.serializeBytes(restoreApplierKeys.begin); wr << applierID << version; return wr.toValue(); } +std::pair decodeRestoreApplierKey(ValueRef const& key) { + BinaryReader rd(key, Unversioned()); + UID applierID; + Version version; + rd >> applierID >> version; + return std::make_pair(applierID, version); +} + // Encode restore worker key for workerID const Key restoreWorkerKeyFor(UID const& workerID) { BinaryWriter wr(Unversioned()); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index dd40289902..bc133a6f96 100644 --- a/fdbclient/SystemData.h +++ b/fdbclient/SystemData.h @@ -298,6 +298,7 @@ extern const KeyRangeRef restoreApplierKeys; extern const KeyRef restoreApplierTxnValue; const Key restoreApplierKeyFor(UID const& applierID, Version version); +std::pair 
decodeRestoreApplierKey(ValueRef const& key); const Key restoreWorkerKeyFor(UID const& workerID); const Value restoreWorkerInterfaceValue(RestoreWorkerInterface const& server); RestoreWorkerInterface decodeRestoreWorkerInterfaceValue(ValueRef const& value); diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index ffd1ddf84b..696ac345d0 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -271,6 +271,30 @@ ACTOR Future applyToDB(Reference self, Database cx) { } state Reference tr(new ReadYourWritesTransaction(cx)); + // Sanity check the restoreApplierKeys, which should be empty at this point + loop { + try { + tr->reset(); + tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); + tr->setOption(FDBTransactionOptions::LOCK_AWARE); + Key begin = restoreApplierKeyFor( + self->id(), bigEndian64(0)); // Integer must be BigEndian to maintain ordering in lexical order + Key end = restoreApplierKeyFor(self->id(), bigEndian64(std::numeric_limits::max())); + Standalone txnIds = wait(tr->getRange(KeyRangeRef(begin, end), CLIENT_KNOBS->TOO_MANY)); + if (txnIds.size() > 0) { + TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean").detail("TxnIds", txnIds.size()); + for (auto& kv : txnIds) { + std::pair applierInfo = decodeRestoreApplierKey(kv.key); + TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean") + .detail("Applier", applierInfo.first) + .detail("ResidueTxnID", applierInfo.second); + } + } + break; + } catch (Error& e) { + wait(tr->onError(e)); + } + } loop { // Transaction retry loop try { @@ -279,7 +303,8 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Optional txnSucceeded = wait(tr->get(restoreApplierKeyFor(self->id(), progress.curTxnId))); + Optional txnSucceeded = + wait(tr->get(restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId)))); if 
(!txnSucceeded.present()) { progress.rollback(); continue; @@ -305,7 +330,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { .detail("Version", progress.curItInCurTxn->first); // restoreApplierKeyFor(self->id(), curTxnId) to tell if txn succeeds at an unknown error - tr->set(restoreApplierKeyFor(self->id(), progress.curTxnId), restoreApplierTxnValue); + tr->set(restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId)), restoreApplierTxnValue); while (1) { // Loop: Accumulate mutations in a transaction MutationRef m = progress.getCurrentMutation(); @@ -383,8 +408,9 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), 0), - restoreApplierKeyFor(self->id(), progress.curTxnId + 1))); + // Clear txnIds in [0, progress.curTxnId). We add 100 to curTxnId just to be safe. + tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), bigEndian64(0)), + restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId + 100)))); wait(tr->commit()); break; } catch (Error& e) { From 63359bfc8bc2c94bb415d91e144e3c731c1e5707 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Sun, 3 Nov 2019 17:20:41 -0800 Subject: [PATCH 125/184] FastRestore:handleInitVersionBatchRequest:Ensure exact once execution Also increase the test workload for BackupAndParallelRestoreWithAtomicOp test --- fdbserver/RestoreMaster.actor.h | 11 ++++++++++- fdbserver/RestoreRoleCommon.actor.cpp | 16 +++++++++++----- fdbserver/RestoreRoleCommon.actor.h | 3 +++ ...arallelRestoreCorrectnessAtomicOpTinyData.txt | 8 ++++---- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fdbserver/RestoreMaster.actor.h b/fdbserver/RestoreMaster.actor.h index 1ec8819c37..9d5e28345d 100644 --- a/fdbserver/RestoreMaster.actor.h +++ b/fdbserver/RestoreMaster.actor.h @@ -68,7 +68,8 @@ struct RestoreMasterData : RestoreRoleData, public 
ReferenceCountedbegin(); versionBatch != versionBatches->end(); versionBatch++) { std::sort(versionBatch->second.rangeFiles.begin(), versionBatch->second.rangeFiles.end()); std::sort(versionBatch->second.logFiles.begin(), versionBatch->second.logFiles.end()); for (auto& logFile : versionBatch->second.logFiles) { logFile.fileIndex = ++fileIndex; + TraceEvent("FastRestore") + .detail("VersionBatchId", versionBatchId) + .detail("LogFile", logFile.toString()); } for (auto& rangeFile : versionBatch->second.rangeFiles) { rangeFile.fileIndex = ++fileIndex; + TraceEvent("FastRestore") + .detail("VersionBatchId", versionBatchId) + .detail("RangeFile", rangeFile.toString()); } + versionBatchId++; } TraceEvent("FastRestore").detail("VersionBatches", versionBatches->size()); diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index eb0f8ecc1b..8f378f08d3 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -56,11 +56,17 @@ void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference } void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Reference self) { - self->resetPerVersionBatch(); - TraceEvent("FastRestore") - .detail("InitVersionBatch", req.batchID) - .detail("Role", getRoleStr(self->role)) - .detail("Node", self->id()); + // batchId is continuous. (req.batchID-1) is the id of the just finished batch. 
+ self->versionBatchId.whenAtLeast(req.batchID - 1); + + if (self->versionBatchId.get() == req.batchID - 1) { + self->resetPerVersionBatch(); + TraceEvent("FastRestore") + .detail("InitVersionBatch", req.batchID) + .detail("Role", getRoleStr(self->role)) + .detail("Node", self->id()); + self->versionBatchId.set(req.batchID); + } req.reply.send(RestoreCommonReply(self->id())); } diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 81120d87b7..765e1b46fd 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -32,6 +32,7 @@ #include "flow/Stats.h" #include "fdbclient/FDBTypes.h" #include "fdbclient/CommitTransaction.h" +#include "fdbclient/Notified.h" #include "fdbrpc/fdbrpc.h" #include "fdbrpc/Locality.h" #include "fdbserver/CoordinationInterface.h" @@ -114,6 +115,8 @@ public: std::map appliersInterf; RestoreApplierInterface masterApplierInterf; + NotifiedVersion versionBatchId; // Continuously increase for each versionBatch + bool versionBatchStart = false; uint32_t inProgressFlag = 0; diff --git a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt index 39dc51032e..c61ba6255d 100644 --- a/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt +++ b/tests/slow/ParallelRestoreCorrectnessAtomicOpTinyData.txt @@ -1,10 +1,10 @@ testTitle=BackupAndParallelRestoreWithAtomicOp testName=AtomicOps -; nodeCount=30000 + nodeCount=30000 ; Make ops space only 1 key per group - nodeCount=100 -; transactionsPerSecond=2500.0 - transactionsPerSecond=500.0 +; nodeCount=100 + transactionsPerSecond=2500.0 +; transactionsPerSecond=500.0 ; transactionsPerSecond=100.0 ; nodeCount=4 ; transactionsPerSecond=250.0 From 7cf87e9ae3d28ee65d88921f016ebb7641192362 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Sun, 3 Nov 2019 17:31:54 -0800 Subject: [PATCH 126/184] FastRestore:Add ParallelRestoreCorrectnessCycle.txt test --- 
.../slow/ParallelRestoreCorrectnessCycle.txt | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/slow/ParallelRestoreCorrectnessCycle.txt diff --git a/tests/slow/ParallelRestoreCorrectnessCycle.txt b/tests/slow/ParallelRestoreCorrectnessCycle.txt new file mode 100644 index 0000000000..e6126f3dcc --- /dev/null +++ b/tests/slow/ParallelRestoreCorrectnessCycle.txt @@ -0,0 +1,76 @@ +testTitle=BackupAndRestore + testName=Cycle + ; nodeCount=30000 + nodeCount=1000 + transactionsPerSecond=500.0 + ; transactionsPerSecond=2500.0 + testDuration=30.0 + expectedRate=0 + clearAfterTest=false + ; keyPrefix=! + + ; testName=Cycle + ;; nodeCount=1000 + ; transactionsPerSecond=2500.0 + ; testDuration=30.0 + ; expectedRate=0 + ; clearAfterTest=false + ; keyPrefix=z + ; + ; testName=Cycle + ;; nodeCount=1000 + ; transactionsPerSecond=2500.0 + ; testDuration=30.0 + ; expectedRate=0 + ; clearAfterTest=false + ; keyPrefix=A + ; + ; testName=Cycle + ;; nodeCount=1000 + ; transactionsPerSecond=2500.0 + ; testDuration=30.0 + ; expectedRate=0 + ; clearAfterTest=false + ; keyPrefix=Z + + ; Each testName=RunRestoreWorkerWorkload creates a restore worker + ; We need at least 3 restore workers: master, loader, and applier + testName=RunRestoreWorkerWorkload + + ; Test case for parallel restore + testName=BackupAndParallelRestoreCorrectness + backupAfter=10.0 + restoreAfter=60.0 + clearAfterTest=false + simBackupAgents=BackupToFile + ; backupRangesCount<0 means backup the entire normal keyspace + backupRangesCount=-1 + ; TODO: Support abortAndRestartAfter test by commenting it out + abortAndRestartAfter=0 + + testName=RandomClogging + testDuration=90.0 + + ; testName=Rollback + ; meanDelay=90.0 + ; testDuration=90.0 + + ; Do NOT consider machine crash yet + ; testName=Attrition + ; machinesToKill=10 + ; machinesToLeave=3 + ; reboot=true + ; testDuration=90.0 + + ; testName=Attrition + ; machinesToKill=10 + ; machinesToLeave=3 + ; reboot=true + ; testDuration=90.0 + 
+ ; Disable buggify for parallel restore + buggify=off + ;testDuration=360000 ;not work + ;timeout is in seconds + timeout=360000 + From 27c7ef09a35f065f62cf84ebc2a2387deabebf77 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Sun, 3 Nov 2019 20:20:58 -0800 Subject: [PATCH 127/184] FastRestore:Revise code in self review When we read the txnId from decodeRestoreApplierKey func, we should convert the integer to little endian. --- fdbserver/RestoreApplier.actor.cpp | 2 +- fdbserver/RestoreMaster.actor.h | 3 +-- tests/CMakeLists.txt | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 696ac345d0..a58ad8db73 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -287,7 +287,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { std::pair applierInfo = decodeRestoreApplierKey(kv.key); TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean") .detail("Applier", applierInfo.first) - .detail("ResidueTxnID", applierInfo.second); + .detail("ResidueTxnID", bigEndian64(applierInfo.second)); } } break; diff --git a/fdbserver/RestoreMaster.actor.h b/fdbserver/RestoreMaster.actor.h index 9d5e28345d..7f8822e829 100644 --- a/fdbserver/RestoreMaster.actor.h +++ b/fdbserver/RestoreMaster.actor.h @@ -68,8 +68,7 @@ struct RestoreMasterData : RestoreRoleData, public ReferenceCounted Date: Mon, 4 Nov 2019 03:04:03 -0800 Subject: [PATCH 128/184] COWPager can now internally remap page IDs by version and has been renamed to DWALPager. This causes the B+Tree to no longer have to rewrite all ancestors of an updated page. FIFOQueue now has a read-only cursor and a peekAll() method to read an entire queue without popping it. Fixed some valgrind false positives, made some debug logging improvements. Fixed bug in pager shutdown where it could wait on an ActorCollection containing canceled futures. 
--- fdbserver/IPager.h | 11 +- fdbserver/VersionedBTree.actor.cpp | 439 ++++++++++++++++++++--------- 2 files changed, 317 insertions(+), 133 deletions(-) diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index 25def8487d..dc58461e47 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -29,7 +29,8 @@ #define REDWOOD_DEBUG 0 -#define debug_printf_always(...) { fprintf(stdout, "%s %f (%s:%d) ", g_network->getLocalAddress().toString().c_str(), now(), __FUNCTION__, __LINE__), fprintf(stdout, __VA_ARGS__); fflush(stdout); } +#define debug_printf_stream stderr +#define debug_printf_always(...) { fprintf(debug_printf_stream, "%s %f %04d ", g_network->getLocalAddress().toString().c_str(), now(), __LINE__); fprintf(debug_printf_stream, __VA_ARGS__); fflush(debug_printf_stream); } #define debug_printf_noop(...) @@ -44,8 +45,8 @@ #define debug_printf printf #endif -#define BEACON fprintf(stderr, "%s: %s line %d \n", __FUNCTION__, __FILE__, __LINE__) -#define TRACE fprintf(stderr, "%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); +#define BEACON debug_printf_always("HERE\n") +#define TRACE debug_printf_always("%s: %s line %d %s\n", __FUNCTION__, __FILE__, __LINE__, platform::get_backtrace().c_str()); #ifndef VALGRIND #define VALGRIND_MAKE_MEM_UNDEFINED(x, y) @@ -53,7 +54,7 @@ #endif typedef uint32_t LogicalPageID; // uint64_t? -static const int invalidLogicalPageID = LogicalPageID(-1); +static const LogicalPageID invalidLogicalPageID = std::numeric_limits::max(); class IPage { public: @@ -210,7 +211,7 @@ public: virtual StorageBytes getStorageBytes() = 0; // Count of pages in use by the pager client - virtual int64_t getUserPageCount() = 0; + virtual Future getUserPageCount() = 0; // Future returned is ready when pager has been initialized from disk and is ready for reads and writes. // It is invalid to call most other functions until init() is ready. 
diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8801643bcd..8b2cd3e9d6 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -189,7 +189,8 @@ public: struct Cursor { enum Mode { NONE, - READ, + POP, + READONLY, WRITE }; @@ -213,7 +214,7 @@ public: Cursor() : mode(NONE) { } - // Initialize a cursor. Since cursors can have async operations pending they can't be copied cleanly. + // Initialize a cursor. void init(FIFOQueue *q = nullptr, Mode m = NONE, LogicalPageID initialPageID = invalidLogicalPageID, int readOffset = 0, LogicalPageID endPage = invalidLogicalPageID) { if(operation.isValid()) { operation.cancel(); @@ -225,7 +226,7 @@ public: endPageID = endPage; page.clear(); - if(mode == READ) { + if(mode == POP || mode == READONLY) { // If cursor is not pointed at the end page then start loading it. // The end page will not have been written to disk yet. pageID = initialPageID; @@ -244,8 +245,15 @@ public: } } + // Since cursors can have async operations pending which modify their state they can't be copied cleanly Cursor(const Cursor &other) = delete; + // A read cursor can be initialized from a pop cursor + void initReadOnly(const Cursor &c) { + ASSERT(c.mode == READONLY || c.mode == POP); + init(c.queue, READONLY, c.pageID, c.offset, c.endPageID); + } + ~Cursor() { operation.cancel(); } @@ -254,7 +262,7 @@ public: if(mode == WRITE) { return format("{WriteCursor %s:%p pos=%s:%d endOffset=%d}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? raw()->endOffset : -1); } - if(mode == READ) { + if(mode == POP || mode == READONLY) { return format("{ReadCursor %s:%p pos=%s:%d endOffset=%d endPage=%s}", queue->name.c_str(), this, ::toString(pageID).c_str(), offset, page ? 
raw()->endOffset : -1, ::toString(endPageID).c_str()); } ASSERT(mode == NONE); @@ -295,7 +303,7 @@ public: } Future loadPage() { - ASSERT(mode == READ); + ASSERT(mode == POP | mode == READONLY); debug_printf("FIFOQueue::Cursor(%s) loadPage\n", toString().c_str()); return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; @@ -380,9 +388,9 @@ public: p.send(Void()); } - // Read the next item at the cursor, moving to a new page first if the current page is exhausted + // Read the next item at the cursor (if <= upperBound), moving to a new page first if the current page is exhausted ACTOR static Future> readNext_impl(Cursor *self, Optional upperBound, Future start) { - ASSERT(self->mode == READ); + ASSERT(self->mode == POP || self->mode == READONLY); // Wait for the previous operation to finish state Future previous = self->operation; @@ -414,7 +422,9 @@ public: } self->offset += bytesRead; - --self->queue->numEntries; + if(self->mode == POP) { + --self->queue->numEntries; + } debug_printf("FIFOQueue::Cursor(%s) after read of %s\n", self->toString().c_str(), ::toString(result).c_str()); ASSERT(self->offset <= p->endOffset); @@ -423,21 +433,26 @@ public: LogicalPageID oldPageID = self->pageID; self->pageID = p->nextPageID; self->offset = p->nextOffset; - --self->queue->numPages; + if(self->mode == POP) { + --self->queue->numPages; + } self->page.clear(); - debug_printf("FIFOQueue::Cursor(%s) Page exhausted, moved to new page\n", self->toString().c_str()); + debug_printf("FIFOQueue::Cursor(%s) readNext page exhausted, moved to new page\n", self->toString().c_str()); - // Freeing the old page must happen after advancing the cursor and clearing the page reference because - // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this - // very same queue. - // Queue pages are freed at page 0 because they can be reused after the next commit. 
- self->queue->pager->freePage(oldPageID, 0); + if(self->mode == POP) { + // Freeing the old page must happen after advancing the cursor and clearing the page reference because + // freePage() could cause a push onto a queue that causes a newPageID() call which could pop() from this + // very same queue. + // Queue pages are freed at page 0 because they can be reused after the next commit. + self->queue->pager->freePage(oldPageID, 0); + } } - debug_printf("FIFOQueue(%s) pop(upperBound=%s) -> %s\n", self->queue->name.c_str(), ::toString(upperBound).c_str(), ::toString(result).c_str()); + debug_printf("FIFOQueue(%s) %s(upperBound=%s) -> %s\n", self->queue->name.c_str(), (self->mode == POP ? "pop" : "peek"), ::toString(upperBound).c_str(), ::toString(result).c_str()); return result; } + // Read and move past the next item if is <= upperBound or if upperBound is not present Future> readNext(const Optional &upperBound = {}) { if(mode == NONE) { return Optional(); @@ -463,13 +478,13 @@ public: // Create a new queue at newPageID void create(IPager2 *p, LogicalPageID newPageID, std::string queueName) { - debug_printf("FIFOQueue(%s) create from page id %u\n", queueName.c_str(), newPageID); + debug_printf("FIFOQueue(%s) create from page %s\n", queueName.c_str(), toString(newPageID).c_str()); pager = p; name = queueName; numPages = 1; numEntries = 0; dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); - headReader.init(this, Cursor::READ, newPageID, 0, newPageID); + headReader.init(this, Cursor::POP, newPageID, 0, newPageID); tailWriter.init(this, Cursor::WRITE, newPageID); headWriter.init(this, Cursor::WRITE); newTailPage = invalidLogicalPageID; @@ -484,13 +499,35 @@ public: numPages = qs.numPages; numEntries = qs.numEntries; dataBytesPerPage = pager->getUsablePageSize() - sizeof(typename Cursor::RawPage); - headReader.init(this, Cursor::READ, qs.headPageID, qs.headOffset, qs.tailPageID); + headReader.init(this, Cursor::POP, qs.headPageID, 
qs.headOffset, qs.tailPageID); tailWriter.init(this, Cursor::WRITE, qs.tailPageID); headWriter.init(this, Cursor::WRITE); newTailPage = invalidLogicalPageID; debug_printf("FIFOQueue(%s) recovered\n", queueName.c_str()); } + ACTOR static Future>> peekAll_impl(FIFOQueue *self) { + state Standalone> results; + state Cursor c; + c.initReadOnly(self->headReader); + results.reserve(results.arena(), self->numEntries); + + loop { + Optional x = wait(c.readNext()); + if(!x.present()) { + break; + } + results.push_back(results.arena(), x.get()); + } + + return results; + } + + Future>> peekAll() { + return peekAll_impl(this); + } + + // Pop the next item on the front of the queue if it is <= upperBound or if upperBound is not present Future> pop(Optional upperBound = {}) { return headReader.readNext(upperBound); } @@ -787,13 +824,23 @@ ACTOR template Future forwardError(Future f, Promise target } } -class COWPagerSnapshot; +class DWALPagerSnapshot; -class COWPager : public IPager2 { +// An implementation of IPager2 that supports atomicUpdate() of a page without forcing a change to new page ID. +// It does this by internally mapping the original page ID to alternate page IDs by write version. +// The page id remaps are kept in memory and also logged to a "remap queue" which must be reloaded on cold start. +// To prevent the set of remaps from growing unboundedly, once a remap is old enough to be at or before the +// oldest pager version being maintained the remap can be "undone" by popping it from the remap queue, +// copying the alternate page ID's data over top of the original page ID's data, and deleting the remap from memory. +// This process basically describes a "Delayed" Write-Ahead-Log (DWAL) because the remap queue and the newly allocated +// alternate pages it references basically serve as a write ahead log for pages that will eventually be copied +// back to their original location once the original version is no longer needed. 
+class DWALPager : public IPager2 { public: typedef FastAllocatedPage Page; typedef FIFOQueue LogicalPageQueueT; +#pragma pack(push, 1) struct DelayedFreePage { Version version; LogicalPageID pageID; @@ -803,15 +850,32 @@ public: } std::string toString() const { - return format("{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); + return format("DelayedFreePage{%s @%" PRId64 "}", ::toString(pageID).c_str(), version); } }; - typedef FIFOQueue VersionedLogicalPageQueueT; + struct RemappedPage { + Version version; + LogicalPageID originalPageID; + LogicalPageID newPageID; + + bool operator<(const RemappedPage &rhs) { + return version < rhs.version; + } + + std::string toString() const { + return format("RemappedPage(%s -> %s @%" PRId64 "}", ::toString(originalPageID).c_str(), ::toString(newPageID).c_str(), version); + } + }; + +#pragma pack(pop) + + typedef FIFOQueue DelayedFreePageQueueT; + typedef FIFOQueue RemapQueueT; // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default - COWPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) + DWALPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) : desiredPageSize(desiredPageSize), filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { if(pageCacheBytes == 0) { @@ -838,9 +902,11 @@ public: memcpy(lastCommittedHeaderPage->mutate(), headerPage->begin(), smallestPhysicalBlock); } - ACTOR static Future recover(COWPager *self) { + ACTOR static Future recover(DWALPager *self) { ASSERT(!self->recoverFuture.isValid()); + self->remapUndoFuture = Void(); + int64_t flags = IAsyncFile::OPEN_UNCACHED | IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; state bool exists = fileExists(self->filename); if(!exists) { @@ -859,13 +925,13 @@ public: wait(store(fileSize, self->pageFile->size())); } - debug_printf("COWPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); 
+ debug_printf("DWALPager(%s) recover exists=%d fileSize=%" PRId64 "\n", self->filename.c_str(), exists, fileSize); // TODO: If the file exists but appears to never have been successfully committed is this an error or // should recovery proceed with a new pager instance? // If there are at least 2 pages then try to recover the existing file if(exists && fileSize >= (self->smallestPhysicalBlock * 2)) { - debug_printf("COWPager(%s) recovering using existing file\n"); + debug_printf("DWALPager(%s) recovering using existing file\n"); state bool recoveredHeader = false; @@ -874,7 +940,7 @@ public: // If the checksum fails for the header page, try to recover committed header backup from page 1 if(!self->headerPage.castTo()->verifyChecksum(0)) { - TraceEvent(SevWarn, "COWPagerRecoveringHeader").detail("Filename", self->filename); + TraceEvent(SevWarn, "DWALPagerRecoveringHeader").detail("Filename", self->filename); wait(store(self->headerPage, self->readHeaderPage(self, 1))); @@ -885,7 +951,7 @@ public: } Error e = checksum_failed(); - TraceEvent(SevError, "COWPagerRecoveryFailed") + TraceEvent(SevError, "DWALPagerRecoveryFailed") .detail("Filename", self->filename) .error(e); throw e; @@ -897,7 +963,7 @@ public: self->setPageSize(self->pHeader->pageSize); if(self->logicalPageSize != self->desiredPageSize) { - TraceEvent(SevWarn, "COWPagerPageSizeNotDesired") + TraceEvent(SevWarn, "DWALPagerPageSizeNotDesired") .detail("Filename", self->filename) .detail("ExistingPageSize", self->logicalPageSize) .detail("DesiredPageSize", self->desiredPageSize); @@ -905,6 +971,14 @@ public: self->freeList.recover(self, self->pHeader->freeList, "FreeListRecovered"); self->delayedFreeList.recover(self, self->pHeader->delayedFreeList, "DelayedFreeListRecovered"); + self->remapQueue.recover(self, self->pHeader->remapQueue, "RemapQueueRecovered"); + + Standalone> remaps = wait(self->remapQueue.peekAll()); + for(auto &r : remaps) { + if(r.newPageID != invalidLogicalPageID) { + 
self->remappedPages[r.originalPageID][r.version] = r.newPageID; + } + } // If the header was recovered from the backup at Page 1 then write and sync it to Page 0 before continuing. // If this fails, the backup header is still in tact for the next recovery attempt. @@ -917,7 +991,7 @@ public: // Sync header wait(self->pageFile->sync()); - debug_printf("COWPager(%s) Header recovery complete.\n", self->filename.c_str()); + debug_printf("DWALPager(%s) Header recovery complete.\n", self->filename.c_str()); } // Update the last committed header with the one that was recovered (which is the last known committed header) @@ -929,7 +1003,7 @@ public: // A new pager will be created in its place. // TODO: Is the right behavior? - debug_printf("COWPager(%s) creating new pager\n"); + debug_printf("DWALPager(%s) creating new pager\n"); self->headerPage = self->newPageBuffer(); self->pHeader = (Header *)self->headerPage->begin(); @@ -949,15 +1023,17 @@ public: // Page 1 - header backup self->pHeader->pageCount = 2; - // Create a new free list + // Create queues self->freeList.create(self, self->newLastPageID(), "FreeList"); self->delayedFreeList.create(self, self->newLastPageID(), "delayedFreeList"); + self->remapQueue.create(self, self->newLastPageID(), "remapQueue"); // The first commit() below will flush the queues and update the queue states in the header, // but since the queues will not be used between now and then their states will not change. // In order to populate lastCommittedHeader, update the header now with the queue states. self->pHeader->freeList = self->freeList.getState(); self->pHeader->delayedFreeList = self->delayedFreeList.getState(); + self->pHeader->remapQueue = self->remapQueue.getState(); // Set remaining header bytes to \xff memset(self->headerPage->mutate() + self->pHeader->size(), 0xff, self->headerPage->size() - self->pHeader->size()); @@ -968,7 +1044,7 @@ public: wait(self->commit()); } - debug_printf("COWPager(%s) recovered. 
committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); + debug_printf("DWALPager(%s) recovered. committedVersion=%" PRId64 " logicalPageSize=%d physicalPageSize=%d\n", self->filename.c_str(), self->pHeader->committedVersion, self->logicalPageSize, self->physicalPageSize); return Void(); } @@ -984,11 +1060,11 @@ public: // Get a new, previously available page ID. The page will be considered in-use after the next commit // regardless of whether or not it was written to, until it is returned to the pager via freePage() - ACTOR static Future newPageID_impl(COWPager *self) { + ACTOR static Future newPageID_impl(DWALPager *self) { // First try the free list Optional freePageID = wait(self->freeList.pop()); if(freePageID.present()) { - debug_printf("COWPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); + debug_printf("DWALPager(%s) newPageID() returning %s from free list\n", self->filename.c_str(), toString(freePageID.get()).c_str()); return freePageID.get(); } @@ -996,13 +1072,13 @@ public: ASSERT(!self->snapshots.empty()); Optional delayedFreePageID = wait(self->delayedFreeList.pop(DelayedFreePage{self->effectiveOldestVersion(), 0})); if(delayedFreePageID.present()) { - debug_printf("COWPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); + debug_printf("DWALPager(%s) newPageID() returning %s from delayed free list\n", self->filename.c_str(), toString(delayedFreePageID.get()).c_str()); return delayedFreePageID.get().pageID; } // Lastly, add a new page to the pager LogicalPageID id = self->newLastPageID(); - debug_printf("COWPager(%s) newPageID() returning %s at end of file\n", self->filename.c_str(), toString(id).c_str()); + debug_printf("DWALPager(%s) newPageID() returning %s at end of file\n", 
self->filename.c_str(), toString(id).c_str()); return id; }; @@ -1018,13 +1094,13 @@ public: } Future writeHeaderPage(PhysicalPageID pageID, Reference page) { - debug_printf("COWPager(%s) header op=write %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) header op=write %s\n", filename.c_str(), toString(pageID).c_str()); ((Page *)page.getPtr())->updateChecksum(pageID); return holdWhile(page, pageFile->write(page->begin(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); } Future writePhysicalPage(PhysicalPageID pageID, Reference page) { - debug_printf("COWPager(%s) op=write %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=write %s\n", filename.c_str(), toString(pageID).c_str()); ((Page *)page.getPtr())->updateChecksum(pageID); return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, (int64_t)pageID * physicalPageSize)); } @@ -1032,7 +1108,7 @@ public: void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places // the new content in the cache entry when the write is launched, not when it is completed. 
@@ -1044,46 +1120,57 @@ public: return Void(); }); } + // If the page is being written, wait for this write before issuing the new write + else if(cacheEntry.writing()) { + cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { + writePhysicalPage(pageID, data); + return Void(); + }); + } else { - // If the page is being written, wait for this write before issuing the new write - if(cacheEntry.writing()) { - cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { - writePhysicalPage(pageID, data); - return Void(); - }); - } - else { - cacheEntry.writeFuture = writePhysicalPage(pageID, data); - } + cacheEntry.writeFuture = writePhysicalPage(pageID, data); } - operations.add(forwardError(cacheEntry.writeFuture, errorPromise)); + cacheEntry.writeFuture = forwardError(cacheEntry.writeFuture, errorPromise); + operations.add(cacheEntry.writeFuture); // Always update the page contents immediately regardless of what happened above. cacheEntry.readFuture = data; } Future atomicUpdatePage(LogicalPageID pageID, Reference data, Version v) override { - debug_printf("COWPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); + debug_printf("DWALPager(%s) op=writeAtomic %s @%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v); // This pager does not support atomic update, so it always allocates and uses a new pageID Future f = map(newPageID(), [=](LogicalPageID newPageID) { updatePage(newPageID, data); - freePage(pageID, v); - return newPageID; + // TODO: Possibly limit size of remap queue since it must be recovered on cold start + RemappedPage r{v, pageID, newPageID}; + remapQueue.pushBack(r); + remappedPages[pageID][v] = newPageID; + debug_printf("DWALPager(%s) pushed %s\n", filename.c_str(), RemappedPage(r).toString().c_str()); + return pageID; }); - return forwardError(f, errorPromise); + // No need for forwardError here because newPageID() is already wrapped in forwardError + return f; } void 
freePage(LogicalPageID pageID, Version v) override { + // If pageID has been remapped, then it can't be freed until all existing remaps for that page have been undone, so queue it for later deletion + if(remappedPages.find(pageID) != remappedPages.end()) { + debug_printf("DWALPager(%s) op=freeRemapped %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + remapQueue.pushBack(RemappedPage{v, pageID, invalidLogicalPageID}); + return; + } + // If v is older than the oldest version still readable then mark pageID as free as of the next commit if(v < effectiveOldestVersion()) { - debug_printf("COWPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + debug_printf("DWALPager(%s) op=freeNow %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); freeList.pushBack(pageID); } else { // Otherwise add it to the delayed free list - debug_printf("COWPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); + debug_printf("DWALPager(%s) op=freeLater %s @%" PRId64 " oldestVersion=%" PRId64 "\n", filename.c_str(), toString(pageID).c_str(), v, pLastCommittedHeader->oldestVersion); delayedFreeList.pushBack({v, pageID}); } }; @@ -1091,33 +1178,33 @@ public: // Header pages use a page size of smallestPhysicalBlock // If the user chosen physical page size is larger, then there will be a gap of unused space after // between the end of page 1 and the start of page 2. 
- ACTOR static Future> readHeaderPage(COWPager *self, PhysicalPageID pageID) { + ACTOR static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { if(g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); } state Reference page(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)); int readBytes = wait(self->pageFile->read(page->mutate(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); - debug_printf("COWPager(%s) header op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); + debug_printf("DWALPager(%s) header op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); ASSERT(readBytes == smallestPhysicalBlock); return page; } - ACTOR static Future> readPhysicalPage(COWPager *self, PhysicalPageID pageID) { + ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID) { if(g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); } state Reference page = self->newPageBuffer(); - debug_printf("COWPager(%s) op=read_physical_start %s\n", self->filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=read_physical_start %s\n", self->filename.c_str(), toString(pageID).c_str()); int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); - debug_printf("COWPager(%s) op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); + debug_printf("DWALPager(%s) op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); ASSERT(readBytes == self->physicalPageSize); Page *p = (Page *)page.getPtr(); if(!p->verifyChecksum(pageID)) { - debug_printf("COWPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) checksum failed for %s\n", 
self->filename.c_str(), toString(pageID).c_str()); Error e = checksum_failed(); - TraceEvent(SevError, "COWPagerChecksumFailed") + TraceEvent(SevError, "DWALPagerChecksumFailed") .detail("Filename", self->filename.c_str()) .detail("PageID", pageID) .detail("PageSize", self->physicalPageSize) @@ -1135,24 +1222,45 @@ public: // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache if(!cacheable) { - debug_printf("COWPager(%s) op=read_nocache %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=read_nocache %s\n", filename.c_str(), toString(pageID).c_str()); PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); if(pCacheEntry != nullptr) { + debug_printf("DWALPager(%s) op=read_nocache_hit %s\n", filename.c_str(), toString(pageID).c_str()); return pCacheEntry->readFuture; } + debug_printf("DWALPager(%s) op=read_nocache_miss %s\n", filename.c_str(), toString(pageID).c_str()); return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("COWPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); if(!cacheEntry.readFuture.isValid()) { - debug_printf("COWPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); } - return forwardError(cacheEntry.readFuture, errorPromise); + cacheEntry.readFuture = forwardError(cacheEntry.readFuture, errorPromise); + 
return cacheEntry.readFuture; + } + + Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable) { + auto i = remappedPages.find(pageID); + + if(i != remappedPages.end()) { + auto j = i->second.upper_bound(v); + if(j != i->second.begin()) { + --j; + debug_printf("DWALPager(%s) read %s @%" PRId64 " -> %s\n", filename.c_str(), toString(pageID).c_str(), v, toString(j->second).c_str()); + pageID = j->second; + } + } + else { + debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), toString(pageID).c_str(), v); + } + + return readPage(pageID, cacheable); } // Get snapshot as of the most recent committed version of the pager @@ -1178,12 +1286,69 @@ public: return std::min(pLastCommittedHeader->oldestVersion, snapshots.front().version); } - ACTOR static Future commit_impl(COWPager *self) { - debug_printf("COWPager(%s) commit begin\n", self->filename.c_str()); + ACTOR static Future undoRemaps(DWALPager *self) { + state RemappedPage cutoff; + cutoff.version = self->effectiveOldestVersion(); + + // TODO: Use parallel reads + // TODO: One run of this actor might write to the same original page more than once, in which case just unmap the latest + loop { + if(self->remapUndoStop) { + break; + } + state Optional p = wait(self->remapQueue.pop(cutoff)); + if(!p.present()) { + break; + } + debug_printf("DWALPager(%s) undoRemaps popped %s\n", self->filename.c_str(), p.get().toString().c_str()); + + if(p.get().newPageID == invalidLogicalPageID) { + debug_printf("DWALPager(%s) undoRemaps freeing %s\n", self->filename.c_str(), p.get().toString().c_str()); + self->freePage(p.get().originalPageID, p.get().version); + } + else { + // Read the data from the page that the original was mapped to + Reference data = wait(self->readPage(p.get().newPageID, false)); + + // Some page reads will mark the unused portion of the page as undefined to catch bugs with valgrind. 
+ // We are blindly copying the page data to a new location regardless of its format so mark all of it defined. + VALGRIND_MAKE_MEM_DEFINED(data->begin(), data->size()); + + // Write the data to the original page so it can be read using its original pageID + self->updatePage(p.get().originalPageID, data); + + // Remove the remap from this page, deleting the entry for the pageID if its map becomes empty + auto i = self->remappedPages.find(p.get().originalPageID); + if(i->second.size() == 1) { + self->remappedPages.erase(i); + } + else { + i->second.erase(p.get().version); + } + + // Now that the remap has been undone nothing will read this page so it can be freed as of the next commit. + self->freePage(p.get().newPageID, 0); + } + } + + debug_printf("DWALPager(%s) undoRemaps stopped, remapQueue size is %d\n", self->filename.c_str(), self->remapQueue.numEntries); + return Void(); + } + + ACTOR static Future commit_impl(DWALPager *self) { + debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); // Write old committed header to Page 1 self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); + // Trigger the remap eraser to stop and then wait for it. 
+ self->remapUndoStop = true; + wait(self->remapUndoFuture); + + // Flush remap queue separately, it's not involved in free page management + wait(self->remapQueue.flush()); + self->pHeader->remapQueue = self->remapQueue.getState(); + // Flush the free list and delayed free list queues together as they are used by freePage() and newPageID() loop { state bool freeBusy = wait(self->freeList.preFlush()); @@ -1203,16 +1368,16 @@ public: self->pHeader->delayedFreeList = self->delayedFreeList.getState(); // Wait for all outstanding writes to complete - debug_printf("COWPager(%s) waiting for outstanding writes\n", self->filename.c_str()); + debug_printf("DWALPager(%s) waiting for outstanding writes\n", self->filename.c_str()); wait(self->operations.signalAndCollapse()); - debug_printf("COWPager(%s) Syncing\n", self->filename.c_str()); + debug_printf("DWALPager(%s) Syncing\n", self->filename.c_str()); // Sync everything except the header if(g_network->getCurrentTask() > TaskPriority::DiskWrite) { wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("COWPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 1\n", self->filename.c_str(), self->pHeader->committedVersion); // Update header on disk and sync again. wait(self->writeHeaderPage(0, self->headerPage)); @@ -1220,7 +1385,7 @@ public: wait(delay(0, TaskPriority::DiskWrite)); } wait(self->pageFile->sync()); - debug_printf("COWPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); + debug_printf("DWALPager(%s) commit version %" PRId64 " sync 2\n", self->filename.c_str(), self->pHeader->committedVersion); // Update the last committed header for use in the next commit. 
self->updateCommittedHeader(); @@ -1229,6 +1394,11 @@ public: // Try to expire snapshots up to the oldest version, in case some were being kept around due to being in use, // because maybe some are no longer in use. self->expireSnapshots(self->pHeader->oldestVersion); + + // Start unmapping pages for expired versions + self->remapUndoStop = false; + self->remapUndoFuture = undoRemaps(self); + return Void(); } @@ -1252,20 +1422,21 @@ public: pHeader->setMetaKey(metaKey); } - ACTOR void shutdown(COWPager *self, bool dispose) { + ACTOR void shutdown(DWALPager *self, bool dispose) { self->recoverFuture.cancel(); self->commitFuture.cancel(); + self->remapUndoFuture.cancel(); - if(self->errorPromise.canBeSet()) + if(self->errorPromise.canBeSet()) { self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress + } + self->operations.clear(); // Destroy the cache, cancelling reads and writes in progress self->pageCache.destroy(); - wait(ready(self->operations.signal())); - + // Unreference the file and clear self->pageFile.clear(); - if(dispose) { wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true)); } @@ -1306,10 +1477,13 @@ public: } // Get the number of pages in use but not by the pager itself. 
- int64_t getUserPageCount() override { - int userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries; - debug_printf("COWPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 "\n", filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries); - return userPages; + Future getUserPageCount() override { + return map(remapUndoFuture, [=](Void) { + int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; + debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", + filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, remapQueue.numEntries); + return userPages; + }); } Future init() override { @@ -1321,7 +1495,7 @@ public: } private: - ~COWPager() {} + ~DWALPager() {} // Try to expire snapshots up to but not including v, but do not expire any snapshots that are in use. 
void expireSnapshots(Version v); @@ -1335,6 +1509,7 @@ private: int64_t pageCount; FIFOQueue::QueueState freeList; FIFOQueue::QueueState delayedFreeList; + FIFOQueue::QueueState remapQueue; Version committedVersion; Version oldestVersion; int32_t metaKeySize; @@ -1410,18 +1585,23 @@ private: Future commitFuture; SignalableActorCollection operations; Future recoverFuture; + Future remapUndoFuture; + bool remapUndoStop; Reference pageFile; LogicalPageQueueT freeList; + // The delayed free list will be approximately in Version order. // TODO: Make this an ordered container some day. - VersionedLogicalPageQueueT delayedFreeList; + DelayedFreePageQueueT delayedFreeList; + + RemapQueueT remapQueue; struct SnapshotEntry { Version version; Promise expired; - Reference snapshot; + Reference snapshot; }; struct SnapshotEntryLessThanVersion { @@ -1434,22 +1614,25 @@ private: } }; + // TODO: Better data structure + std::unordered_map> remappedPages; + std::deque snapshots; }; // Prevents pager from reusing freed pages from version until the snapshot is destroyed -class COWPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { +class DWALPagerSnapshot : public IPagerSnapshot, public ReferenceCounted { public: - COWPagerSnapshot(COWPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { + DWALPagerSnapshot(DWALPager *pager, Key meta, Version version, Future expiredFuture) : pager(pager), metaKey(meta), version(version), expired(expiredFuture) { } - virtual ~COWPagerSnapshot() { + virtual ~DWALPagerSnapshot() { } Future> getPhysicalPage(LogicalPageID pageID, bool cacheable) override { if(expired.isError()) { throw expired.getError(); } - return map(pager->readPage(pageID, cacheable), [=](Reference p) { + return map(pager->readPageAtVersion(pageID, version, cacheable), [=](Reference p) { return Reference(p); }); } @@ -1463,23 +1646,23 @@ public: } void addref() override { - 
ReferenceCounted::addref(); + ReferenceCounted::addref(); } void delref() override { - ReferenceCounted::delref(); + ReferenceCounted::delref(); } - COWPager *pager; + DWALPager *pager; Future expired; Version version; Key metaKey; }; -void COWPager::expireSnapshots(Version v) { - debug_printf("COWPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); +void DWALPager::expireSnapshots(Version v) { + debug_printf("DWALPager(%s) expiring snapshots through %" PRId64 " snapshot count %d\n", filename.c_str(), v, (int)snapshots.size()); while(snapshots.size() > 1 && snapshots.front().version < v && snapshots.front().snapshot->isSoleOwner()) { - debug_printf("COWPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); + debug_printf("DWALPager(%s) expiring snapshot for %" PRId64 " soleOwner=%d\n", filename.c_str(), snapshots.front().version, snapshots.front().snapshot->isSoleOwner()); // The snapshot contract could be made such that the expired promise isn't need anymore. In practice it // probably is already not needed but it will gracefully handle the case where a user begins a page read // with a snapshot reference, keeps the page read future, and drops the snapshot reference. 
@@ -1488,7 +1671,7 @@ void COWPager::expireSnapshots(Version v) { } } -Reference COWPager::getReadSnapshot(Version v) { +Reference DWALPager::getReadSnapshot(Version v) { ASSERT(!snapshots.empty()); auto i = std::upper_bound(snapshots.begin(), snapshots.end(), v, SnapshotEntryLessThanVersion()); @@ -1499,12 +1682,12 @@ Reference COWPager::getReadSnapshot(Version v) { return i->snapshot; } -void COWPager::addLatestSnapshot() { +void DWALPager::addLatestSnapshot() { Promise expired; snapshots.push_back({ pLastCommittedHeader->committedVersion, expired, - Reference(new COWPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) + Reference(new DWALPagerSnapshot(this, pLastCommittedHeader->getMetaKey(), pLastCommittedHeader->committedVersion, expired.getFuture())) }); } @@ -2573,11 +2756,10 @@ public: m_latestCommit = m_init; } - ACTOR static Future incrementalLazyDelete(VersionedBTree *self, bool *pStop = nullptr, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { + ACTOR static Future incrementalSubtreeClear(VersionedBTree *self, bool *pStop = nullptr, unsigned int minPages = 0, int maxPages = std::numeric_limits::max()) { // TODO: Is it contractually okay to always to read at the latest version? 
state Reference snapshot = self->m_pager->getReadSnapshot(self->m_pager->getLatestVersion()); state int freedPages = 0; - loop { // take a page from front of queue state Optional q = wait(self->m_lazyDeleteQueue.pop()); @@ -2736,12 +2918,13 @@ public: ACTOR static Future destroyAndCheckSanity_impl(VersionedBTree *self) { ASSERT(g_network->isSimulated()); + debug_printf("Clearing tree.\n"); self->setWriteVersion(self->getLatestVersion() + 1); self->clear(KeyRangeRef(dbBegin.key, dbEnd.key)); loop { - int freedPages = wait(self->incrementalLazyDelete(self)); - debug_printf("incrementalLazyDelete freed %d\n", freedPages); + int freedPages = wait(self->incrementalSubtreeClear(self)); + debug_printf("incrementalSubtreeClear freed %d\n", freedPages); wait(self->commit()); if(self->m_lazyDeleteQueue.numEntries == 0) { break; @@ -2749,6 +2932,12 @@ public: self->setWriteVersion(self->getLatestVersion() + 1); } + // Forget all but the latest version of the tree. + debug_printf("Discarding all old versions.\n"); + self->setOldestVersion(self->getLastCommittedVersion()); + self->setWriteVersion(self->getLatestVersion() + 1); + wait(self->commit()); + // The lazy delete queue should now be empty and contain only the new page to start writing to // on the next commit. LazyDeleteQueueT::QueueState s = self->m_lazyDeleteQueue.getState(); @@ -2761,7 +2950,8 @@ public: // From the pager's perspective the only pages that should be in use are the btree root and // the previously mentioned lazy delete queue page. - ASSERT(self->m_pager->getUserPageCount() == 2); + int64_t userPageCount = wait(self->m_pager->getUserPageCount()); + ASSERT(userPageCount == 2); return Void(); } @@ -3217,7 +3407,7 @@ private: rptr += blockSize; pages.push_back(std::move(page)); } - delete (uint8_t *)btPage; + delete [] (uint8_t *)btPage; } // Write this btree page, which is made of 1 or more pager pages. 
@@ -3310,7 +3500,7 @@ private: } virtual ~SuperPage() { - delete m_data; + delete [] m_data; } virtual void addref() const { @@ -3683,14 +3873,15 @@ private: cursor.moveNext(); } - debug_printf("%s Done merging mutations into existing leaf contents, made %d changes\n", context.c_str(), changes); - // No changes were actually made. This could happen if the only mutations are clear ranges which do not match any records. if(minVersion == invalidVersion) { results.push_back_deep(results.arena(), VersionAndChildrenRef(0, VectorRef((RedwoodRecordRef *)decodeLowerBound, 1), *decodeUpperBound)); debug_printf("%s No changes were made during mutation merge, returning %s\n", context.c_str(), toString(results).c_str()); return results; } + else { + debug_printf("%s Changes were made, writing.\n", context.c_str()); + } // TODO: Make version and key splits based on contents of merged list, if keeping history @@ -3868,7 +4059,7 @@ private: debug_printf("%s: Beginning commit of version %" PRId64 ", new oldest version set to %" PRId64 "\n", self->m_name.c_str(), writeVersion, self->m_newOldestVersion); state bool lazyDeleteStop = false; - state Future lazyDelete = incrementalLazyDelete(self, &lazyDeleteStop); + state Future lazyDelete = incrementalSubtreeClear(self, &lazyDeleteStop); // Get the latest version from the pager, which is what we will read at state Version latestVersion = self->m_pager->getLatestVersion(); @@ -4462,23 +4653,11 @@ RedwoodRecordRef VersionedBTree::dbBegin(StringRef(), 0); RedwoodRecordRef VersionedBTree::dbEnd(LiteralStringRef("\xff\xff\xff\xff\xff")); VersionedBTree::Counts VersionedBTree::counts; -ACTOR template -Future catchError(Promise error, Future f) { - try { - T result = wait(f); - return result; - } catch(Error &e) { - if(e.code() != error_code_actor_cancelled && error.canBeSet()) - error.sendError(e); - throw; - } -} - class KeyValueStoreRedwoodUnversioned : public IKeyValueStore { public: KeyValueStoreRedwoodUnversioned(std::string filePrefix, 
UID logID) : m_filePrefix(filePrefix) { // TODO: This constructor should really just take an IVersionedStore - IPager2 *pager = new COWPager(4096, filePrefix, 0); + IPager2 *pager = new DWALPager(4096, filePrefix, 0); m_tree = new VersionedBTree(pager, filePrefix, true); m_init = catchError(init_impl(this)); } @@ -4639,7 +4818,7 @@ private: Promise m_error; template inline Future catchError(Future f) { - return ::catchError(m_error, f); + return forwardError(f, m_error); } }; @@ -5484,7 +5663,7 @@ TEST_CASE("!/redwood/correctness/btree") { printf("Initializing...\n"); state double startTime = timer(); - pager = new COWPager(pageSize, pagerFile, 0); + pager = new DWALPager(pageSize, pagerFile, 0); state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -5677,7 +5856,7 @@ TEST_CASE("!/redwood/correctness/btree") { wait(closedFuture); printf("Reopening btree from disk.\n"); - IPager2 *pager = new COWPager(pageSize, pagerFile, 0); + IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -5703,6 +5882,7 @@ TEST_CASE("!/redwood/correctness/btree") { debug_printf("Waiting for outstanding commit\n"); wait(commit); committedVersions.sendError(end_of_stream()); + randomTask.cancel(); debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); @@ -5714,6 +5894,7 @@ TEST_CASE("!/redwood/correctness/btree") { Future closedFuture = btree->onClosed(); btree->close(); + debug_printf("Closing.\n"); wait(closedFuture); return Void(); @@ -5742,7 +5923,7 @@ TEST_CASE("!/redwood/correctness/pager/cow") { deleteFile(pagerFile); int pageSize = 4096; - state IPager2 *pager = new COWPager(pageSize, pagerFile, 0); + state IPager2 *pager = new DWALPager(pageSize, pagerFile, 0); wait(success(pager->init())); state LogicalPageID id = wait(pager->newPageID()); @@ -5769,7 +5950,7 @@ TEST_CASE("!/redwood/performance/set") { deleteFile(pagerFile); 
int pageSize = 4096; - IPager2 *pager = new COWPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + IPager2 *pager = new DWALPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); state bool singleVersion = true; state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -5822,6 +6003,7 @@ TEST_CASE("!/redwood/performance/set") { } if(kvBytes >= commitTarget) { + btree->setOldestVersion(btree->getLastCommittedVersion()); wait(commit); printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); @@ -5849,6 +6031,7 @@ TEST_CASE("!/redwood/performance/set") { wait(commit); printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + printf("Starting random seeks\n"); state int reads = 30000; wait(randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar)); From e345c9061f666549586d5dee979d509ad8cc7a27 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 4 Nov 2019 11:47:29 -0800 Subject: [PATCH 129/184] FastRestore:Refine debug messages --- fdbclient/BackupContainer.h | 2 +- fdbclient/RestoreWorkerInterface.actor.h | 2 +- fdbserver/RestoreApplier.actor.cpp | 1 + fdbserver/RestoreCommon.actor.h | 2 +- fdbserver/RestoreLoader.actor.cpp | 7 +++++++ fdbserver/RestoreMaster.actor.cpp | 2 ++ 6 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fdbclient/BackupContainer.h b/fdbclient/BackupContainer.h index 5671788c9a..b14ce7e37c 100644 --- a/fdbclient/BackupContainer.h +++ b/fdbclient/BackupContainer.h @@ -173,7 +173,7 @@ struct RestorableFileSet { Version targetVersion; std::vector logs; std::vector ranges; - KeyspaceSnapshotFile snapshot; + KeyspaceSnapshotFile snapshot; // Info. 
for debug purposes }; /* IBackupContainer is an interface to a set of backup data, which contains diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index d5155c3168..e2f7637eb5 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -360,7 +360,7 @@ struct RestoreSendMutationVectorVersionedRequest : TimedRequest { std::string toString() { std::stringstream ss; - ss << "fileIndex" << fileIndex << "prevVersion:" << prevVersion << " version:" << version + ss << "fileIndex" << fileIndex << " prevVersion:" << prevVersion << " version:" << version << " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size(); return ss.str(); } diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index ffd1ddf84b..f8bfa410e5 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -123,6 +123,7 @@ ACTOR static Future handleSendMutationVectorRequest(RestoreSendMutationVec .detail("Index", mIndex) .detail("MutationReceived", mutation.toString()); self->kvOps[commitVersion].push_back_deep(self->kvOps[commitVersion].arena(), mutation); + // TODO: What if log file's mutations are delivered out-of-order (behind) the range file's mutations?! 
} curFilePos.set(req.version); } diff --git a/fdbserver/RestoreCommon.actor.h b/fdbserver/RestoreCommon.actor.h index daa8f3dea2..421ebcc929 100644 --- a/fdbserver/RestoreCommon.actor.h +++ b/fdbserver/RestoreCommon.actor.h @@ -236,7 +236,7 @@ struct RestoreFileFR { ss << "version:" << std::to_string(version) << " fileName:" << fileName << " isRange:" << std::to_string(isRange) << " blockSize:" << std::to_string(blockSize) << " fileSize:" << std::to_string(fileSize) << " endVersion:" << std::to_string(endVersion) - << std::to_string(beginVersion) << " cursor:" << std::to_string(cursor) + << " beginVersion:" << std::to_string(beginVersion) << " cursor:" << std::to_string(cursor) << " fileIndex:" << std::to_string(fileIndex); return ss.str(); } diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 4263cad3d4..e2369b8da5 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -127,9 +127,14 @@ void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self) { + TraceEvent("FastRestore") + .detail("Loader", self->id()) + .detail("SetApplierKeyRangeVector", req.rangeToApplier.size()); // Idempodent operation. 
OK to re-execute the duplicate cmd if (self->rangeToApplier.empty()) { self->rangeToApplier = req.rangeToApplier; + } else { + ASSERT_WE_THINK(self->rangeToApplier == req.rangeToApplier); } req.reply.send(RestoreCommonReply(self->id())); } @@ -185,6 +190,8 @@ ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Referenceid()).detail("ProcessLoadParam", req.param.toString()); self->processedFileParams[req.param] = Never(); self->processedFileParams[req.param] = _processLoadingParam(req.param, self); + } else { + TraceEvent("FastRestore").detail("Loader", self->id()).detail("WaitOnProcessLoadParam", req.param.toString()); } ASSERT(self->processedFileParams.find(req.param) != self->processedFileParams.end()); wait(self->processedFileParams[req.param]); // wait on the processing of the req.param. diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index 16fd3e4182..47c8469b09 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -412,11 +412,13 @@ ACTOR static Future collectBackupFiles(Reference bc, std for (const RangeFile& f : restorable.get().ranges) { TraceEvent("FastRestore").detail("RangeFile", f.toString()); RestoreFileFR file(f.version, f.fileName, true, f.blockSize, f.fileSize, f.version, f.version); + TraceEvent("FastRestore").detail("RangeFileFR", file.toString()); files->push_back(file); } for (const LogFile& f : restorable.get().logs) { TraceEvent("FastRestore").detail("LogFile", f.toString()); RestoreFileFR file(f.beginVersion, f.fileName, false, f.blockSize, f.fileSize, f.endVersion, f.beginVersion); + TraceEvent("FastRestore").detail("LogFileFR", file.toString()); files->push_back(file); } From 0c95fef8aa9c53e158a35f7ba7d99facdd464dc6 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 4 Nov 2019 11:12:26 -0800 Subject: [PATCH 130/184] Bug fix in tree clear and size check where sometimes there could still be old versions of pages in use because not enough commits have 
passed for them to be rolled off and freed. --- fdbserver/VersionedBTree.actor.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 8b2cd3e9d6..daf97d46b4 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -2812,6 +2812,7 @@ public: } } + debug_printf("LazyDelete: freed %d pages, %s has %" PRId64 " entries\n", freedPages, self->m_lazyDeleteQueue.name.c_str(), self->m_lazyDeleteQueue.numEntries); return freedPages; } @@ -2923,10 +2924,10 @@ public: self->clear(KeyRangeRef(dbBegin.key, dbEnd.key)); loop { - int freedPages = wait(self->incrementalSubtreeClear(self)); - debug_printf("incrementalSubtreeClear freed %d\n", freedPages); + state int freedPages = wait(self->incrementalSubtreeClear(self)); wait(self->commit()); - if(self->m_lazyDeleteQueue.numEntries == 0) { + // Keep looping until the last commit doesn't do anything at all + if(self->m_lazyDeleteQueue.numEntries == 0 && freedPages == 0) { break; } self->setWriteVersion(self->getLatestVersion() + 1); From 96989e0fb68c3726118c8f5f1a7f4c84eb2e37a6 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 4 Nov 2019 14:18:39 -0800 Subject: [PATCH 131/184] AtomicOps test:Add sanity check for log and ops keys Provide more information about which opsKey is missing when log and ops results are inconsistent for Add operation. 
--- fdbserver/workloads/AtomicOps.actor.cpp | 159 +++++++++++++++++++++--- 1 file changed, 144 insertions(+), 15 deletions(-) diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index d090d71249..1f2f0c9fd2 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -33,6 +33,7 @@ struct AtomicOpsWorkload : TestWorkload { double testDuration, transactionsPerSecond; vector> clients; + uint64_t lbsum, ubsum; // Tell if setup txn fails when opType = AddValue AtomicOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), opNum(0) @@ -47,7 +48,10 @@ struct AtomicOpsWorkload : TestWorkload { apiVersion500 = ((sharedRandomNumber % 10) == 0); TraceEvent("AtomicOpsApiVersion500").detail("ApiVersion500", apiVersion500); - int64_t randNum = sharedRandomNumber / 10; + lbsum = 0; + ubsum = 0; + + int64_t randNum = sharedRandomNumber / 10; if(opType == -1) opType = randNum % 8; @@ -119,7 +123,13 @@ struct AtomicOpsWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } - Key logKey( int group ) { return StringRef(format("log%08x%08x%08x",group,clientId,opNum++));} + // Key logKey( int group ) { return StringRef(format("log%08x%08x%08x",group,clientId,opNum++));} + std::pair logDebugKey(int group) { + Key logKey(format("log%08x%08x%08x", group, clientId, opNum)); + Key debugKey(format("debug%08x%08x%08x", group, clientId, opNum)); + opNum++; + return std::make_pair(logKey, debugKey); + } ACTOR Future _setup( Database cx, AtomicOpsWorkload* self ) { // Sanity check if log keyspace has elements @@ -172,29 +182,138 @@ struct AtomicOpsWorkload : TestWorkload { loop { try { int group = deterministicRandom()->randomInt(0,100); - uint64_t intValue = deterministicRandom()->randomInt( 0, 10000000 ); + state uint64_t intValue = deterministicRandom()->randomInt(0, 10000000); Key val = StringRef((const uint8_t*) &intValue, sizeof(intValue)); - tr.set(self->logKey(group), val); + std::pair 
logDebugKey = self->logDebugKey(group); int nodeIndex = deterministicRandom()->randomInt(0, self->nodeCount / 100); - tr.atomicOp(StringRef(format("ops%08x%08x", group, nodeIndex)), val, self->opType); - // TraceEvent(SevDebug, "AtomicOpWorker") - // .detail("LogKey", self->logKey(group)) - // .detail("Value", val) - // .detail("ValueInt", intValue); - // TraceEvent(SevDebug, "AtomicOpWorker") - // .detail("OpKey", format("ops%08x%08x", group, nodeIndex)) - // .detail("Value", val) - // .detail("ValueInt", intValue) - // .detail("AtomicOp", self->opType); + Key opsKey(format("ops%08x%08x", group, nodeIndex)); + tr.set(logDebugKey.first, val); // set log key + tr.set(logDebugKey.second, opsKey); // set debug key; one opsKey can have multiple logs key + tr.atomicOp(opsKey, val, self->opType); wait( tr.commit() ); + if (self->opType == MutationRef::AddValue) { + self->lbsum += intValue; + self->ubsum += intValue; + } break; } catch( Error &e ) { wait( tr.onError(e) ); + if (self->opType == MutationRef::AddValue) { + self->ubsum += intValue; + } } } } } + ACTOR Future dumpLogKV(Database cx, int g) { + ReadYourWritesTransaction tr(cx); + Key begin(format("log%08x", g)); + Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + uint64_t sum = 0; + for (auto& kv : log) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpLog") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntValue", intValue) + .detail("CurSum", sum); + } + return Void(); + } + + ACTOR Future dumpDebugKV(Database cx, int g) { + ReadYourWritesTransaction tr(cx); + Key begin(format("debug%08x", g)); + Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + for (auto& kv : log) { + TraceEvent("AtomicOpDebug").detail("Key", kv.key).detail("Val", kv.value); + } + return Void(); + } + + ACTOR Future dumpOpsKV(Database cx, int g) { + 
ReadYourWritesTransaction tr(cx); + Key begin(format("ops%08x", g)); + Standalone ops = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + uint64_t sum = 0; + for (auto& kv : ops) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpOps") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntVal", intValue) + .detail("CurSum", sum); + } + return Void(); + } + + ACTOR Future validateOpsKey(Database cx, AtomicOpsWorkload* self, int g) { + // Get mapping between opsKeys and debugKeys + state ReadYourWritesTransaction tr1(cx); + state std::map records; // + Key begin(format("debug%08x", g)); + Standalone debuglog = + wait(tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + for (auto& kv : debuglog) { + records[kv.value] = kv.key; + } + + // Get log key's value and assign it to the associated debugKey + state ReadYourWritesTransaction tr2(cx); + state std::map logVal; // debugKey, log's value + Key begin(format("log%08x", g)); + Standalone log = wait(tr2.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + for (auto& kv : log) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + logVal[kv.key.removePrefix(LiteralStringRef("log")).withPrefix(LiteralStringRef("debug"))] = intValue; + } + + // Get opsKeys and validate if it has correct value + state ReadYourWritesTransaction tr3(cx); + state std::map opsVal; // ops key, ops value + Key begin(format("ops%08x", g)); + Standalone ops = wait(tr3.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + // Validate if ops' key value is consistent with logs' key value + for (auto& kv : ops) { + bool inRecord = records.find(kv.key) != records.end(); + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + opsVal[kv.key] = intValue; + if (!inRecord) { + TraceEvent(SevError, "MissingLogKey").detail("OpsKey", 
kv.key); + } + if (inRecord && intValue == 0) { + TraceEvent(SevError, "MissingOpsKey1").detail("OpsKey", kv.key).detail("DebugKey", records[kv.key]); + } + if (inRecord && (self->actorCount == 1 && intValue != logVal[records[kv.key]])) { + // When multiple actors exist, 1 opsKey can have multiple log keys + TraceEvent(SevError, "InconsistentOpsKeyValue") + .detail("OpsKey", kv.key) + .detail("DebugKey", records[kv.key]) + .detail("LogValue", logVal[records[kv.key]]) + .detail("OpValue", intValue); + } + } + + // Validate if there is any ops key missing + for (auto& kv : records) { + uint64_t intValue = opsVal[kv.first]; + if (intValue <= 0) { + TraceEvent(SevError, "MissingOpsKey2") + .detail("OpsKey", kv.first) + .detail("OpsVal", intValue) + .detail("DebugKey", kv.second); + } + } + return Void(); + } + ACTOR Future _check( Database cx, AtomicOpsWorkload* self ) { state int g = 0; state bool ret = true; @@ -251,7 +370,17 @@ struct AtomicOpsWorkload : TestWorkload { logResult += intValue; } if(logResult != opsResult) { - TraceEvent(SevError, "LogAddMismatch").detail("LogResult", logResult).detail("OpResult", opsResult).detail("OpsResultStr", printable(opsResultStr)).detail("Size", opsResultStr.size()); + TraceEvent(SevError, "LogAddMismatch") + .detail("LogResult", logResult) + .detail("OpResult", opsResult) + .detail("OpsResultStr", printable(opsResultStr)) + .detail("Size", opsResultStr.size()) + .detail("LowerBoundSum", self->lbsum) + .detail("UperBoundSum", self->ubsum); + wait(self->dumpLogKV(cx, g)); + wait(self->dumpDebugKV(cx, g)); + wait(self->dumpOpsKV(cx, g)); + wait(self->validateOpsKey(cx, self, g)); } } break; From c4d1e6e1a90af60c933a6f9d8a3b02095b6f91a9 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 4 Nov 2019 16:10:08 -0800 Subject: [PATCH 132/184] Trace:Severity:Include SevNoInfo to mute trace Define SevFRMutationInfo to trace mutations in restore. 
--- fdbserver/RestoreLoader.actor.cpp | 8 ++++++-- fdbserver/RestoreUtil.h | 3 +++ flow/Knobs.cpp | 4 +++- flow/Trace.h | 17 +++++++++-------- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 4263cad3d4..c75d5a470b 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -453,7 +453,9 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL const uint8_t* v = vReader.consume(vLen); MutationRef mutation((MutationRef::Type)type, KeyRef(k, kLen), KeyRef(v, vLen)); - //TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", commitVersion).detail("ParsedMutation", mutation.toString()); + TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug") + .detail("CommitVersion", commitVersion) + .detail("ParsedMutation", mutation.toString()); kvOps[commitVersion].push_back_deep(kvOps[commitVersion].arena(), mutation); ASSERT_WE_THINK(kLen >= 0 && kLen < val.size()); ASSERT_WE_THINK(vLen >= 0 && vLen < val.size()); @@ -515,7 +517,9 @@ ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsM // We cache all kv operations into kvOps, and apply all kv operations later in one place kvOps.insert(std::make_pair(version, VectorRef())); - //TraceEvent(SevDebug, "FastRestore_VerboseDebug").detail("CommitVersion", version).detail("ParsedMutationKV", m.toString()); + TraceEvent(SevFRMutationInfo, "FastRestore_VerboseDebug") + .detail("CommitVersion", version) + .detail("ParsedMutationKV", m.toString()); ASSERT_WE_THINK(kvOps.find(version) != kvOps.end()); kvOps[version].push_back_deep(kvOps[version].arena(), m); diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h index 9045c9828e..0d7fa0e720 100644 --- a/fdbserver/RestoreUtil.h +++ b/fdbserver/RestoreUtil.h @@ -34,6 +34,9 @@ #include #include +// #define SevFRMutationInfo SevNoInfo +#define SevFRMutationInfo SevInfo + enum class RestoreRole { 
Invalid = 0, Master = 1, Loader, Applier }; BINARY_SERIALIZABLE(RestoreRole); std::string getRoleStr(RestoreRole role); diff --git a/flow/Knobs.cpp b/flow/Knobs.cpp index 9cdc510577..62d722ba83 100644 --- a/flow/Knobs.cpp +++ b/flow/Knobs.cpp @@ -27,6 +27,7 @@ FlowKnobs const* FLOW_KNOBS = new FlowKnobs(); #define init( knob, value ) initKnob( knob, value, #knob ) +// clang-format off FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( AUTOMATIC_TRACE_DUMP, 1 ); init( PREVENT_FAST_SPIN_DELAY, .01 ); @@ -140,7 +141,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( ZERO_LENGTH_FILE_PAD, 1 ); init( TRACE_FLUSH_INTERVAL, 0.25 ); init( TRACE_RETRY_OPEN_INTERVAL, 1.00 ); - init( MIN_TRACE_SEVERITY, isSimulated ? 0 : 10 ); // Related to the trace severity in Trace.h + init( MIN_TRACE_SEVERITY, isSimulated ? 1 : 10 ); // Related to the trace severity in Trace.h init( MAX_TRACE_SUPPRESSIONS, 1e4 ); init( TRACE_SYNC_ENABLED, 0 ); init( TRACE_EVENT_METRIC_UNITS_PER_SAMPLE, 500 ); @@ -183,6 +184,7 @@ FlowKnobs::FlowKnobs(bool randomize, bool isSimulated) { init( LOAD_BALANCE_MAX_BAD_OPTIONS, 1 ); //should be the same as MAX_MACHINES_FALLING_BEHIND init( LOAD_BALANCE_PENALTY_IS_BAD, true ); } +// clang-format on static std::string toLower( std::string const& name ) { std::string lower_name; diff --git a/flow/Trace.h b/flow/Trace.h index 12d2bb3ade..0d8dc55ff4 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -45,14 +45,15 @@ inline static bool TRACE_SAMPLE() { return false; } extern thread_local int g_trace_depth; enum Severity { - SevSample=1, - SevDebug=5, - SevInfo=10, - SevWarn=20, - SevWarnAlways=30, - SevError=40, - SevMaxUsed=SevError, - SevMax=1000000 + SevNoInfo = 0, + SevSample = 1, + SevDebug = 5, + SevInfo = 10, + SevWarn = 20, + SevWarnAlways = 30, + SevError = 40, + SevMaxUsed = SevError, + SevMax = 1000000 }; class TraceEventFields { From cecef8d0b50599c43baaac19a954e2fe848fa8a5 Mon Sep 17 00:00:00 2001 From: Balachandar Namasivayam 
Date: Mon, 4 Nov 2019 16:19:47 -0800 Subject: [PATCH 133/184] Added contrib folder to foundationdb. New tool called transaction_profiling_analyzer has been added to the folder. It is a python script that parses transaction profiling info and analyzes hot keys and ranges. Also monitoring folder has been moved to the contrib folder. --- CMakeLists.txt | 2 +- .../monitoring}/CMakeLists.txt | 0 .../monitoring}/actor_flamegraph.cpp | 0 contrib/transaction_profiling_analyzer.py | 806 ++++++++++++++++++ 4 files changed, 807 insertions(+), 1 deletion(-) rename {monitoring => contrib/monitoring}/CMakeLists.txt (100%) rename {monitoring => contrib/monitoring}/actor_flamegraph.cpp (100%) create mode 100644 contrib/transaction_profiling_analyzer.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a4c3bfdf4..762ba597c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -201,7 +201,7 @@ add_subdirectory(tests) if(WITH_DOCUMENTATION) add_subdirectory(documentation) endif() -add_subdirectory(monitoring) +add_subdirectory(contrib/monitoring) if(WIN32) add_subdirectory(packaging/msi) diff --git a/monitoring/CMakeLists.txt b/contrib/monitoring/CMakeLists.txt similarity index 100% rename from monitoring/CMakeLists.txt rename to contrib/monitoring/CMakeLists.txt diff --git a/monitoring/actor_flamegraph.cpp b/contrib/monitoring/actor_flamegraph.cpp similarity index 100% rename from monitoring/actor_flamegraph.cpp rename to contrib/monitoring/actor_flamegraph.cpp diff --git a/contrib/transaction_profiling_analyzer.py b/contrib/transaction_profiling_analyzer.py new file mode 100644 index 0000000000..c7d6e0c602 --- /dev/null +++ b/contrib/transaction_profiling_analyzer.py @@ -0,0 +1,806 @@ +""" +Requirements: +python3 +fdb python bindings +optional packages: + dateparser (for human date parsing) + sortedcontainers (for estimating key range read/write density) +""" + + +import argparse +from collections import defaultdict +from enum import Enum +import fdb +from fdb.impl import strinc 
+import json +from json import JSONEncoder +import logging +import struct +from bisect import bisect_left +import time + +PROTOCOL_VERSION_5_2 = 0x0FDB00A552000001 +PROTOCOL_VERSION_6_0 = 0x0FDB00A570010001 +PROTOCOL_VERSION_6_1 = 0x0FDB00B061060001 +PROTOCOL_VERSION_6_2 = 0x0FDB00B062010001 +supported_protocol_versions = frozenset([PROTOCOL_VERSION_5_2, PROTOCOL_VERSION_6_0, PROTOCOL_VERSION_6_1, + PROTOCOL_VERSION_6_2]) + + +fdb.api_version(600) + +BASIC_FORMAT = "%(asctime)s - %(levelname)-8s %(message)s" +LOG_PATH = "transaction_profiling_analyzer.log" + + +def setup_logger(name): + root = logging.getLogger(name) + root.setLevel(logging.DEBUG) + root.propagate = False + + file_formatter = logging.Formatter(BASIC_FORMAT) + + file_handler = logging.FileHandler(LOG_PATH) + file_handler.setFormatter(file_formatter) + file_handler.setLevel(logging.DEBUG) + + root.addHandler(file_handler) + + return root + + +logger = setup_logger(__name__) + + +class ByteBuffer(object): + def __init__(self, val): + self._offset = 0 + self.val = val + + def get_bytes(self, n): + if self._offset + n > len(self.val): + raise IndexError("Request to read %d bytes with only %d remaining" % (n, self.get_remaining_bytes())) + ret = self.val[self._offset:self._offset + n] + self._offset += n + return ret + + def get_int(self): + return struct.unpack("= PROTOCOL_VERSION_6_2: + self.transaction_priority_type = bb.get_int() + + +class GetInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.value_size = bb.get_int() + self.key = bb.get_bytes_with_length() + + +class GetRangeInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.range_size = bb.get_int() + self.key_range = bb.get_key_range() + + +class CommitInfo(BaseInfo): + def __init__(self, bb, full_output=True): + super().__init__(bb.get_double()) + self.latency = bb.get_double() + self.num_mutations = bb.get_int() 
+ self.commit_bytes = bb.get_int() + + read_conflict_range = bb.get_key_range_list() + if full_output: + self.read_conflict_range = read_conflict_range + write_conflict_range = bb.get_key_range_list() + if full_output: + self.write_conflict_range = write_conflict_range + mutations = bb.get_mutation_list() + if full_output: + self.mutations = mutations + + self.read_snapshot_version = bb.get_long() + + +class ErrorGetInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + self.key = bb.get_bytes_with_length() + + +class ErrorGetRangeInfo(BaseInfo): + def __init__(self, bb): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + self.key_range = bb.get_key_range() + + +class ErrorCommitInfo(BaseInfo): + def __init__(self, bb, full_output=True): + super().__init__(bb.get_double()) + self.error_code = bb.get_int() + + read_conflict_range = bb.get_key_range_list() + if full_output: + self.read_conflict_range = read_conflict_range + write_conflict_range = bb.get_key_range_list() + if full_output: + self.write_conflict_range = write_conflict_range + mutations = bb.get_mutation_list() + if full_output: + self.mutations = mutations + + self.read_snapshot_version = bb.get_long() + + +class UnsupportedProtocolVersionError(Exception): + def __init__(self, protocol_version): + super().__init__("Unsupported protocol version 0x%0.2X" % protocol_version) + + +class ClientTransactionInfo: + def __init__(self, bb, full_output=True, type_filter=None): + self.get_version = None + self.gets = [] + self.get_ranges = [] + self.commit = None + self.error_gets = [] + self.error_get_ranges = [] + self.error_commits = [] + + protocol_version = bb.get_long() + if protocol_version not in supported_protocol_versions: + raise UnsupportedProtocolVersionError(protocol_version) + while bb.get_remaining_bytes(): + event = bb.get_int() + if event == 0: + # we need to read it to consume the buffer even if we don't want to store 
it + get_version = GetVersionInfo(bb, protocol_version) + if (not type_filter or "get_version" in type_filter): + self.get_version = get_version + elif event == 1: + get = GetInfo(bb) + if (not type_filter or "get" in type_filter): + # because of the crappy json serializtion using __dict__ we have to set the list here otherwise + # it doesn't print + if not self.gets: self.gets = [] + self.gets.append(get) + elif event == 2: + get_range = GetRangeInfo(bb) + if (not type_filter or "get_range" in type_filter): + if not self.get_ranges: self.get_ranges = [] + self.get_ranges.append(get_range) + elif event == 3: + commit = CommitInfo(bb, full_output=full_output) + if (not type_filter or "commit" in type_filter): + self.commit = commit + elif event == 4: + error_get = ErrorGetInfo(bb) + if (not type_filter or "error_gets" in type_filter): + if not self.error_gets: self.error_gets = [] + self.error_gets.append(error_get) + elif event == 5: + error_get_range = ErrorGetRangeInfo(bb) + if (not type_filter or "error_get_range" in type_filter): + if not self.error_get_ranges: self.error_get_ranges = [] + self.error_get_ranges.append(error_get_range) + elif event == 6: + error_commit = ErrorCommitInfo(bb, full_output=full_output) + if (not type_filter or "error_commit" in type_filter): + if not self.error_commits: self.error_commits = [] + self.error_commits.append(error_commit) + else: + raise Exception("Unknown event type %d" % event) + + def has_types(self): + return self.get_version or self.gets or self.get_ranges or self.commit or self.error_gets \ + or self.error_get_ranges or self.error_commits + + def to_json(self): + return json.dumps(self, cls=ObjJsonEncoder, sort_keys=True) + + +class TransactionInfoLoader(object): + max_num_chunks_to_store = 1000 # Each chunk would be 100 KB in size + + def __init__(self, db, full_output=True, type_filter=None, min_timestamp=None, max_timestamp=None): + self.db = db + self.full_output = full_output + self.type_filter = type_filter 
+ self.min_timestamp = min_timestamp + self.max_timestamp = max_timestamp + ''' + Keys look like this + FF - 2 bytes \xff\x02 + SSSSSSSSSS - 10 bytes Version Stamp + RRRRRRRRRRRRRRRR - 16 bytes Transaction id + NNNN - 4 Bytes Chunk number + TTTT - 4 Bytes Total number of chunks + ''' + sample_key = "FF/fdbClientInfo/client_latency/SSSSSSSSSS/RRRRRRRRRRRRRRRR/NNNNTTTT/" + + self.client_latency_start = b'\xff\x02/fdbClientInfo/client_latency/' + self.client_latency_start_key_selector = fdb.KeySelector.first_greater_than(self.client_latency_start) + self.client_latency_end_key_selector = fdb.KeySelector.first_greater_or_equal(strinc(self.client_latency_start)) + self.version_stamp_start_idx = sample_key.index('S') + self.version_stamp_end_idx = sample_key.rindex('S') + self.tr_id_start_idx = sample_key.index('R') + self.tr_id_end_idx = sample_key.rindex('R') + self.chunk_num_start_idx = sample_key.index('N') + self.num_chunks_start_idx = sample_key.index('T') + + self.tr_info_map = {} + self.num_chunks_stored = 0 + self.num_transactions_discarded = 0 + + def _check_and_adjust_chunk_cache_size(self): + if self.num_chunks_stored > self.max_num_chunks_to_store: + c_list = self.tr_info_map.pop(next(iter(self.tr_info_map))) + self.num_chunks_stored -= len(c_list) + self.num_transactions_discarded += 1 + + def parse_key(self, k): + version_stamp_bytes = k[self.version_stamp_start_idx:self.version_stamp_end_idx + 1] + tr_id = k[self.tr_id_start_idx:self.tr_id_end_idx + 1] + num_chunks = struct.unpack(">i", k[self.num_chunks_start_idx:self.num_chunks_start_idx + 4])[0] + chunk_num = struct.unpack(">i", k[self.chunk_num_start_idx:self.chunk_num_start_idx + 4])[0] + return version_stamp_bytes, tr_id, num_chunks, chunk_num + + def get_key_prefix_for_version_stamp(self, version_stamp): + return self.client_latency_start + struct.pack(">Q", version_stamp) + b'\x00\x00' + + @fdb.transactional + def find_version_for_timestamp(self, tr, timestamp, start): + """ + Uses Timekeeper to 
find the closest version to a timestamp. + If start is True, will find the greatest version at or before timestamp. + If start is False, will find the smallest version at or after the timestamp. + + :param tr: + :param timestamp: + :param start: + :return: + """ + tr.options.set_read_system_keys() + tr.options.set_read_lock_aware() + timekeeper_prefix = b'\xff\x02/timeKeeper/map/' + timestamp_packed = fdb.tuple.pack((timestamp,)) + if start: + start_key = timekeeper_prefix + end_key = fdb.KeySelector.first_greater_than(timekeeper_prefix + timestamp_packed) + reverse = True + else: + start_key = fdb.KeySelector.first_greater_or_equal(timekeeper_prefix + timestamp_packed) + end_key = fdb.KeySelector.first_greater_or_equal(strinc(timekeeper_prefix)) + reverse = False + for k, v in tr.snapshot.get_range(start_key, end_key, limit=1, reverse=reverse): + return fdb.tuple.unpack(v)[0] + return 0 if start else 0x8000000000000000 # we didn't find any timekeeper data so find the max range + + def fetch_transaction_info(self): + if self.min_timestamp: + start_version = self.find_version_for_timestamp(self.db, self.min_timestamp, True) + logger.debug("Using start version %s" % start_version) + start_key = self.get_key_prefix_for_version_stamp(start_version) + else: + start_key = self.client_latency_start_key_selector + + if self.max_timestamp: + end_version = self.find_version_for_timestamp(self.db, self.max_timestamp, False) + logger.debug("Using end version %s" % end_version) + end_key = self.get_key_prefix_for_version_stamp(end_version) + else: + end_key = self.client_latency_end_key_selector + + valid_transaction_infos = 0 + invalid_transaction_infos = 0 + + def build_client_transaction_info(v): + return ClientTransactionInfo(ByteBuffer(v), full_output=self.full_output, type_filter=self.type_filter) + + more = True + tr = self.db.create_transaction() + while more: + tr.options.set_read_system_keys() + tr.options.set_read_lock_aware() + found = 0 + buffer = [] + try: + 
logger.debug("Querying [%s:%s]" % (start_key, end_key)) + transaction_info_range = tr.snapshot.get_range(start_key, end_key, + streaming_mode=fdb.impl.StreamingMode.want_all) + for k, v in transaction_info_range: + found += 1 + #logger.debug(k) + start_key = fdb.KeySelector.first_greater_than(k) + + _, tr_id, num_chunks, chunk_num = self.parse_key(k) + + #logger.debug("num_chunks=%d, chunk_num=%d" % (num_chunks,chunk_num)) + + if num_chunks == 1: + assert chunk_num == 1 + try: + info = build_client_transaction_info(v) + if info.has_types(): + buffer.append(info) + valid_transaction_infos += 1 + except UnsupportedProtocolVersionError as e: + invalid_transaction_infos += 1 + except ValueError: + invalid_transaction_infos += 1 + else: + if chunk_num == 1: + # first chunk + assert tr_id not in self.tr_info_map + self.tr_info_map[tr_id] = [TrInfoChunk(num_chunks, chunk_num, k, v)] + self.num_chunks_stored += 1 + self._check_and_adjust_chunk_cache_size() + else: + if tr_id not in self.tr_info_map: + logger.error("Got a middle chunk without getting beginning part. 
Discarding transaction id: %s\n" % tr_id) + continue + c_list = self.tr_info_map[tr_id] + if c_list[-1].num_chunks != num_chunks or c_list[-1].chunk_num != chunk_num - 1: + self.tr_info_map.pop(tr_id) + self.num_chunks_stored -= len(c_list) + raise Exception("Chunk numbers do not match for Transaction id: %s" % tr_id) + c_list.append(TrInfoChunk(num_chunks, chunk_num, k, v)) + self.num_chunks_stored += 1 + if num_chunks == chunk_num: + self.tr_info_map.pop(tr_id) + self.num_chunks_stored -= len(c_list) + try: + info = build_client_transaction_info(b''.join([chunk.value for chunk in c_list])) + if info.has_types(): + buffer.append(info) + valid_transaction_infos += 1 + except UnsupportedProtocolVersionError as e: + invalid_transaction_infos += 1 + except ValueError: + invalid_transaction_infos += 1 + self._check_and_adjust_chunk_cache_size() + if (valid_transaction_infos + invalid_transaction_infos) % 1000 == 0: + print("Processed valid: %d, invalid: %d" % (valid_transaction_infos, invalid_transaction_infos)) + if found == 0: + more = False + except fdb.FDBError as e: + # if too old then reset and don't wait + if e.code == 1007: + tr.reset() + else: + tr.on_error(e).wait() + for item in buffer: + yield item + + +def has_sortedcontainers(): + try: + import sortedcontainers + return True + except ImportError: + logger.warn("Can't find sortedcontainers so disabling RangeCounter") + return False + + +def has_dateparser(): + try: + import dateparser + return True + except ImportError: + logger.warn("Can't find dateparser so disabling human date parsing") + return False + + +class RangeCounter(object): + def __init__(self, k): + self.k = k + from sortedcontainers import SortedDict + self.ranges = SortedDict() + + def process(self, transaction_info): + for get_range in transaction_info.get_ranges: + self._insert_range(get_range.key_range.start_key, get_range.key_range.end_key) + + def _insert_range(self, start_key, end_key): + keys = self.ranges.keys() + if len(keys) == 0: 
+ self.ranges[start_key] = end_key, 1 + return + + start_pos = bisect_left(keys, start_key) + end_pos = bisect_left(keys, end_key) + #print("start_pos=%d, end_pos=%d" % (start_pos, end_pos)) + + possible_intersection_keys = keys[max(0, start_pos - 1):min(len(keys), end_pos+1)] + + start_range_left = start_key + + for key in possible_intersection_keys: + cur_end_key, cur_count = self.ranges[key] + #logger.debug("key=%s, cur_end_key=%s, cur_count=%d, start_range_left=%s" % (key, cur_end_key, cur_count, start_range_left)) + if start_range_left < key: + if end_key <= key: + self.ranges[start_range_left] = end_key, 1 + return + self.ranges[start_range_left] = key, 1 + start_range_left = key + assert start_range_left >= key + if start_range_left >= cur_end_key: + continue + + # [key, start_range_left) = cur_count + # if key == start_range_left this will get overwritten below + self.ranges[key] = start_range_left, cur_count + + if end_key <= cur_end_key: + # [start_range_left, end_key) = cur_count+1 + # [end_key, cur_end_key) = cur_count + self.ranges[start_range_left] = end_key, cur_count + 1 + if end_key != cur_end_key: + self.ranges[end_key] = cur_end_key, cur_count + start_range_left = end_key + break + else: + # [start_range_left, cur_end_key) = cur_count+1 + self.ranges[start_range_left] = cur_end_key, cur_count+1 + start_range_left = cur_end_key + assert start_range_left <= end_key + + # there may be some range left + if start_range_left < end_key: + self.ranges[start_range_left] = end_key, 1 + + def get_count_for_key(self, key): + if key in self.ranges: + return self.ranges[key][1] + + keys = self.ranges.keys() + index = bisect_left(keys, key) + if index == 0: + return 0 + + index_key = keys[index-1] + if index_key <= key < self.ranges[index_key][0]: + return self.ranges[index_key][1] + return 0 + + def get_range_boundaries(self, shard_finder=None): + total = sum([count for _, (_, count) in self.ranges.items()]) + range_size = total // self.k + output_range_counts 
= [] + + def add_boundary(start, end, count): + if shard_finder: + shard_count = shard_finder.get_shard_count(start, end) + if shard_count == 1: + addresses = shard_finder.get_addresses_for_key(start) + else: + addresses = None + output_range_counts.append((start, end, count, shard_count, addresses)) + else: + output_range_counts.append((start, end, count, None, None)) + + this_range_start_key = None + count_this_range = 0 + for (start_key, (end_key, count)) in self.ranges.items(): + if not this_range_start_key: + this_range_start_key = start_key + count_this_range += count + if count_this_range >= range_size: + add_boundary(this_range_start_key, end_key, count_this_range) + count_this_range = 0 + this_range_start_key = None + if count_this_range > 0: + add_boundary(this_range_start_key, end_key, count_this_range) + + return output_range_counts + + +class ShardFinder(object): + def __init__(self, db): + self.db = db + + @staticmethod + @fdb.transactional + def _get_boundary_keys(tr, begin, end): + tr.options.set_read_lock_aware() + return fdb.locality.get_boundary_keys(tr, begin, end) + + @staticmethod + @fdb.transactional + def _get_addresses_for_key(tr, key): + tr.options.set_read_lock_aware() + return fdb.locality.get_addresses_for_key(tr, key) + + def get_shard_count(self, start_key, end_key): + return len(list(self._get_boundary_keys(self.db, start_key, end_key))) + 1 + + def get_addresses_for_key(self, key): + return [a.decode('ascii') for a in self._get_addresses_for_key(self.db, key).wait()] + + +class TopKeysCounter(object): + mutation_types_to_consider = frozenset([MutationType.SET_VALUE, MutationType.ADD_VALUE]) + + def __init__(self, k): + self.k = k + self.reads = defaultdict(lambda: 0) + self.writes = defaultdict(lambda: 0) + + def process(self, transaction_info): + for get in transaction_info.gets: + self.reads[get.key] += 1 + if transaction_info.commit: + for mutation in transaction_info.commit.mutations: + if mutation.code in 
self.mutation_types_to_consider: + self.writes[mutation.param_one] += 1 + + def _get_range_boundaries(self, counts, shard_finder=None): + total = sum([v for (k, v) in counts.items()]) + range_size = total // self.k + key_counts_sorted = sorted(counts.items()) + output_range_counts = [] + + def add_boundary(start, end, count): + if shard_finder: + shard_count = shard_finder.get_shard_count(start, end) + if shard_count == 1: + addresses = shard_finder.get_addresses_for_key(start) + else: + addresses = None + output_range_counts.append((start, end, count, shard_count, addresses)) + else: + output_range_counts.append((start, end, count, None, None)) + + start_key = None + count_this_range = 0 + for (k, v) in key_counts_sorted: + if not start_key: + start_key = k + count_this_range += v + if count_this_range >= range_size: + add_boundary(start_key, k, count_this_range) + count_this_range = 0 + start_key = None + if count_this_range > 0: + add_boundary(start_key, k, count_this_range) + + return output_range_counts + + def _get_top_k(self, counts): + count_key_pairs = sorted([(v, k) for (k, v) in counts.items()], reverse=True) + return count_key_pairs[0:self.k] + + def get_top_k_reads(self): + return self._get_top_k(self.reads) + + def get_top_k_writes(self): + return self._get_top_k(self.writes) + + def get_k_read_range_boundaries(self, shard_finder=None): + return self._get_range_boundaries(self.reads, shard_finder) + + def get_k_write_range_boundaries(self, shard_finder=None): + return self._get_range_boundaries(self.writes, shard_finder) + + +def connect(cluster_file=None): + db = fdb.open(cluster_file=cluster_file) + return db + + +def main(): + parser = argparse.ArgumentParser(description="TransactionProfilingAnalyzer") + parser.add_argument("-C", "--cluster-file", type=str, help="Cluster file") + parser.add_argument("--full-output", action="store_true", help="Print full output from mutations") + parser.add_argument("--filter-get-version", action="store_true", + 
help="Include get_version type. If no filter args are given all will be returned.") + parser.add_argument("--filter-get", action="store_true", + help="Include get type. If no filter args are given all will be returned.") + parser.add_argument("--filter-get-range", action="store_true", + help="Include get_range type. If no filter args are given all will be returned.") + parser.add_argument("--filter-commit", action="store_true", + help="Include commit type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-get", action="store_true", + help="Include error_get type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-get-range", action="store_true", + help="Include error_get_range type. If no filter args are given all will be returned.") + parser.add_argument("--filter-error-commit", action="store_true", + help="Include error_commit type. If no filter args are given all will be returned.") + start_time_group = parser.add_mutually_exclusive_group() + start_time_group.add_argument("--min-timestamp", type=int, help="Don't return events older than this epoch time") + start_time_group.add_argument("-s", "--start-time", type=str, + help="Don't return events older than this parsed time") + end_time_group = parser.add_mutually_exclusive_group() + end_time_group.add_argument("--max-timestamp", type=int, help="Don't return events newer than this epoch time") + end_time_group.add_argument("-e", "--end-time", type=str, help="Don't return events older than this parsed time") + parser.add_argument("--top-keys", type=int, help="If specified will output this many top keys for reads or writes", default=0) + args = parser.parse_args() + + type_filter = set() + if args.filter_get_version: type_filter.add("get_version") + if args.filter_get: type_filter.add("get") + if args.filter_get_range: type_filter.add("get_range") + if args.filter_commit: type_filter.add("commit") + if args.filter_error_get: 
type_filter.add("error_get") + if args.filter_error_get_range: type_filter.add("error_get_range") + if args.filter_error_commit: type_filter.add("error_commit") + top_keys = args.top_keys + key_counter = TopKeysCounter(top_keys) if top_keys else None + range_counter = RangeCounter(top_keys) if (has_sortedcontainers() and top_keys) else None + full_output = args.full_output or (top_keys is not None) + + if args.min_timestamp: + min_timestamp = args.min_timestamp + elif args.start_time: + if not has_dateparser(): + raise Exception("Can't find dateparser needed to parse human dates") + import dateparser + min_timestamp = int(dateparser.parse(args.start_time).timestamp()) + else: + raise Exception("Must specify start time") + + if args.max_timestamp: + max_timestamp = args.max_timestamp + elif args.end_time: + if not has_dateparser(): + raise Exception("Can't find dateparser needed to parse human dates") + import dateparser + max_timestamp = int(dateparser.parse(args.end_time).timestamp()) + else: + raise Exception("Must specify end time") + + now = time.time() + if max_timestamp > now: + raise Exception("max_timestamp is %d seconds in the future" % (max_timestamp - now)) + if min_timestamp > now: + raise Exception("min_timestamp is %d seconds in the future" % (min_timestamp - now)) + + logger.info("Loading transactions from %d to %d" % (min_timestamp, max_timestamp)) + + db = connect(cluster_file=args.cluster_file) + loader = TransactionInfoLoader(db, full_output=full_output, type_filter=type_filter, + min_timestamp=min_timestamp, max_timestamp=max_timestamp) + for info in loader.fetch_transaction_info(): + if info.has_types(): + if not key_counter and not range_counter: + print(info.to_json()) + else: + if key_counter: + key_counter.process(info) + if range_counter: + range_counter.process(info) + + if key_counter: + def print_top(top): + for (count, key) in top: + print("%s %d" % (key, count)) + + def print_range_boundaries(range_boundaries): + for (start, end, 
count, shard_count, addresses) in range_boundaries: + if not shard_count: + print("[%s, %s] %d" % (start, end, count)) + else: + addresses_string = "addresses=%s" % ','.join(addresses) if addresses else '' + print("[%s, %s] %d shards=%d %s" % (start, end, count, shard_count, addresses_string)) + + shard_finder = ShardFinder(db) + top_reads = key_counter.get_top_k_reads() + if top_reads: + print("Top %d reads:" % min(top_keys, len(top_reads))) + print_top(top_reads) + print("Approx equal sized gets range boundaries:") + print_range_boundaries(key_counter.get_k_read_range_boundaries(shard_finder=shard_finder)) + top_writes = key_counter.get_top_k_writes() + if top_writes: + print("Top %d writes:" % min(top_keys, len(top_writes))) + print_top(top_writes) + print("Approx equal sized commits range boundaries:") + print_range_boundaries(key_counter.get_k_write_range_boundaries(shard_finder=shard_finder)) + if range_counter: + range_boundaries = range_counter.get_range_boundaries(shard_finder=shard_finder) + if range_boundaries: + print("Approx equal sized get_ranges boundaries:") + print_range_boundaries(range_boundaries) + + +if __name__ == "__main__": + main() + From 457896b80d76b614e1db8e912f2964b5c0f1ac29 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Nov 2019 19:47:45 -0800 Subject: [PATCH 134/184] remote logs use bufferedCursor when peeking from log routers to improve performance bufferedCursor performance has been improved --- fdbserver/LogSystem.h | 9 +- fdbserver/LogSystemPeekCursor.actor.cpp | 96 +++++++++++++++------ fdbserver/TagPartitionedLogSystem.actor.cpp | 29 +++---- 3 files changed, 90 insertions(+), 44 deletions(-) diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index 84389232ab..8660492eb3 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -438,6 +438,7 @@ struct ILogSystem { bool hasNextMessage; UID randomID; int tLogReplicationFactor; + Future more; MergedPeekCursor( std::vector< Reference > const& 
serverCursors, Version begin ); MergedPeekCursor( std::vector>>> const& logServers, int bestServer, int readQuorum, Tag tag, Version begin, Version end, bool parallelGetMore, std::vector const& tLogLocalities, Reference const tLogPolicy, int tLogReplicationFactor ); @@ -484,6 +485,7 @@ struct ILogSystem { bool hasNextMessage; bool useBestSet; UID randomID; + Future more; SetPeekCursor( std::vector> const& logSets, int bestSet, int bestServer, Tag tag, Version begin, Version end, bool parallelGetMore ); SetPeekCursor( std::vector> const& logSets, std::vector< std::vector< Reference > > const& serverCursors, LogMessageVersion const& messageVersion, int bestSet, int bestServer, Optional nextVersion, bool useBestSet ); @@ -572,16 +574,20 @@ struct ILogSystem { }; std::vector> cursors; + std::vector> cursorMessages; std::vector messages; int messageIndex; LogMessageVersion messageVersion; Version end; bool hasNextMessage; bool withTags; + bool knownUnique; Version poppedVersion; Version initialPoppedVersion; bool canDiscardPopped; Future more; + int targetQueueSize; + UID randomID; //FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. bool collectTags; @@ -589,6 +595,7 @@ struct ILogSystem { void combineMessages(); BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ); + BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ); virtual Reference cloneNoMore(); virtual void setProtocolVersion( ProtocolVersion version ); @@ -644,7 +651,7 @@ struct ILogSystem { // Returns when the preceding changes are durable. 
(Later we will need multiple return signals for diffferent durability levels) // If the current epoch has ended, push will not return, and the pushed messages will not be visible in any subsequent epoch (but may become visible in this epoch) - virtual Reference peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore = false ) = 0; + virtual Reference peek( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore = false ) = 0; // Returns (via cursor interface) a stream of messages with the given tag and message versions >= (begin, 0), ordered by message version // If pop was previously or concurrently called with upTo > begin, the cursor may not return all such messages. In that case cursor->popped() will // be greater than begin to reflect that. diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 4c4409c0c0..53ef07b0bf 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -477,6 +477,10 @@ ACTOR Future mergedPeekGetMore(ILogSystem::MergedPeekCursor* self, LogMess } Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { + if( more.isValid() && !more.isReady() ) { + return more; + } + if(!serverCursors.size()) return Never(); @@ -490,7 +494,8 @@ Future ILogSystem::MergedPeekCursor::getMore(TaskPriority taskID) { if (version() > startVersion) return Void(); - return mergedPeekGetMore(this, startVersion, taskID); + more = mergedPeekGetMore(this, startVersion, taskID); + return more; } Future ILogSystem::MergedPeekCursor::onFailed() { @@ -778,6 +783,10 @@ ACTOR Future setPeekGetMore(ILogSystem::SetPeekCursor* self, LogMessageVer } Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { + if( more.isValid() && !more.isReady() ) { + return more; + } + auto startVersion = version(); calcHasMessage(); if( hasMessage() ) @@ -788,7 +797,8 @@ Future ILogSystem::SetPeekCursor::getMore(TaskPriority taskID) { if (version() > startVersion) 
return Void(); - return setPeekGetMore(this, startVersion, taskID); + more = setPeekGetMore(this, startVersion, taskID); + return more; } Future ILogSystem::SetPeekCursor::onFailed() { @@ -909,8 +919,20 @@ Version ILogSystem::MultiCursor::popped() { return std::max(poppedVersion, cursors.back()->popped()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped) { +ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), randomID(deterministicRandom()->randomUniqueID()) { + targetQueueSize = 5000/cursors.size(); messages.reserve(10000); + cursorMessages.resize(cursors.size()); +} + +ILogSystem::BufferedCursor::BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), randomID(deterministicRandom()->randomUniqueID()) { + targetQueueSize = 5000/logServers.size(); + messages.reserve(10000); + cursorMessages.resize(logServers.size()); + for( int i = 0; i < logServers.size(); i++ ) { + Reference cursor( new ILogSystem::ServerPeekCursor( logServers[i], tag, begin, end, false, parallelGetMore ) ); + cursors.push_back( cursor ); + } } void ILogSystem::BufferedCursor::combineMessages() { @@ 
-990,26 +1012,23 @@ void ILogSystem::BufferedCursor::advanceTo(LogMessageVersion n) { ASSERT(false); } -ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, Version maxVersion, TaskPriority taskID ) { - if(cursor->version().version >= maxVersion) { - return Void(); - } +ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Reference cursor, int idx, TaskPriority taskID ) { loop { wait(yield()); + if(cursor->version().version >= self->end || self->cursorMessages[idx].size() > self->targetQueueSize) { + return Void(); + } wait(cursor->getMore(taskID)); self->poppedVersion = std::max(self->poppedVersion, cursor->popped()); if(self->canDiscardPopped) { self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped()); } - if(cursor->version().version >= maxVersion) { + if(cursor->version().version >= self->end) { return Void(); } while(cursor->hasMessage()) { - self->messages.push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? std::vector() : cursor->getTags(), cursor->version())); + self->cursorMessages[idx].push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? 
std::vector() : cursor->getTags(), cursor->version())); cursor->nextMessage(); - if(cursor->version().version >= maxVersion) { - return Void(); - } } } } @@ -1020,37 +1039,55 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori throw internal_error(); } - state Version targetVersion = std::min(self->end, self->messageVersion.version + SERVER_KNOBS->VERSIONS_PER_BATCH); self->messages.clear(); std::vector> loaders; loaders.reserve(self->cursors.size()); - for(auto& cursor : self->cursors) { - loaders.push_back(bufferedGetMoreLoader(self, cursor, targetVersion, taskID)); - } - wait( waitForAll(loaders) ); - wait(yield()); - if(self->collectTags) { + for(int i = 0; i < self->cursors.size(); i++) { + loaders.push_back(bufferedGetMoreLoader(self, self->cursors[i], i, taskID)); + } + + state Future allLoaders = waitForAll(loaders); + state Version minVersion; + loop { + wait( allLoaders || delay(0.005, taskID) ); + minVersion = self->end; + for(auto& cursor : self->cursors) { + minVersion = std::min(minVersion, cursor->version().version); + } + if(minVersion > self->messageVersion.version) { + break; + } + if(allLoaders.isReady()) { + wait(Future(Never())); + } + } + wait( yield() ); + + for(auto &it : self->cursorMessages) { + while(!it.empty() && it.front().version.version < minVersion) { + self->messages.push_back(it.front()); + it.pop_front(); + } + } + if(self->collectTags || self->knownUnique) { std::sort(self->messages.begin(), self->messages.end()); } else { uniquify(self->messages); } + + self->messageVersion = LogMessageVersion(minVersion); self->messageIndex = 0; self->hasNextMessage = self->messages.size() > 0; - Version minVersion = self->end; - for(auto& cursor : self->cursors) { - minVersion = std::min(minVersion, cursor->version().version); - } - self->messageVersion = LogMessageVersion(minVersion); - + if(self->collectTags) { self->combineMessages(); } wait(yield()); if(self->canDiscardPopped && self->poppedVersion > 
self->version().version) { - TraceEvent(SevWarn, "DiscardingPoppedData").detail("Version", self->version().version).detail("Popped", self->poppedVersion); + TraceEvent(SevWarn, "DiscardingPoppedData", self->randomID).detail("Version", self->version().version).detail("Popped", self->poppedVersion); self->messageVersion = std::max(self->messageVersion, LogMessageVersion(self->poppedVersion)); for(auto& cursor : self->cursors) { cursor->advanceTo(self->messageVersion); @@ -1107,8 +1144,11 @@ const LogMessageVersion& ILogSystem::BufferedCursor::version() { } Version ILogSystem::BufferedCursor::getMinKnownCommittedVersion() { - ASSERT(false); - return invalidVersion; + Version res = 0; + for(auto& cursor : cursors) { + res = std::max(res, cursor->getMinKnownCommittedVersion()); + } + return res; } Version ILogSystem::BufferedCursor::popped() { diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 35616454d8..9a62d8b99e 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -538,7 +538,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peekRemote( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) { + Reference peekRemote( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore ) { int bestSet = -1; Version lastBegin = recoveredAt.present() ? 
recoveredAt.get() + 1 : 0; for(int t = 0; t < tLogs.size(); t++) { @@ -552,21 +552,21 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(begin >= lastBegin) { - TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - return Reference( new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, begin, getPeekEnd(), parallelGetMore, std::vector(), Reference(), 0 ) ); + TraceEvent("TLogPeekRemoteBestOnly", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + return Reference( new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, begin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) ); } else { std::vector< Reference > cursors; std::vector< LogMessageVersion > epochEnds; - TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); - cursors.emplace_back(new ILogSystem::MergedPeekCursor( tLogs[bestSet]->logRouters, -1, (int)tLogs[bestSet]->logRouters.size(), tag, lastBegin, getPeekEnd(), parallelGetMore, std::vector(), Reference(), 0 ) ); + TraceEvent("TLogPeekRemoteAddingBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? 
end.get() : getPeekEnd()).detail("BestSet", bestSet).detail("BestSetStart", lastBegin).detail("LogRouterIds", tLogs[bestSet]->logRouterString()); + cursors.emplace_back(new ILogSystem::BufferedCursor( tLogs[bestSet]->logRouters, tag, lastBegin, end.present() ? end.get() + 1 : getPeekEnd(), parallelGetMore ) ); int i = 0; while(begin < lastBegin) { if(i == oldLogData.size()) { - TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size()); + TraceEvent("TLogPeekRemoteDead", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? end.get() : getPeekEnd()).detail("LastBegin", lastBegin).detail("OldLogDataSize", oldLogData.size()); return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } @@ -583,15 +583,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, parallelGetMore ) ); } if(thisBegin < lastBegin) { - TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) + TraceEvent("TLogPeekRemoteAddingOldBest", dbgid).detail("Tag", tag.toString()).detail("Begin", begin).detail("End", end.present() ? 
end.get() : getPeekEnd()).detail("BestOldSet", bestOldSet).detail("LogRouterIds", oldLogData[i].tLogs[bestOldSet]->logRouterString()) .detail("LastBegin", lastBegin).detail("ThisBegin", thisBegin).detail("BestStartVer", oldLogData[i].tLogs[bestOldSet]->startVersion); - cursors.emplace_back(new ILogSystem::MergedPeekCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, -1, (int)oldLogData[i].tLogs[bestOldSet]->logRouters.size(), tag, - thisBegin, lastBegin, parallelGetMore, std::vector(), Reference(), 0)); + cursors.emplace_back(new ILogSystem::BufferedCursor(oldLogData[i].tLogs[bestOldSet]->logRouters, tag, thisBegin, lastBegin, parallelGetMore)); epochEnds.emplace_back(lastBegin); lastBegin = thisBegin; } @@ -602,14 +601,14 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted peek( UID dbgid, Version begin, Tag tag, bool parallelGetMore ) { + virtual Reference peek( UID dbgid, Version begin, Optional end, Tag tag, bool parallelGetMore ) { if(!tLogs.size()) { TraceEvent("TLogPeekNoLogSets", dbgid).detail("Tag", tag.toString()).detail("Begin", begin); return Reference( new ILogSystem::ServerPeekCursor( Reference>>(), tag, begin, getPeekEnd(), false, false ) ); } if(tag.locality == tagLocalityRemoteLog) { - return peekRemote(dbgid, begin, tag, parallelGetMore); + return peekRemote(dbgid, begin, end, tag, parallelGetMore); } else { return peekAll(dbgid, begin, getPeekEnd(), tag, parallelGetMore); } @@ -622,12 +621,12 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted > cursors; for(auto tag : tags) { - cursors.push_back(peek(dbgid, begin, tag, parallelGetMore)); + cursors.push_back(peek(dbgid, begin, end, tag, parallelGetMore)); } return Reference( new ILogSystem::BufferedCursor(cursors, begin, end.present() ? 
end.get() + 1 : getPeekEnd(), true, tLogs[0]->locality == tagLocalityUpgraded, false) ); } From daac8a2c22b5a38a46033857bd473a904b975022 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Nov 2019 20:21:38 -0800 Subject: [PATCH 135/184] Knobified a few variables --- documentation/sphinx/source/release-notes.rst | 2 +- fdbserver/Knobs.cpp | 3 ++- fdbserver/Knobs.h | 3 ++- fdbserver/LogSystemPeekCursor.actor.cpp | 10 +++++----- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst index a761cd2389..fd12e17c9d 100644 --- a/documentation/sphinx/source/release-notes.rst +++ b/documentation/sphinx/source/release-notes.rst @@ -8,7 +8,7 @@ Release Notes Fixes ----- -* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. `(PR #2307) `_. +* Significantly improved the rate at which the transaction logs in a remote region can pull data from the primary region. `(PR #2307) `_ `(PR #2323) `_. * The ``system_kv_size_bytes`` status field could report a size much larger than the actual size of the system keyspace. `(PR #2305) `_. 
6.2.7 diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index c692d80ed9..469b2ecc60 100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -67,7 +67,8 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( PARALLEL_GET_MORE_REQUESTS, 32 ); if( randomize && BUGGIFY ) PARALLEL_GET_MORE_REQUESTS = 2; init( MULTI_CURSOR_PRE_FETCH_LIMIT, 10 ); init( MAX_QUEUE_COMMIT_BYTES, 15e6 ); if( randomize && BUGGIFY ) MAX_QUEUE_COMMIT_BYTES = 5000; - init( VERSIONS_PER_BATCH, VERSIONS_PER_SECOND/20 ); if( randomize && BUGGIFY ) VERSIONS_PER_BATCH = std::max(1,VERSIONS_PER_SECOND/1000); + init( DESIRED_OUTSTANDING_MESSAGES, 5000 ); if( randomize && BUGGIFY ) DESIRED_OUTSTANDING_MESSAGES = deterministicRandom()->randomInt(0,100); + init( DESIRED_GET_MORE_DELAY, 0.005 ); init( CONCURRENT_LOG_ROUTER_READS, 1 ); init( LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED, 1 ); if( randomize && BUGGIFY ) LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED = 0; init( DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME, 1.0 ); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index 924e6a427f..d89566b9f2 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -70,7 +70,8 @@ public: int PARALLEL_GET_MORE_REQUESTS; int MULTI_CURSOR_PRE_FETCH_LIMIT; int64_t MAX_QUEUE_COMMIT_BYTES; - int64_t VERSIONS_PER_BATCH; + int DESIRED_OUTSTANDING_MESSAGES; + double DESIRED_GET_MORE_DELAY; int CONCURRENT_LOG_ROUTER_READS; int LOG_ROUTER_PEEK_FROM_SATELLITES_PREFERRED; // 0==peek from primary, non-zero==peek from satellites double DISK_QUEUE_ADAPTER_MIN_SWITCH_TIME; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 53ef07b0bf..5c84886408 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -920,14 +920,14 @@ Version ILogSystem::MultiCursor::popped() { } ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : 
cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), randomID(deterministicRandom()->randomUniqueID()) { - targetQueueSize = 5000/cursors.size(); - messages.reserve(10000); + targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/cursors.size(); + messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); cursorMessages.resize(cursors.size()); } ILogSystem::BufferedCursor::BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), randomID(deterministicRandom()->randomUniqueID()) { - targetQueueSize = 5000/logServers.size(); - messages.reserve(10000); + targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/logServers.size(); + messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); cursorMessages.resize(logServers.size()); for( int i = 0; i < logServers.size(); i++ ) { Reference cursor( new ILogSystem::ServerPeekCursor( logServers[i], tag, begin, end, false, parallelGetMore ) ); @@ -1051,7 +1051,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori state Future allLoaders = waitForAll(loaders); state Version minVersion; loop { - wait( allLoaders || delay(0.005, taskID) ); + wait( allLoaders || delay(SERVER_KNOBS->DESIRED_GET_MORE_DELAY, taskID) ); minVersion = self->end; for(auto& cursor : self->cursors) { minVersion = std::min(minVersion, cursor->version().version); From cb65641115f92a0b1773dcca478b515fb11fab89 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Nov 2019 20:25:49 -0800 Subject: [PATCH 136/184] updated downloads for 6.2.8 --- documentation/sphinx/source/downloads.rst | 
24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/documentation/sphinx/source/downloads.rst b/documentation/sphinx/source/downloads.rst index 64d13865f0..4f300b9aee 100644 --- a/documentation/sphinx/source/downloads.rst +++ b/documentation/sphinx/source/downloads.rst @@ -10,38 +10,38 @@ macOS The macOS installation package is supported on macOS 10.7+. It includes the client and (optionally) the server. -* `FoundationDB-6.2.7.pkg `_ +* `FoundationDB-6.2.8.pkg `_ Ubuntu ------ The Ubuntu packages are supported on 64-bit Ubuntu 12.04+, but beware of the Linux kernel bug in Ubuntu 12.x. -* `foundationdb-clients-6.2.7-1_amd64.deb `_ -* `foundationdb-server-6.2.7-1_amd64.deb `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1_amd64.deb `_ +* `foundationdb-server-6.2.8-1_amd64.deb `_ (depends on the clients package) RHEL/CentOS EL6 --------------- The RHEL/CentOS EL6 packages are supported on 64-bit RHEL/CentOS 6.x. -* `foundationdb-clients-6.2.7-1.el6.x86_64.rpm `_ -* `foundationdb-server-6.2.7-1.el6.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1.el6.x86_64.rpm `_ +* `foundationdb-server-6.2.8-1.el6.x86_64.rpm `_ (depends on the clients package) RHEL/CentOS EL7 --------------- The RHEL/CentOS EL7 packages are supported on 64-bit RHEL/CentOS 7.x. -* `foundationdb-clients-6.2.7-1.el7.x86_64.rpm `_ -* `foundationdb-server-6.2.7-1.el7.x86_64.rpm `_ (depends on the clients package) +* `foundationdb-clients-6.2.8-1.el7.x86_64.rpm `_ +* `foundationdb-server-6.2.8-1.el7.x86_64.rpm `_ (depends on the clients package) Windows ------- The Windows installer is supported on 64-bit Windows XP and later. It includes the client and (optionally) the server. 
-* `foundationdb-6.2.7-x64.msi `_ +* `foundationdb-6.2.8-x64.msi `_ API Language Bindings ===================== @@ -58,18 +58,18 @@ On macOS and Windows, the FoundationDB Python API bindings are installed as part If you need to use the FoundationDB Python API from other Python installations or paths, download the Python package: -* `foundationdb-6.2.7.tar.gz `_ +* `foundationdb-6.2.8.tar.gz `_ Ruby 1.9.3/2.0.0+ ----------------- -* `fdb-6.2.7.gem `_ +* `fdb-6.2.8.gem `_ Java 8+ ------- -* `fdb-java-6.2.7.jar `_ -* `fdb-java-6.2.7-javadoc.jar `_ +* `fdb-java-6.2.8.jar `_ +* `fdb-java-6.2.8-javadoc.jar `_ Go 1.11+ -------- From f84c2667f0e7aeebbea1351e3f4126f40682d817 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Mon, 4 Nov 2019 20:39:37 -0800 Subject: [PATCH 137/184] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index 72aa8d3851..d58ccfa3e7 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Tue, 5 Nov 2019 01:11:34 -0800 Subject: [PATCH 138/184] Bug fix, DWALPager must flush its page ID queues in order to get an accurate user page count. --- fdbserver/VersionedBTree.actor.cpp | 43 +++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index daf97d46b4..e339de8873 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -1335,19 +1335,12 @@ public: return Void(); } - ACTOR static Future commit_impl(DWALPager *self) { - debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); - - // Write old committed header to Page 1 - self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); - - // Trigger the remap eraser to stop and then wait for it. 
- self->remapUndoStop = true; - wait(self->remapUndoFuture); + // Flush all queues so they have no operations pending. + ACTOR static Future flushQueues(DWALPager *self) { + ASSERT(self->remapUndoFuture.isReady()); // Flush remap queue separately, it's not involved in free page management wait(self->remapQueue.flush()); - self->pHeader->remapQueue = self->remapQueue.getState(); // Flush the free list and delayed free list queues together as they are used by freePage() and newPageID() loop { @@ -1364,6 +1357,22 @@ public: self->freeList.finishFlush(); self->delayedFreeList.finishFlush(); + return Void(); + } + + ACTOR static Future commit_impl(DWALPager *self) { + debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); + + // Write old committed header to Page 1 + self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); + + // Trigger the remap eraser to stop and then wait for it. + self->remapUndoStop = true; + wait(self->remapUndoFuture); + + wait(flushQueues(self)); + + self->pHeader->remapQueue = self->remapQueue.getState(); self->pHeader->freeList = self->freeList.getState(); self->pHeader->delayedFreeList = self->delayedFreeList.getState(); @@ -1476,9 +1485,19 @@ public: return StorageBytes(free, total, pagerSize, free + reusable); } - // Get the number of pages in use but not by the pager itself. 
+ ACTOR static Future getUserPageCount_cleanup(DWALPager *self) { + // Wait for the remap eraser to finish all of its work (not triggering stop) + wait(self->remapUndoFuture); + + // Flush queues so there are no pending freelist operations + wait(flushQueues(self)); + + return Void(); + } + + // Get the number of pages in use by the pager's user Future getUserPageCount() override { - return map(remapUndoFuture, [=](Void) { + return map(getUserPageCount_cleanup(this), [=](Void) { int64_t userPages = pHeader->pageCount - 2 - freeList.numPages - freeList.numEntries - delayedFreeList.numPages - delayedFreeList.numEntries - remapQueue.numPages; debug_printf("DWALPager(%s) userPages=%" PRId64 " totalPageCount=%" PRId64 " freeQueuePages=%" PRId64 " freeQueueCount=%" PRId64 " delayedFreeQueuePages=%" PRId64 " delayedFreeQueueCount=%" PRId64 " remapQueuePages=%" PRId64 " remapQueueCount=%" PRId64 "\n", filename.c_str(), userPages, pHeader->pageCount, freeList.numPages, freeList.numEntries, delayedFreeList.numPages, delayedFreeList.numEntries, remapQueue.numPages, remapQueue.numEntries); From e7210fe8429a58f25c9367223dd27b704e377f97 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 5 Nov 2019 09:42:17 -0800 Subject: [PATCH 139/184] Trace:Resolve review comments and add SevVerbose level --- fdbserver/RestoreUtil.h | 4 ++-- flow/Trace.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h index 0d7fa0e720..a645d3a391 100644 --- a/fdbserver/RestoreUtil.h +++ b/fdbserver/RestoreUtil.h @@ -34,8 +34,8 @@ #include #include -// #define SevFRMutationInfo SevNoInfo -#define SevFRMutationInfo SevInfo +#define SevFRMutationInfo SevVerbose +//#define SevFRMutationInfo SevInfo enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier }; BINARY_SERIALIZABLE(RestoreRole); diff --git a/flow/Trace.h b/flow/Trace.h index 0d8dc55ff4..ff9d6a9673 100644 --- a/flow/Trace.h +++ b/flow/Trace.h @@ -45,7 +45,7 @@ inline static 
bool TRACE_SAMPLE() { return false; } extern thread_local int g_trace_depth; enum Severity { - SevNoInfo = 0, + SevVerbose = 0, SevSample = 1, SevDebug = 5, SevInfo = 10, From f7b3686fc733d0eff605cffbed7fc2ff71344a05 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Mon, 4 Nov 2019 13:49:32 -0800 Subject: [PATCH 140/184] fixed bug in maintaining kill set size --- .../workloads/RemoveServersSafely.actor.cpp | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index d44e4e5b08..7067aef81a 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -452,18 +452,35 @@ struct RemoveServersSafelyWorkload : TestWorkload { // Swap coordinator with one server in the kill set to ensure the number of processes to kill does not increase. // This is needed only if a new coordinator is added to the toKill set in this function and safety check passes if (markExcludeAsFailed && coordExcl.isValid()) { + // Situation where the entirety of original kill set is selected and extra coordinator is added + // Shrink down failed vector to maintain size guarantees + if (toKillMarkFailedArray.size() > toKillArray.size()) { + auto removeServer = toKillMarkFailedArray.begin(); + TraceEvent("RemoveAndKill", functionId) + .detail("Step", "ShrinkFailedKillSet") + .detail("Removing", removeServer->toString()); + toKillMarkFailedArray.erase(removeServer); + } auto removeServer = toKill.begin(); TraceEvent("RemoveAndKill", functionId) - .detail("Step", "ReplaceKillSet") + .detail("Step", "ReplaceNonFailedKillSet") .detail("Removing", removeServer->toString()) .detail("Adding", coordExcl.toString()); - toKill.erase(removeServer); - toKill.insert(coordExcl); toKillArray.erase(std::remove(toKillArray.begin(), toKillArray.end(), *removeServer), toKillArray.end()); toKillArray.push_back(coordExcl); + 
toKill.erase(removeServer); + toKill.insert(coordExcl); } killProcArray = self->getProcesses(toKill); - TraceEvent("RemoveAndKill", functionId).detail("Step", "Activate Server Exclusion").detail("KillAddrs", toKill.size()).detail("KillProcs", killProcArray.size()).detail("MissingProcs", toKill.size()!=killProcArray.size()).detail("ToKill", describe(toKill)).detail("Addresses", describe(toKillArray)).detail("ClusterAvailable", g_simulator.isAvailable()); + TraceEvent("RemoveAndKill", functionId) + .detail("Step", "Activate Server Exclusion") + .detail("KillAddrs", toKill.size()) + .detail("KillProcs", killProcArray.size()) + .detail("MissingProcs", toKill.size() != killProcArray.size()) + .detail("ToKill", describe(toKill)) + .detail("Addresses", describe(toKillArray)) + .detail("FailedAddresses", describe(toKillMarkFailedArray)) + .detail("ClusterAvailable", g_simulator.isAvailable()); if (markExcludeAsFailed) { wait( excludeServers( cx, toKillMarkFailedArray, true ) ); } From f77a64dce122490e6e0a9984fd8771979018fc89 Mon Sep 17 00:00:00 2001 From: Balachandar Namasivayam Date: Tue, 5 Nov 2019 13:04:00 -0800 Subject: [PATCH 141/184] Move alloc_instrumentation.py to contrib/ --- {tools => contrib}/alloc_instrumentation.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {tools => contrib}/alloc_instrumentation.py (100%) diff --git a/tools/alloc_instrumentation.py b/contrib/alloc_instrumentation.py similarity index 100% rename from tools/alloc_instrumentation.py rename to contrib/alloc_instrumentation.py From b50d26c4da7e1ba26d66d00d3505a56484b65b13 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Tue, 5 Nov 2019 13:46:04 -0800 Subject: [PATCH 142/184] KVStoreTest now runs sqlite and redwood tests. 
--- tests/KVStoreTest.txt | 58 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/tests/KVStoreTest.txt b/tests/KVStoreTest.txt index edb5ac249c..97eb709703 100644 --- a/tests/KVStoreTest.txt +++ b/tests/KVStoreTest.txt @@ -7,11 +7,12 @@ setFraction=0.01 nodeCount=20000000 keyBytes=16 valueBytes=96 -filename=bttest setup=true clear=false count=false useDB=false +storeType=ssd +filename=bttest-sqlite testTitle=Scan testName=KVStoreTest @@ -22,11 +23,12 @@ setFraction=0.01 nodeCount=20000000 keyBytes=16 valueBytes=96 -filename=bttest setup=false clear=false count=true useDB=false +storeType=ssd +filename=bttest-sqlite testTitle=RandomWriteSaturation testName=KVStoreTest @@ -38,8 +40,58 @@ setFraction=1.0 nodeCount=20000000 keyBytes=16 valueBytes=96 -filename=bttest setup=false clear=false count=false useDB=false +storeType=ssd +filename=bttest-sqlite + +testTitle=Insert +testName=KVStoreTest +testDuration=0.0 +operationsPerSecond=28000 +commitFraction=0.001 +setFraction=0.01 +nodeCount=20000000 +keyBytes=16 +valueBytes=96 +setup=true +clear=false +count=false +useDB=false +storeType=ssd-redwood-experimental +filename=bttest-redwood + +testTitle=Scan +testName=KVStoreTest +testDuration=20.0 +operationsPerSecond=28000 +commitFraction=0.0001 +setFraction=0.01 +nodeCount=20000000 +keyBytes=16 +valueBytes=96 +setup=false +clear=false +count=true +useDB=false +storeType=ssd-redwood-experimental +filename=bttest-redwood + +testTitle=RandomWriteSaturation +testName=KVStoreTest +testDuration=20.0 +saturation=true +operationsPerSecond=10000 +commitFraction=0.00005 +setFraction=1.0 +nodeCount=20000000 +keyBytes=16 +valueBytes=96 +setup=false +clear=false +count=false +useDB=false +storeType=ssd-redwood-experimental +filename=bttest-redwood From 4a597fdcce643c7d724590de667410bde91fc86d Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 5 Nov 2019 15:03:41 -0800 Subject: [PATCH 143/184] increase the task priority of 
popping --- fdbserver/TagPartitionedLogSystem.actor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 9a62d8b99e..c052552d80 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -1032,7 +1032,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted popFromLog( TagPartitionedLogSystem* self, Reference>> log, Tag tag, double time ) { state Version last = 0; loop { - wait( delay(time) ); + wait( delay(time, TaskPriority::TLogPop) ); state std::pair to = self->outstandingPops[ std::make_pair(log->get().id(),tag) ]; @@ -1044,7 +1044,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedget().present() ) return Void(); - wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ) ) ); + wait(log->get().interf().popMessages.getReply( TLogPopRequest( to.first, to.second, tag ), TaskPriority::TLogPop ) ); last = to.first; } catch (Error& e) { From a8ca47beffd64f4707589fbe72e41e5dbade76d3 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 5 Nov 2019 18:07:30 -0800 Subject: [PATCH 144/184] optimized memory allocations by using VectorRef instead of std::vector --- fdbclient/FDBTypes.h | 4 +- fdbserver/LogRouter.actor.cpp | 5 ++- fdbserver/LogSystem.h | 29 ++++++++------ fdbserver/LogSystemPeekCursor.actor.cpp | 44 +++++++++------------ fdbserver/OldTLogServer_6_0.actor.cpp | 13 +++--- fdbserver/TLogServer.actor.cpp | 13 +++--- fdbserver/TagPartitionedLogSystem.actor.cpp | 14 +++---- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index 690ebb9865..d88355735e 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -110,10 +110,10 @@ enum { txsTagOld = -1, invalidTagOld = -100 }; struct TagsAndMessage { StringRef message; - std::vector tags; + VectorRef tags; TagsAndMessage() {} - 
TagsAndMessage(StringRef message, const std::vector& tags) : message(message), tags(tags) {} + TagsAndMessage(StringRef message, VectorRef tags) : message(message), tags(tags) {} }; struct KeyRangeRef; diff --git a/fdbserver/LogRouter.actor.cpp b/fdbserver/LogRouter.actor.cpp index 53fa69b163..46686ef677 100644 --- a/fdbserver/LogRouter.actor.cpp +++ b/fdbserver/LogRouter.actor.cpp @@ -245,6 +245,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { state Version ver = 0; state std::vector messages; + state Arena arena; while (true) { state bool foundMessage = r->hasMessage(); if (!foundMessage || r->version().version != ver) { @@ -260,6 +261,7 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { lastVer = ver; ver = r->version().version; messages.clear(); + arena = Arena(); if (!foundMessage) { ver--; //ver is the next possible version we will get data for @@ -277,8 +279,9 @@ ACTOR Future pullAsyncData( LogRouterData *self ) { tagAndMsg.message = r->getMessageWithTags(); tags.clear(); self->logSet.getPushLocations(r->getTags(), tags, 0); + tagAndMsg.tags.reserve(arena, tags.size()); for (const auto& t : tags) { - tagAndMsg.tags.emplace_back(tagLocalityRemoteLog, t); + tagAndMsg.tags.push_back(arena, Tag(tagLocalityRemoteLog, t)); } messages.push_back(std::move(tagAndMsg)); diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index 8660492eb3..a65ccf56f3 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -231,7 +231,7 @@ public: return resultEntries.size() == 0; } - void getPushLocations(std::vector const& tags, std::vector& locations, int locationOffset, + void getPushLocations(VectorRef tags, std::vector& locations, int locationOffset, bool allLocations = false) { if(locality == tagLocalitySatellite) { for(auto& t : tags) { @@ -309,7 +309,7 @@ struct ILogSystem { //pre: only callable if hasMessage() returns true //return the tags associated with the message for the current sequence - virtual const std::vector& getTags() = 0; + virtual 
VectorRef getTags() = 0; //pre: only callable if hasMessage() returns true //returns the arena containing the contents of getMessage(), getMessageWithTags(), and reader() @@ -382,7 +382,7 @@ struct ILogSystem { LogMessageVersion messageVersion, end; Version poppedVersion; int32_t messageLength, rawLength; - std::vector tags; + VectorRef tags; bool hasMsg; Future more; UID randomID; @@ -405,7 +405,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -454,7 +454,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -500,7 +500,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -534,7 +534,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -557,12 +557,12 @@ struct ILogSystem { struct BufferedMessage { Arena arena; StringRef message; - std::vector tags; + VectorRef tags; LogMessageVersion version; BufferedMessage() 
{} explicit BufferedMessage( Version version ) : version(version) {} - BufferedMessage( Arena arena, StringRef message, const std::vector& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {} + BufferedMessage( Arena arena, StringRef message, const VectorRef& tags, const LogMessageVersion& version ) : arena(arena), message(message), tags(tags), version(version) {} bool operator < (BufferedMessage const& r) const { return version < r.version; @@ -582,6 +582,7 @@ struct ILogSystem { bool hasNextMessage; bool withTags; bool knownUnique; + Version minKnownCommittedVersion; Version poppedVersion; Version initialPoppedVersion; bool canDiscardPopped; @@ -591,7 +592,7 @@ struct ILogSystem { //FIXME: collectTags is needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. bool collectTags; - std::vector tags; + VectorRef tags; void combineMessages(); BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ); @@ -605,7 +606,7 @@ struct ILogSystem { virtual void nextMessage(); virtual StringRef getMessage(); virtual StringRef getMessageWithTags(); - virtual const std::vector& getTags(); + virtual VectorRef getTags(); virtual void advanceTo(LogMessageVersion n); virtual Future getMore(TaskPriority taskID = TaskPriority::TLogPeekReply); virtual Future onFailed(); @@ -717,7 +718,11 @@ struct ILogSystem { virtual Future onLogSystemConfigChange() = 0; // Returns when the log system configuration has changed due to a tlog rejoin. 
- virtual void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations = false) = 0; + virtual void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations = false) = 0; + + void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations = false) { + getPushLocations(VectorRef((Tag*)&tags.front(), tags.size()), locations, allLocations); + } virtual bool hasRemoteLogs() const = 0; diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 5c84886408..081468e575 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -92,10 +92,7 @@ void ILogSystem::ServerPeekCursor::nextMessage() { uint16_t tagCount; rd.checkpoint(); rd >> messageLength >> messageVersion.sub >> tagCount; - tags.resize(tagCount); - for(int i = 0; i < tagCount; i++) { - rd >> tags[i]; - } + tags = VectorRef((Tag*)rd.readBytes(tagCount*sizeof(Tag)), tagCount); rawLength = messageLength + sizeof(messageLength); messageLength -= (sizeof(messageVersion.sub) + sizeof(tagCount) + tagCount*sizeof(Tag)); hasMsg = true; @@ -112,7 +109,7 @@ StringRef ILogSystem::ServerPeekCursor::getMessageWithTags() { return StringRef( (uint8_t const*)rd.readBytes(rawLength), rawLength); } -const std::vector& ILogSystem::ServerPeekCursor::getTags() { +VectorRef ILogSystem::ServerPeekCursor::getTags() { return tags; } @@ -438,7 +435,7 @@ StringRef ILogSystem::MergedPeekCursor::getMessageWithTags() { return serverCursors[currentCursor]->getMessageWithTags(); } -const std::vector& ILogSystem::MergedPeekCursor::getTags() { +VectorRef ILogSystem::MergedPeekCursor::getTags() { return serverCursors[currentCursor]->getTags(); } @@ -702,7 +699,7 @@ StringRef ILogSystem::SetPeekCursor::getMessage() { return serverCursors[current StringRef ILogSystem::SetPeekCursor::getMessageWithTags() { return serverCursors[currentSet][currentCursor]->getMessageWithTags(); } -const std::vector& 
ILogSystem::SetPeekCursor::getTags() { +VectorRef ILogSystem::SetPeekCursor::getTags() { return serverCursors[currentSet][currentCursor]->getTags(); } @@ -869,7 +866,7 @@ StringRef ILogSystem::MultiCursor::getMessageWithTags() { return cursors.back()->getMessageWithTags(); } -const std::vector& ILogSystem::MultiCursor::getTags() { +VectorRef ILogSystem::MultiCursor::getTags() { return cursors.back()->getTags(); } @@ -919,13 +916,13 @@ Version ILogSystem::MultiCursor::popped() { return std::max(poppedVersion, cursors.back()->popped()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), randomID(deterministicRandom()->randomUniqueID()) { +ILogSystem::BufferedCursor::BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ) : cursors(cursors), messageVersion(begin), end(end), withTags(withTags), collectTags(collectTags), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(canDiscardPopped), knownUnique(false), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) { targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/cursors.size(); messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); cursorMessages.resize(cursors.size()); } -ILogSystem::BufferedCursor::BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), 
randomID(deterministicRandom()->randomUniqueID()) { +ILogSystem::BufferedCursor::BufferedCursor( std::vector>>> const& logServers, Tag tag, Version begin, Version end, bool parallelGetMore ) : messageVersion(begin), end(end), withTags(true), collectTags(false), hasNextMessage(false), messageIndex(0), poppedVersion(0), initialPoppedVersion(0), canDiscardPopped(false), knownUnique(true), minKnownCommittedVersion(0), randomID(deterministicRandom()->randomUniqueID()) { targetQueueSize = SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES/logServers.size(); messages.reserve(SERVER_KNOBS->DESIRED_OUTSTANDING_MESSAGES); cursorMessages.resize(logServers.size()); @@ -940,22 +937,22 @@ void ILogSystem::BufferedCursor::combineMessages() { return; } - tags.clear(); - tags.push_back(messages[messageIndex].tags[0]); + std::vector tempTags; + tempTags.push_back(messages[messageIndex].tags[0]); for(int i = messageIndex + 1; i < messages.size() && messages[messageIndex].version == messages[i].version; i++) { - tags.push_back(messages[i].tags[0]); + tempTags.push_back(messages[i].tags[0]); messageIndex = i; } auto& msg = messages[messageIndex]; BinaryWriter messageWriter(Unversioned()); - messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); - for(auto& t : tags) { + messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tempTags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); + msg.tags = VectorRef((Tag*)(((uint8_t*)messageWriter.getData())+messageWriter.getLength()), tags.size()); + for(auto t : tempTags) { messageWriter << t; } messageWriter.serializeBytes(msg.message); Standalone val = messageWriter.toValue(); msg.arena = val.arena(); - msg.tags = tags; msg.message = val; } @@ -1003,7 +1000,7 @@ StringRef ILogSystem::BufferedCursor::getMessageWithTags() { return messages[messageIndex].message; } -const std::vector& 
ILogSystem::BufferedCursor::getTags() { +VectorRef ILogSystem::BufferedCursor::getTags() { ASSERT(withTags); return messages[messageIndex].tags; } @@ -1020,6 +1017,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe } wait(cursor->getMore(taskID)); self->poppedVersion = std::max(self->poppedVersion, cursor->popped()); + self->minKnownCommittedVersion = std::max(self->minKnownCommittedVersion, cursor->getMinKnownCommittedVersion()); if(self->canDiscardPopped) { self->initialPoppedVersion = std::max(self->initialPoppedVersion, cursor->popped()); } @@ -1027,7 +1025,7 @@ ACTOR Future bufferedGetMoreLoader( ILogSystem::BufferedCursor* self, Refe return Void(); } while(cursor->hasMessage()) { - self->cursorMessages[idx].push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? std::vector() : cursor->getTags(), cursor->version())); + self->cursorMessages[idx].push_back(ILogSystem::BufferedCursor::BufferedMessage(cursor->arena(), (!self->withTags || self->collectTags) ? cursor->getMessage() : cursor->getMessageWithTags(), !self->withTags ? 
VectorRef() : cursor->getTags(), cursor->version())); cursor->nextMessage(); } } @@ -1053,7 +1051,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori loop { wait( allLoaders || delay(SERVER_KNOBS->DESIRED_GET_MORE_DELAY, taskID) ); minVersion = self->end; - for(auto& cursor : self->cursors) { + for(auto cursor : self->cursors) { minVersion = std::min(minVersion, cursor->version().version); } if(minVersion > self->messageVersion.version) { @@ -1089,7 +1087,7 @@ ACTOR Future bufferedGetMore( ILogSystem::BufferedCursor* self, TaskPriori if(self->canDiscardPopped && self->poppedVersion > self->version().version) { TraceEvent(SevWarn, "DiscardingPoppedData", self->randomID).detail("Version", self->version().version).detail("Popped", self->poppedVersion); self->messageVersion = std::max(self->messageVersion, LogMessageVersion(self->poppedVersion)); - for(auto& cursor : self->cursors) { + for(auto cursor : self->cursors) { cursor->advanceTo(self->messageVersion); } self->messageIndex = self->messages.size(); @@ -1144,11 +1142,7 @@ const LogMessageVersion& ILogSystem::BufferedCursor::version() { } Version ILogSystem::BufferedCursor::getMinKnownCommittedVersion() { - Version res = 0; - for(auto& cursor : cursors) { - res = std::max(res, cursor->getMinKnownCommittedVersion()); - } - return res; + return minKnownCommittedVersion; } Version ILogSystem::BufferedCursor::popped() { diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index 10626eb241..b18de4eb7b 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -284,6 +284,7 @@ struct TLogData : NonCopyable { std::map toBePopped; // map of Tag->Version for all the pops // that came when ignorePopRequest was set Reference> degraded; + std::vector tempTagMessages; TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded, std::string folder) : dbgid(dbgid), 
instanceID(deterministicRandom()->randomUniqueID().first()), @@ -890,21 +891,18 @@ void commitMessages( TLogData *self, Reference logData, Version version int32_t messageLength, rawLength; uint16_t tagCount; uint32_t sub; - std::vector msgs; + self->tempTagMessages.clear(); while(!rd.empty()) { TagsAndMessage tagsAndMsg; rd.checkpoint(); rd >> messageLength >> sub >> tagCount; - tagsAndMsg.tags.resize(tagCount); - for(int i = 0; i < tagCount; i++) { - rd >> tagsAndMsg.tags[i]; - } + tagsAndMsg.tags = VectorRef((Tag*)rd.readBytes(tagCount*sizeof(Tag)), tagCount); rawLength = messageLength + sizeof(messageLength); rd.rewind(); tagsAndMsg.message = StringRef((uint8_t const*)rd.readBytes(rawLength), rawLength); - msgs.push_back(std::move(tagsAndMsg)); + self->tempTagMessages.push_back(std::move(tagsAndMsg)); } - commitMessages(self, logData, version, msgs); + commitMessages(self, logData, version, self->tempTagMessages); } Version poppedVersion( Reference self, Tag tag) { @@ -1241,6 +1239,7 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData, st self->queueCommitBegin = commitNumber; logData->queueCommittingVersion = ver; + g_network->setCurrentTask(TaskPriority::TLogCommitReply); Future c = self->persistentQueue->commit(); self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index a4c85f6ead..ec9fbca906 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -335,6 +335,7 @@ struct TLogData : NonCopyable { std::map toBePopped; // map of Tag->Version for all the pops // that came when ignorePopRequest was set Reference> degraded; + std::vector tempTagMessages; TLogData(UID dbgid, IKeyValueStore* persistentData, IDiskQueue * persistentQueue, Reference> dbInfo, Reference> degraded, std::string folder) : dbgid(dbgid), instanceID(deterministicRandom()->randomUniqueID().first()), @@ -1156,21 +1157,18 @@ void commitMessages( TLogData 
*self, Reference logData, Version version int32_t messageLength, rawLength; uint16_t tagCount; uint32_t sub; - std::vector msgs; + self->tempTagMessages.clear(); while(!rd.empty()) { TagsAndMessage tagsAndMsg; rd.checkpoint(); rd >> messageLength >> sub >> tagCount; - tagsAndMsg.tags.resize(tagCount); - for(int i = 0; i < tagCount; i++) { - rd >> tagsAndMsg.tags[i]; - } + tagsAndMsg.tags = VectorRef((Tag*)rd.readBytes(tagCount*sizeof(Tag)), tagCount); rawLength = messageLength + sizeof(messageLength); rd.rewind(); tagsAndMsg.message = StringRef((uint8_t const*)rd.readBytes(rawLength), rawLength); - msgs.push_back(std::move(tagsAndMsg)); + self->tempTagMessages.push_back(std::move(tagsAndMsg)); } - commitMessages(self, logData, version, msgs); + commitMessages(self, logData, version, self->tempTagMessages); } Version poppedVersion( Reference self, Tag tag) { @@ -1632,6 +1630,7 @@ ACTOR Future doQueueCommit( TLogData* self, Reference logData, st self->queueCommitBegin = commitNumber; logData->queueCommittingVersion = ver; + g_network->setCurrentTask(TaskPriority::TLogCommitReply); Future c = self->persistentQueue->commit(); self->diskQueueCommitBytes = 0; self->largeDiskQueueCommitBytes.set(false); diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index c052552d80..02e6038d92 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -1269,7 +1269,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted::max(); } - virtual void getPushLocations(std::vector const& tags, std::vector& locations, bool allLocations) { + virtual void getPushLocations(VectorRef tags, std::vector& locations, bool allLocations) { int locationOffset = 0; for(auto& log : tLogs) { if(log->isLocal && log->logServers.size()) { @@ -1906,7 +1906,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted locations; for( Tag tag : localTags ) { locations.clear(); - 
logSet->getPushLocations( vector(1, tag), locations, 0 ); + logSet->getPushLocations( VectorRef(&tag, 1), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } @@ -1922,7 +1922,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSet->getPushLocations( {pushTag}, locations, 0 ); + logSet->getPushLocations( VectorRef(&pushTag, 1), locations, 0 ); for(int loc : locations) remoteTLogReqs[ loc ].recoverTags.push_back( tag ); } @@ -2116,7 +2116,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCounted locations; for( Tag tag : localTags ) { locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, tag), locations, 0 ); + logSystem->tLogs[0]->getPushLocations( VectorRef(&tag, 1), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } @@ -2130,7 +2130,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSystem->tLogs[0]->getPushLocations( vector(1, pushTag), locations, 0 ); + logSystem->tLogs[0]->getPushLocations( VectorRef(&pushTag, 1), locations, 0 ); for(int loc : locations) reqs[ loc ].recoverTags.push_back( tag ); } @@ -2182,7 +2182,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedlogRouterTags); locations.clear(); - logSystem->tLogs[1]->getPushLocations( {pushLocation}, locations, 0 ); + logSystem->tLogs[1]->getPushLocations( VectorRef(&pushLocation,1), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } @@ -2192,7 +2192,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedtxsTags); locations.clear(); - logSystem->tLogs[1]->getPushLocations( {pushTag}, locations, 0 ); + logSystem->tLogs[1]->getPushLocations( VectorRef(&pushTag,1), locations, 0 ); for(int loc : locations) sreqs[ loc ].recoverTags.push_back( tag ); } From 86560fe727294ec05b5de4e669fc35beb2162b67 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 5 
Nov 2019 18:22:25 -0800 Subject: [PATCH 145/184] fix: tempTags was not used correctly --- fdbserver/LogSystemPeekCursor.actor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 081468e575..2759a51de8 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -937,17 +937,17 @@ void ILogSystem::BufferedCursor::combineMessages() { return; } - std::vector tempTags; - tempTags.push_back(messages[messageIndex].tags[0]); + tags.clear(); + tags.push_back(messages[messageIndex].tags[0]); for(int i = messageIndex + 1; i < messages.size() && messages[messageIndex].version == messages[i].version; i++) { - tempTags.push_back(messages[i].tags[0]); + tags.push_back(messages[i].tags[0]); messageIndex = i; } auto& msg = messages[messageIndex]; BinaryWriter messageWriter(Unversioned()); - messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tempTags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); + messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); msg.tags = VectorRef((Tag*)(((uint8_t*)messageWriter.getData())+messageWriter.getLength()), tags.size()); - for(auto t : tempTags) { + for(auto t : tags) { messageWriter << t; } messageWriter.serializeBytes(msg.message); From 1c873591be98acfa9fd04fa777e8937198ce863c Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 5 Nov 2019 18:32:15 -0800 Subject: [PATCH 146/184] fixed a compiler error --- fdbserver/LogSystem.h | 1 - fdbserver/LogSystemPeekCursor.actor.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fdbserver/LogSystem.h b/fdbserver/LogSystem.h index a65ccf56f3..8a91172dd7 100644 --- a/fdbserver/LogSystem.h +++ b/fdbserver/LogSystem.h @@ -592,7 +592,6 @@ struct ILogSystem { //FIXME: collectTags is 
needed to support upgrades from 5.X to 6.0. Remove this code when we no longer support that upgrade. bool collectTags; - VectorRef tags; void combineMessages(); BufferedCursor( std::vector> cursors, Version begin, Version end, bool withTags, bool collectTags, bool canDiscardPopped ); diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 2759a51de8..7fddb4dcff 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -937,7 +937,7 @@ void ILogSystem::BufferedCursor::combineMessages() { return; } - tags.clear(); + std::vector tags; tags.push_back(messages[messageIndex].tags[0]); for(int i = messageIndex + 1; i < messages.size() && messages[messageIndex].version == messages[i].version; i++) { tags.push_back(messages[i].tags[0]); From dbc5a2393c34d0d436096bb2324d14df70491848 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Tue, 5 Nov 2019 18:44:30 -0800 Subject: [PATCH 147/184] combineMessages still did not serialize tags correctly --- fdbserver/LogSystemPeekCursor.actor.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fdbserver/LogSystemPeekCursor.actor.cpp b/fdbserver/LogSystemPeekCursor.actor.cpp index 7fddb4dcff..250681956f 100644 --- a/fdbserver/LogSystemPeekCursor.actor.cpp +++ b/fdbserver/LogSystemPeekCursor.actor.cpp @@ -946,7 +946,6 @@ void ILogSystem::BufferedCursor::combineMessages() { auto& msg = messages[messageIndex]; BinaryWriter messageWriter(Unversioned()); messageWriter << uint32_t(msg.message.size() + sizeof(uint32_t) + sizeof(uint16_t) + tags.size()*sizeof(Tag)) << msg.version.sub << uint16_t(tags.size()); - msg.tags = VectorRef((Tag*)(((uint8_t*)messageWriter.getData())+messageWriter.getLength()), tags.size()); for(auto t : tags) { messageWriter << t; } @@ -954,6 +953,10 @@ void ILogSystem::BufferedCursor::combineMessages() { Standalone val = messageWriter.toValue(); msg.arena = val.arena(); msg.message = val; + msg.tags = 
VectorRef(); + for(auto t : tags) { + msg.tags.push_back(msg.arena, t); + } } Reference ILogSystem::BufferedCursor::cloneNoMore() { From 0ccded1929e06a101ff6314b1361c187fc139f66 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 5 Nov 2019 11:36:28 -0800 Subject: [PATCH 148/184] AtomicOps:Resolve review comments --- fdbclient/RestoreWorkerInterface.actor.h | 2 +- fdbserver/RestoreLoader.actor.cpp | 2 +- fdbserver/workloads/AtomicOps.actor.cpp | 89 ++++++++++++++---------- 3 files changed, 54 insertions(+), 39 deletions(-) diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index e2f7637eb5..cbc9500e1c 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -360,7 +360,7 @@ struct RestoreSendMutationVectorVersionedRequest : TimedRequest { std::string toString() { std::stringstream ss; - ss << "fileIndex" << fileIndex << " prevVersion:" << prevVersion << " version:" << version + ss << "fileIndex:" << fileIndex << " prevVersion:" << prevVersion << " version:" << version << " isRangeFile:" << isRangeFile << " mutations.size:" << mutations.size(); return ss.str(); } diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index e2369b8da5..f538de5452 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -134,7 +134,7 @@ void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVector if (self->rangeToApplier.empty()) { self->rangeToApplier = req.rangeToApplier; } else { - ASSERT_WE_THINK(self->rangeToApplier == req.rangeToApplier); + ASSERT(self->rangeToApplier == req.rangeToApplier); } req.reply.send(RestoreCommonReply(self->id())); } diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index 1f2f0c9fd2..cabf0d07d7 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -51,7 +51,7 @@ struct AtomicOpsWorkload : 
TestWorkload { lbsum = 0; ubsum = 0; - int64_t randNum = sharedRandomNumber / 10; + int64_t randNum = sharedRandomNumber / 10; if(opType == -1) opType = randNum % 8; @@ -123,7 +123,6 @@ struct AtomicOpsWorkload : TestWorkload { virtual void getMetrics( vector& m ) { } - // Key logKey( int group ) { return StringRef(format("log%08x%08x%08x",group,clientId,opNum++));} std::pair logDebugKey(int group) { Key logKey(format("log%08x%08x%08x", group, clientId, opNum)); Key debugKey(format("debug%08x%08x%08x", group, clientId, opNum)); @@ -207,47 +206,62 @@ struct AtomicOpsWorkload : TestWorkload { } ACTOR Future dumpLogKV(Database cx, int g) { - ReadYourWritesTransaction tr(cx); - Key begin(format("log%08x", g)); - Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - uint64_t sum = 0; - for (auto& kv : log) { - uint64_t intValue = 0; - memcpy(&intValue, kv.value.begin(), kv.value.size()); - sum += intValue; - TraceEvent("AtomicOpLog") - .detail("Key", kv.key) - .detail("Val", kv.value) - .detail("IntValue", intValue) - .detail("CurSum", sum); + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("log%08x", g)); + Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + uint64_t sum = 0; + for (auto& kv : log) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpLog") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntValue", intValue) + .detail("CurSum", sum); + } + } catch( Error &e ) { + TraceEvent("DumpLogKVError").detail("Error", e.what()); + wait( tr.onError(e) ); } return Void(); } ACTOR Future dumpDebugKV(Database cx, int g) { - ReadYourWritesTransaction tr(cx); - Key begin(format("debug%08x", g)); - Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - for (auto& kv : log) { - TraceEvent("AtomicOpDebug").detail("Key", kv.key).detail("Val", 
kv.value); + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("debug%08x", g)); + Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + for (auto& kv : log) { + TraceEvent("AtomicOpDebug").detail("Key", kv.key).detail("Val", kv.value); + } + } catch( Error &e ) { + TraceEvent("DumpDebugKVError").detail("Error", e.what()); + wait( tr.onError(e) ); } return Void(); } ACTOR Future dumpOpsKV(Database cx, int g) { - ReadYourWritesTransaction tr(cx); - Key begin(format("ops%08x", g)); - Standalone ops = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - uint64_t sum = 0; - for (auto& kv : ops) { - uint64_t intValue = 0; - memcpy(&intValue, kv.value.begin(), kv.value.size()); - sum += intValue; - TraceEvent("AtomicOpOps") - .detail("Key", kv.key) - .detail("Val", kv.value) - .detail("IntVal", intValue) - .detail("CurSum", sum); + try { + state ReadYourWritesTransaction tr(cx); + Key begin(format("ops%08x", g)); + Standalone ops = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + uint64_t sum = 0; + for (auto& kv : ops) { + uint64_t intValue = 0; + memcpy(&intValue, kv.value.begin(), kv.value.size()); + sum += intValue; + TraceEvent("AtomicOpOps") + .detail("Key", kv.key) + .detail("Val", kv.value) + .detail("IntVal", intValue) + .detail("CurSum", sum); + } + } catch( Error &e ) { + TraceEvent("DumpOpsKVError").detail("Error", e.what()); + wait( tr.onError(e) ); } return Void(); } @@ -259,6 +273,7 @@ struct AtomicOpsWorkload : TestWorkload { Key begin(format("debug%08x", g)); Standalone debuglog = wait(tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + ASSERT(!debuglog.more); for (auto& kv : debuglog) { records[kv.value] = kv.key; } @@ -268,6 +283,7 @@ struct AtomicOpsWorkload : TestWorkload { state std::map logVal; // debugKey, log's value Key begin(format("log%08x", g)); Standalone log = wait(tr2.getRange(KeyRangeRef(begin, 
strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + ASSERT(!log.more); for (auto& kv : log) { uint64_t intValue = 0; memcpy(&intValue, kv.value.begin(), kv.value.size()); @@ -279,6 +295,7 @@ struct AtomicOpsWorkload : TestWorkload { state std::map opsVal; // ops key, ops value Key begin(format("ops%08x", g)); Standalone ops = wait(tr3.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + ASSERT(!ops.more); // Validate if ops' key value is consistent with logs' key value for (auto& kv : ops) { bool inRecord = records.find(kv.key) != records.end(); @@ -303,11 +320,9 @@ struct AtomicOpsWorkload : TestWorkload { // Validate if there is any ops key missing for (auto& kv : records) { - uint64_t intValue = opsVal[kv.first]; - if (intValue <= 0) { + if (opsVal.find(kv.first) == opsVal.end()) { TraceEvent(SevError, "MissingOpsKey2") .detail("OpsKey", kv.first) - .detail("OpsVal", intValue) .detail("DebugKey", kv.second); } } @@ -376,7 +391,7 @@ struct AtomicOpsWorkload : TestWorkload { .detail("OpsResultStr", printable(opsResultStr)) .detail("Size", opsResultStr.size()) .detail("LowerBoundSum", self->lbsum) - .detail("UperBoundSum", self->ubsum); + .detail("UpperBoundSum", self->ubsum); wait(self->dumpLogKV(cx, g)); wait(self->dumpDebugKV(cx, g)); wait(self->dumpOpsKV(cx, g)); From 5fbe399bafc8275e52a5ed1aa9e0b842703d853a Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 6 Nov 2019 11:59:40 -0800 Subject: [PATCH 149/184] AtomicOp: Resolve review comments; no functional change. 1) Trace Txn commit_unknown_results in workload; 2) Add SevError trace events when txn reads hit limits since we do not handle this situation in dumping the debug info. 
--- fdbserver/workloads/AtomicOps.actor.cpp | 42 ++++++++++++++----------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index cabf0d07d7..d97e721229 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -33,7 +33,7 @@ struct AtomicOpsWorkload : TestWorkload { double testDuration, transactionsPerSecond; vector> clients; - uint64_t lbsum, ubsum; // Tell if setup txn fails when opType = AddValue + uint64_t logsum; // The sum of operations when opType = AddValue AtomicOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), opNum(0) @@ -48,8 +48,7 @@ struct AtomicOpsWorkload : TestWorkload { apiVersion500 = ((sharedRandomNumber % 10) == 0); TraceEvent("AtomicOpsApiVersion500").detail("ApiVersion500", apiVersion500); - lbsum = 0; - ubsum = 0; + logsum = 0; int64_t randNum = sharedRandomNumber / 10; if(opType == -1) @@ -183,23 +182,25 @@ struct AtomicOpsWorkload : TestWorkload { int group = deterministicRandom()->randomInt(0,100); state uint64_t intValue = deterministicRandom()->randomInt(0, 10000000); Key val = StringRef((const uint8_t*) &intValue, sizeof(intValue)); - std::pair logDebugKey = self->logDebugKey(group); + state std::pair logDebugKey = self->logDebugKey(group); int nodeIndex = deterministicRandom()->randomInt(0, self->nodeCount / 100); - Key opsKey(format("ops%08x%08x", group, nodeIndex)); + state Key opsKey(format("ops%08x%08x", group, nodeIndex)); tr.set(logDebugKey.first, val); // set log key tr.set(logDebugKey.second, opsKey); // set debug key; one opsKey can have multiple logs key tr.atomicOp(opsKey, val, self->opType); wait( tr.commit() ); if (self->opType == MutationRef::AddValue) { - self->lbsum += intValue; - self->ubsum += intValue; + self->logsum += intValue; } break; } catch( Error &e ) { - wait( tr.onError(e) ); - if (self->opType == MutationRef::AddValue) { - self->ubsum += intValue; + if 
(e.code() == 1021) { + TraceEvent(SevWarnAlways, "TxnCommitUnknownResult") + .detail("Value", intValue) + .detail("LogKey", logDebugKey.first) + .detail("OpsKey", opsKey); } + wait(tr.onError(e)); } } } @@ -273,7 +274,10 @@ struct AtomicOpsWorkload : TestWorkload { Key begin(format("debug%08x", g)); Standalone debuglog = wait(tr1.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - ASSERT(!debuglog.more); + if (debuglog.more) { + TraceEvent(SevError, "DebugLogHitTxnLimits").detail("Result", debuglog.toString()); + return Void(); + } for (auto& kv : debuglog) { records[kv.value] = kv.key; } @@ -283,7 +287,10 @@ struct AtomicOpsWorkload : TestWorkload { state std::map logVal; // debugKey, log's value Key begin(format("log%08x", g)); Standalone log = wait(tr2.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - ASSERT(!log.more); + if (log.more) { + TraceEvent(SevError, "LogHitTxnLimits").detail("Result", log.toString()); + return Void(); + } for (auto& kv : log) { uint64_t intValue = 0; memcpy(&intValue, kv.value.begin(), kv.value.size()); @@ -295,7 +302,10 @@ struct AtomicOpsWorkload : TestWorkload { state std::map opsVal; // ops key, ops value Key begin(format("ops%08x", g)); Standalone ops = wait(tr3.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - ASSERT(!ops.more); + if (ops.more) { + TraceEvent(SevError, "OpsHitTxnLimits").detail("Result", ops.toString()); + return Void(); + } // Validate if ops' key value is consistent with logs' key value for (auto& kv : ops) { bool inRecord = records.find(kv.key) != records.end(); @@ -305,9 +315,6 @@ struct AtomicOpsWorkload : TestWorkload { if (!inRecord) { TraceEvent(SevError, "MissingLogKey").detail("OpsKey", kv.key); } - if (inRecord && intValue == 0) { - TraceEvent(SevError, "MissingOpsKey1").detail("OpsKey", kv.key).detail("DebugKey", records[kv.key]); - } if (inRecord && (self->actorCount == 1 && intValue != logVal[records[kv.key]])) { // When multiple 
actors exist, 1 opsKey can have multiple log keys TraceEvent(SevError, "InconsistentOpsKeyValue") @@ -390,8 +397,7 @@ struct AtomicOpsWorkload : TestWorkload { .detail("OpResult", opsResult) .detail("OpsResultStr", printable(opsResultStr)) .detail("Size", opsResultStr.size()) - .detail("LowerBoundSum", self->lbsum) - .detail("UpperBoundSum", self->ubsum); + .detail("Sum", self->logsum); wait(self->dumpLogKV(cx, g)); wait(self->dumpDebugKV(cx, g)); wait(self->dumpOpsKV(cx, g)); From 7b8f1df3b6538f3595865e1290885ea33be72933 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 6 Nov 2019 13:09:57 -0800 Subject: [PATCH 150/184] update versions target to 6.2.9 --- versions.target | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions.target b/versions.target index 99a6f62e05..b1813aefef 100644 --- a/versions.target +++ b/versions.target @@ -1,7 +1,7 @@ - 6.2.8 + 6.2.9 6.2 From e660149042d0196b600ce63b4dc91bf64d427885 Mon Sep 17 00:00:00 2001 From: Evan Tschannen Date: Wed, 6 Nov 2019 13:09:57 -0800 Subject: [PATCH 151/184] update installer WIX GUID following release --- packaging/msi/FDBInstaller.wxs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/msi/FDBInstaller.wxs b/packaging/msi/FDBInstaller.wxs index d58ccfa3e7..01bc76c575 100644 --- a/packaging/msi/FDBInstaller.wxs +++ b/packaging/msi/FDBInstaller.wxs @@ -32,7 +32,7 @@ Date: Wed, 6 Nov 2019 13:12:30 -0800 Subject: [PATCH 152/184] updated cmake for 6.2.9 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 311b32c3e4..b5281942e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ # limitations under the License. cmake_minimum_required(VERSION 3.12) project(foundationdb - VERSION 6.2.8 + VERSION 6.2.9 DESCRIPTION "FoundationDB is a scalable, fault-tolerant, ordered key-value store with full ACID transactions." 
HOMEPAGE_URL "http://www.foundationdb.org/" LANGUAGES C CXX ASM) From 5d00d93665effe9ff43a3090bb2c2f8391daec8f Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 7 Nov 2019 01:46:33 -0800 Subject: [PATCH 153/184] Fixed errors found by valgrind involving incorrect page memory lifetimes for IO operations plus some false positives for partially used pages. --- fdbserver/VersionedBTree.actor.cpp | 216 +++++++++++++++++------------ 1 file changed, 128 insertions(+), 88 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index e339de8873..79bb5d944c 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -680,6 +680,7 @@ public: // Create a fast-allocated page with size total bytes INCLUDING checksum FastAllocatedPage(int size, int bufferSize) : logicalSize(size), bufferSize(bufferSize) { buffer = (uint8_t *)allocateFast(bufferSize); + // Mark any unused page portion defined VALGRIND_MAKE_MEM_DEFINED(buffer + logicalSize, bufferSize - logicalSize); }; @@ -733,10 +734,17 @@ private: // Holds an index of recently used objects. // ObjectType must have the method -// bool evictable() const; +// bool evictable() const; // return true if the entry can be evicted +// Future onEvictable() const; // ready when entry can be evicted // indicating if it is safe to evict. template class ObjectCache { + + struct Entry : public boost::intrusive::list_base_hook<> { + IndexType index; + ObjectType item; + }; + public: ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit) { } @@ -783,13 +791,34 @@ public: return entry.item; } - // Clears the cache and calls destroy() on each ObjectType - void destroy() { - evictionOrder.clear(); - for(auto &entry : cache) { - entry.second.item.destroy(); + // Clears the cache, saving the entries, and then waits for eachWaits for each item to be evictable and evicts it. 
+ // The cache should not be Evicts all evictable entries + ACTOR static Future clear_impl(ObjectCache *self) { + state std::unordered_map cache; + state boost::intrusive::list evictionOrder; + + // Swap cache contents to local state vars + cache.swap(self->cache); + evictionOrder.swap(self->evictionOrder); + + state typename boost::intrusive::list::iterator i = evictionOrder.begin(); + state typename boost::intrusive::list::iterator iEnd = evictionOrder.begin(); + + while(i != iEnd) { + if(!i->item.evictable()) { + wait(i->item.onEvictable()); + } + ++i; } + + evictionOrder.clear(); cache.clear(); + + return Void(); + } + + Future clear() { + return clear_impl(this); } int count() const { @@ -798,16 +827,12 @@ public: } private: - struct Entry : public boost::intrusive::list_base_hook<> { - IndexType index; - ObjectType item; - }; - int sizeLimit; // TODO: Use boost intrusive unordered set instead, with a comparator that only considers entry.index std::unordered_map cache; boost::intrusive::list evictionOrder; + }; ACTOR template Future forwardError(Future f, Promise target) { @@ -1090,37 +1115,52 @@ public: } Future newPageID() override { - return forwardError(newPageID_impl(this), errorPromise); + return newPageID_impl(this); + } + + Future writePhysicalPage(PhysicalPageID pageID, Reference page, bool header = false) { + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? "writePhysicalHeader" : "writePhysical"), toString(pageID).c_str(), page->begin()); + + VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); + ((Page *)page.getPtr())->updateChecksum(pageID); + + // Note: Not using forwardError here so a write error won't be discovered until commit time. + int blockSize = header ? smallestPhysicalBlock : physicalPageSize; + Future f = holdWhile(page, map(pageFile->write(page->begin(), blockSize, (int64_t)pageID * blockSize), [=](Void) { + debug_printf("DWALPager(%s) op=%s %s ptr=%p\n", filename.c_str(), (header ? 
"writePhysicalHeaderComplete" : "writePhysicalComplete"), toString(pageID).c_str(), page->begin()); + return Void(); + })); + operations.add(f); + return f; } Future writeHeaderPage(PhysicalPageID pageID, Reference page) { - debug_printf("DWALPager(%s) header op=write %s\n", filename.c_str(), toString(pageID).c_str()); - ((Page *)page.getPtr())->updateChecksum(pageID); - return holdWhile(page, pageFile->write(page->begin(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); - } - - Future writePhysicalPage(PhysicalPageID pageID, Reference page) { - debug_printf("DWALPager(%s) op=write %s\n", filename.c_str(), toString(pageID).c_str()); - ((Page *)page.getPtr())->updateChecksum(pageID); - return holdWhile(page, pageFile->write(page->begin(), physicalPageSize, (int64_t)pageID * physicalPageSize)); + return writePhysicalPage(pageID, page, true); } void updatePage(LogicalPageID pageID, Reference data) override { // Get the cache entry for this page PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places - // the new content in the cache entry when the write is launched, not when it is completed. - // Any waiting readers should not see this write (though this might change) - if(cacheEntry.reading()) { + // the new content into readFuture when the write is launched, not when it is completed. + // Read/write ordering is being enforced waiting readers will not see the new write. 
This + // is necessary for remap erasure to work correctly since the oldest version of a page, located + // at the original page ID, could have a pending read when that version is expired and the write + // of the next newest version over top of the original page begins. + if(!cacheEntry.initialized()) { + cacheEntry.writeFuture = writePhysicalPage(pageID, data); + } + else if(cacheEntry.reading()) { // Wait for the read to finish, then start the write. cacheEntry.writeFuture = map(success(cacheEntry.readFuture), [=](Void) { writePhysicalPage(pageID, data); return Void(); }); } - // If the page is being written, wait for this write before issuing the new write + // If the page is being written, wait for this write before issuing the new write to ensure the + // writes happen in the correct order else if(cacheEntry.writing()) { cacheEntry.writeFuture = map(cacheEntry.writeFuture, [=](Void) { writePhysicalPage(pageID, data); @@ -1131,9 +1171,6 @@ public: cacheEntry.writeFuture = writePhysicalPage(pageID, data); } - cacheEntry.writeFuture = forwardError(cacheEntry.writeFuture, errorPromise); - operations.add(cacheEntry.writeFuture); - // Always update the page contents immediately regardless of what happened above. cacheEntry.readFuture = data; } @@ -1175,46 +1212,44 @@ public: } }; - // Header pages use a page size of smallestPhysicalBlock - // If the user chosen physical page size is larger, then there will be a gap of unused space after - // between the end of page 1 and the start of page 2. - ACTOR static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { + // Read a physical page from the page file. Note that header pages use a page size of smallestPhysicalBlock + // If the user chosen physical page size is larger, then there will be a gap of unused space after the header pages + // and before the user-chosen sized pages. 
+ ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID, bool header = false) { if(g_network->getCurrentTask() > TaskPriority::DiskRead) { wait(delay(0, TaskPriority::DiskRead)); } - state Reference page(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)); - int readBytes = wait(self->pageFile->read(page->mutate(), smallestPhysicalBlock, (int64_t)pageID * smallestPhysicalBlock)); - debug_printf("DWALPager(%s) header op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); - ASSERT(readBytes == smallestPhysicalBlock); + state Reference page = header ? Reference(new FastAllocatedPage(smallestPhysicalBlock, smallestPhysicalBlock)) : self->newPageBuffer(); + debug_printf("DWALPager(%s) op=readPhysicalStart %s ptr=%p\n", self->filename.c_str(), toString(pageID).c_str(), page->begin()); + + int blockSize = header ? smallestPhysicalBlock : self->physicalPageSize; + // TODO: Could a dispatched read try to write to page after it has been destroyed if this actor is cancelled? 
+ int readBytes = wait(self->pageFile->read(page->mutate(), blockSize, (int64_t)pageID * blockSize)); + debug_printf("DWALPager(%s) op=readPhysicalComplete %s ptr=%p bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), page->begin(), readBytes); + + // Header reads are checked explicitly during recovery + if(!header) { + Page *p = (Page *)page.getPtr(); + if(!p->verifyChecksum(pageID)) { + debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); + Error e = checksum_failed(); + TraceEvent(SevError, "DWALPagerChecksumFailed") + .detail("Filename", self->filename.c_str()) + .detail("PageID", pageID) + .detail("PageSize", self->physicalPageSize) + .detail("Offset", pageID * self->physicalPageSize) + .detail("CalculatedChecksum", p->calculateChecksum(pageID)) + .detail("ChecksumInPage", p->getChecksum()) + .error(e); + throw e; + } + } return page; } - ACTOR static Future> readPhysicalPage(DWALPager *self, PhysicalPageID pageID) { - if(g_network->getCurrentTask() > TaskPriority::DiskRead) { - wait(delay(0, TaskPriority::DiskRead)); - } - - state Reference page = self->newPageBuffer(); - debug_printf("DWALPager(%s) op=read_physical_start %s\n", self->filename.c_str(), toString(pageID).c_str()); - int readBytes = wait(self->pageFile->read(page->mutate(), self->physicalPageSize, (int64_t)pageID * self->physicalPageSize)); - debug_printf("DWALPager(%s) op=read_complete %s bytes=%d\n", self->filename.c_str(), toString(pageID).c_str(), readBytes); - ASSERT(readBytes == self->physicalPageSize); - Page *p = (Page *)page.getPtr(); - if(!p->verifyChecksum(pageID)) { - debug_printf("DWALPager(%s) checksum failed for %s\n", self->filename.c_str(), toString(pageID).c_str()); - Error e = checksum_failed(); - TraceEvent(SevError, "DWALPagerChecksumFailed") - .detail("Filename", self->filename.c_str()) - .detail("PageID", pageID) - .detail("PageSize", self->physicalPageSize) - .detail("Offset", pageID * self->physicalPageSize) 
- .detail("CalculatedChecksum", p->calculateChecksum(pageID)) - .detail("ChecksumInPage", p->getChecksum()) - .error(e); - throw e; - } - return page; + static Future> readHeaderPage(DWALPager *self, PhysicalPageID pageID) { + return readPhysicalPage(self, pageID, true); } // Reads the most recent version of pageID either committed or written using updatePage() @@ -1222,23 +1257,24 @@ public: // Use cached page if present, without triggering a cache hit. // Otherwise, read the page and return it but don't add it to the cache if(!cacheable) { - debug_printf("DWALPager(%s) op=read_nocache %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=readUncached %s\n", filename.c_str(), toString(pageID).c_str()); PageCacheEntry *pCacheEntry = pageCache.getIfExists(pageID); if(pCacheEntry != nullptr) { - debug_printf("DWALPager(%s) op=read_nocache_hit %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=readUncachedHit %s\n", filename.c_str(), toString(pageID).c_str()); return pCacheEntry->readFuture; } - debug_printf("DWALPager(%s) op=read_nocache_miss %s\n", filename.c_str(), toString(pageID).c_str()); + debug_printf("DWALPager(%s) op=readUncachedMiss %s\n", filename.c_str(), toString(pageID).c_str()); return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.readFuture.isValid(), cacheEntry.reading(), cacheEntry.writing()); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); - if(!cacheEntry.readFuture.isValid()) { + if(!cacheEntry.initialized()) { debug_printf("DWALPager(%s) issuing actual read of %s\n", 
filename.c_str(), toString(pageID).c_str()); cacheEntry.readFuture = readPhysicalPage(this, (PhysicalPageID)pageID); + cacheEntry.writeFuture = Void(); } cacheEntry.readFuture = forwardError(cacheEntry.readFuture, errorPromise); @@ -1310,10 +1346,6 @@ public: // Read the data from the page that the original was mapped to Reference data = wait(self->readPage(p.get().newPageID, false)); - // Some page reads will mark the unused portion of the page as undefined to catch bugs with valgrind. - // We are blindly copying the page data to a new location regardless of its format so mark all of it defined. - VALGRIND_MAKE_MEM_DEFINED(data->begin(), data->size()); - // Write the data to the original page so it can be read using its original pageID self->updatePage(p.get().originalPageID, data); @@ -1364,7 +1396,7 @@ public: debug_printf("DWALPager(%s) commit begin\n", self->filename.c_str()); // Write old committed header to Page 1 - self->operations.add(self->writeHeaderPage(1, self->lastCommittedHeaderPage)); + self->writeHeaderPage(1, self->lastCommittedHeaderPage); // Trigger the remap eraser to stop and then wait for it. 
self->remapUndoStop = true; @@ -1432,21 +1464,30 @@ public: } ACTOR void shutdown(DWALPager *self, bool dispose) { + debug_printf("DWALPager(%s) shutdown cancel recovery\n", self->filename.c_str()); self->recoverFuture.cancel(); + debug_printf("DWALPager(%s) shutdown cancel commit\n", self->filename.c_str()); self->commitFuture.cancel(); + debug_printf("DWALPager(%s) shutdown cancel remap\n", self->filename.c_str()); self->remapUndoFuture.cancel(); if(self->errorPromise.canBeSet()) { + debug_printf("DWALPager(%s) shutdown sending error\n", self->filename.c_str()); self->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress } - self->operations.clear(); - // Destroy the cache, cancelling reads and writes in progress - self->pageCache.destroy(); + // Must wait for pending operations to complete, canceling them can cause a crash because the underlying + // operations may be uncancellable and depend on memory from calling scope's page reference + debug_printf("DWALPager(%s) shutdown wait for operations\n", self->filename.c_str()); + wait(self->operations.signal()); + + debug_printf("DWALPager(%s) shutdown destroy page cache\n", self->filename.c_str()); + wait(self->pageCache.clear()); // Unreference the file and clear self->pageFile.clear(); if(dispose) { + debug_printf("DWALPager(%s) shutdown deleting file\n", self->filename.c_str()); wait(IAsyncFileSystem::filesystem()->incrementalDeleteFile(self->filename, true)); } @@ -1557,12 +1598,16 @@ private: Future> readFuture; Future writeFuture; + bool initialized() const { + return readFuture.isValid(); + } + bool reading() const { - return readFuture.isValid() && !readFuture.isReady(); + return !readFuture.isReady(); } bool writing() const { - return writeFuture.isValid() && !writeFuture.isReady(); + return !writeFuture.isReady(); } bool evictable() const { @@ -1570,9 +1615,8 @@ private: return !reading() && !writing(); } - void destroy() { - readFuture.cancel(); - writeFuture.cancel(); 
+ Future onEvictable() const { + return ready(readFuture) && writeFuture; } }; @@ -2495,7 +2539,6 @@ static void makeEmptyRoot(Reference page) { btpage->kvBytes = 0; btpage->itemCount = 0; btpage->tree().build(nullptr, nullptr, nullptr, nullptr); - VALGRIND_MAKE_MEM_DEFINED(page->begin() + btpage->tree().size(), page->size() - btpage->tree().size()); } BTreePage::BinaryTree::Reader * getReader(Reference page) { @@ -3393,7 +3436,6 @@ private: if(blockCount == 1) { Reference page = self->m_pager->newPageBuffer(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); btPage = (BTreePage *)page->mutate(); pages.push_back(std::move(page)); } @@ -3401,7 +3443,6 @@ private: ASSERT(blockCount > 1); int size = blockSize * blockCount; btPage = (BTreePage *)new uint8_t[size]; - VALGRIND_MAKE_MEM_DEFINED(btPage, size); } btPage->formatVersion = BTreePage::FORMAT_VERSION; @@ -3419,10 +3460,11 @@ private: // Create chunked pages // TODO: Avoid copying page bytes, but this is not trivial due to how pager checksums are currently handled. 
if(blockCount != 1) { + // Mark the slack in the page buffer as defined + VALGRIND_MAKE_MEM_DEFINED(((uint8_t *)btPage) + written, (blockCount * blockSize) - written); const uint8_t *rptr = (const uint8_t *)btPage; for(int b = 0; b < blockCount; ++b) { Reference page = self->m_pager->newPageBuffer(); - VALGRIND_MAKE_MEM_DEFINED(page->begin(), page->size()); memcpy(page->mutate(), rptr, blockSize); rptr += blockSize; pages.push_back(std::move(page)); @@ -3590,9 +3632,6 @@ private: debug_printf("readPage() %s\n", pTreePage->toString(false, id, snapshot->getVersion(), lowerBound, upperBound).c_str()); } - // Nothing should attempt to read bytes in the page outside the BTreePage structure - VALGRIND_MAKE_MEM_UNDEFINED(page->begin() + pTreePage->size(), page->size() - pTreePage->size()); - return page; } @@ -4591,6 +4630,7 @@ private: wait(success(self->m_cur2.move(true))); } + self->m_kv.reset(); while(self->m_cur1.valid()) { if(self->m_cur1.presentAtVersion(self->m_version) && @@ -4616,7 +4656,6 @@ private: } - self->m_kv.reset(); debug_printf("Cursor::move(%d): Exit, end of db reached. Cursor = %s\n", fwd, self->toString().c_str()); return Void(); } @@ -5871,6 +5910,7 @@ TEST_CASE("!/redwood/correctness/btree") { debug_printf("Waiting for verification to complete.\n"); wait(verifyTask); + debug_printf("Closing btree\n"); Future closedFuture = btree->onClosed(); btree->close(); wait(closedFuture); From 2aa672cb5935a62f9ff9d401bfe97f1b6ebb66b0 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Thu, 7 Nov 2019 15:52:23 -0800 Subject: [PATCH 154/184] When bulk building pages, make most of them full. 
--- fdbserver/VersionedBTree.actor.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 79bb5d944c..22ca40784e 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -3416,12 +3416,17 @@ private: // If flush then write a page using records from start to i. It's guaranteed that pageUpperBound has been set above. if(flush) { - end = i == entries.size(); // i could have been moved above - + int remaining = entries.size() - i; + end = remaining == 0; // i could have been moved above int count = i - start; - // If not writing the final page, reduce entry count of page by a third - if(!end) { - i -= count / 3; + + // If + // - this is not the last page + // - the number of entries remaining after this page is less than the count of the current page + // - the page that would be written ends on a user key boundary + // Then adjust the current page item count to half the amount remaining after the start position. 
+ if(!end && remaining < count && entries[i - 1].key != entries[i].key) { + i = (start + entries.size()) / 2; pageUpperBound = entries[i].withoutValue(); } From 3de7ae5b0cbec772cd8eee11ff32d44747ab68c1 Mon Sep 17 00:00:00 2001 From: Jon Fu Date: Fri, 8 Nov 2019 09:39:25 -0800 Subject: [PATCH 155/184] Added size assertion in test workload --- fdbserver/workloads/RemoveServersSafely.actor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/fdbserver/workloads/RemoveServersSafely.actor.cpp b/fdbserver/workloads/RemoveServersSafely.actor.cpp index 7067aef81a..1900ddeeaa 100644 --- a/fdbserver/workloads/RemoveServersSafely.actor.cpp +++ b/fdbserver/workloads/RemoveServersSafely.actor.cpp @@ -461,6 +461,7 @@ struct RemoveServersSafelyWorkload : TestWorkload { .detail("Removing", removeServer->toString()); toKillMarkFailedArray.erase(removeServer); } + ASSERT(toKillMarkFailedArray.size() <= toKillArray.size()); auto removeServer = toKill.begin(); TraceEvent("RemoveAndKill", functionId) .detail("Step", "ReplaceNonFailedKillSet") From d0d036b3a7eca78e742da710186f95c496dddddc Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 8 Nov 2019 21:39:21 +0000 Subject: [PATCH 156/184] Add cmake command to package tests --- cmake/AddFdbTest.cmake | 49 ++++++++++++++++++++++++++++++++++++---- cmake/FlowCommands.cmake | 1 - tests/CMakeLists.txt | 1 + 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index b2f9b72ea7..d5e3f76956 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -130,9 +130,48 @@ function(add_fdb_test) ${VALGRIND_OPTION} ${ADD_FDB_TEST_TEST_FILES} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - get_filename_component(test_dir_full ${first_file} DIRECTORY) - if(NOT ${test_dir_full} STREQUAL "") - get_filename_component(test_dir ${test_dir_full} NAME) - set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}") - endif() + 
get_filename_component(test_dir_full ${first_file} DIRECTORY) + if(NOT ${test_dir_full} STREQUAL "") + get_filename_component(test_dir ${test_dir_full} NAME) + set_tests_properties(${test_name} PROPERTIES TIMEOUT ${this_test_timeout} LABELS "${test_dir}") + endif() + # set variables used for generating test packages + set(TEST_NAMES ${TEST_NAMES} ${test_name} PARENT_SCOPE) + set(TEST_FILES_${test_name} ${ADD_FDB_TEST_TEST_FILES} PARENT_SCOPE) + set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE) +endfunction() + +set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package") +set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package") + +function(create_test_package) + string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) + foreach(test IN LISTS TEST_NAMES) + if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND + (${test} MATCHES ${TEST_PACKAGE_INCLUDE}) AND + (NOT ${test} MATCHES ${TEST_PACKAGE_EXCLUDE})) + foreach(file IN LISTS TEST_FILES_${test}) + string(SUBSTRING ${file} ${base_length} -1 rel_out_file) + set(out_file ${CMAKE_BINARY_DIR}/packages/tests/${rel_out_file}) + list(APPEND out_files ${out_file}) + get_filename_component(test_dir ${out_file} DIRECTORY) + file(MAKE_DIRECTORY packages/tests/${test_dir}) + add_custom_command( + OUTPUT ${out_file} + DEPENDS ${file} + COMMAND ${CMAKE_COMMAND} -E copy ${file} ${out_file}) + endforeach() + endif() + endforeach() + set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness.tar.gz) + add_custom_command( + OUTPUT ${tar_file} + DEPENDS ${out_files} + COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver + ${out_files} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages + COMMENT "Package correctness archive" + ) + add_custom_target(package_tests DEPENDS ${tar_file}) + add_dependencies(package_tests strip_fdbserver) endfunction() diff --git a/cmake/FlowCommands.cmake 
b/cmake/FlowCommands.cmake index 8c9d964d3e..19df995f25 100644 --- a/cmake/FlowCommands.cmake +++ b/cmake/FlowCommands.cmake @@ -136,7 +136,6 @@ function(strip_debug_symbols target) add_custom_command(OUTPUT "${out_file}.debug" COMMAND objcopy --only-keep-debug $ "${out_file}.debug" && objcopy --add-gnu-debuglink="${out_file}.debug" ${out_file} - DEPENDS "${out_file}" COMMENT "Copy debug symbols to ${out_name}.debug") list(APPEND out_files "${out_file}.debug") endif() diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 23f5eee46a..b659d9fae9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -215,3 +215,4 @@ add_fdb_test(TEST_FILES status/single_process_too_many_config_params.txt) verify_testing() +create_test_package() From 01c26761521d9008b3ac175a2ef7a65b7fabe641 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 8 Nov 2019 22:15:38 +0000 Subject: [PATCH 157/184] Add feature to add external dependencies to test package --- cmake/AddFdbTest.cmake | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index d5e3f76956..305bd03b2a 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -143,6 +143,7 @@ endfunction() set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package") set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package") +set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. 
All files within each directory will be added to the test package") function(create_test_package) string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) @@ -163,12 +164,23 @@ function(create_test_package) endforeach() endif() endforeach() + foreach(dir IN LISTS TEST_PACKAGE_ADD_DIRECTORIES) + file(GLOB_RECURSE files ${dir}/*) + string(LENGTH ${dir} dir_len) + foreach(file IN LISTS files) + get_filename_component(src_dir ${file} DIRECTORY) + string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) + string(SUBSTRING ${file} ${dir_len} -1 out_file) + list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file}) + file(COPY ${file} DESTINATION ${CMAKE_BINARY_DIR}/packages/${dest_dir}) + endforeach() + endforeach() set(tar_file ${CMAKE_BINARY_DIR}/packages/correctness.tar.gz) add_custom_command( OUTPUT ${tar_file} DEPENDS ${out_files} COMMAND ${CMAKE_COMMAND} -E tar cfz ${tar_file} ${CMAKE_BINARY_DIR}/packages/bin/fdbserver - ${out_files} + ${out_files} ${external_files} WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/packages COMMENT "Package correctness archive" ) From 94791fbd12e5fddca022054aa7f719bfbe9b6601 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 8 Nov 2019 22:21:47 +0000 Subject: [PATCH 158/184] remove this functionality from Windows Windows file paths are a pain to work with. 
Currently I don't know of anyone who needs this feature on Windows - so I just remove it there --- cmake/AddFdbTest.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index 305bd03b2a..eaa54f96a5 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -141,11 +141,16 @@ function(add_fdb_test) set(TEST_TYPE_${test_name} ${test_type} PARENT_SCOPE) endfunction() -set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package") -set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package") -set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. All files within each directory will be added to the test package") +if(NOT WIN32) + set(TEST_PACKAGE_INCLUDE ".*" CACHE STRING "A regex of all tests that should be included in the test package") + set(TEST_PACKAGE_EXCLUDE ".^" CACHE STRING "A regex of all tests that shouldn't be added to the test package") + set(TEST_PACKAGE_ADD_DIRECTORIES "" CACHE STRING "A ;-separated list of directories. 
All files within each directory will be added to the test package") +endif() function(create_test_package) + if(WIN32) + return() + endif() string(LENGTH "${CMAKE_SOURCE_DIR}/tests/" base_length) foreach(test IN LISTS TEST_NAMES) if(("${TEST_TYPE_${test}}" STREQUAL "simulation") AND From 04e66fa0ec647b15f7030decb7d5102f10bd7a64 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Fri, 8 Nov 2019 14:32:42 -0800 Subject: [PATCH 159/184] AtomicOp:Trace when txn reads exceeds limit and add upper bound sum --- fdbserver/workloads/AtomicOps.actor.cpp | 26 +++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fdbserver/workloads/AtomicOps.actor.cpp b/fdbserver/workloads/AtomicOps.actor.cpp index d97e721229..33519ee333 100644 --- a/fdbserver/workloads/AtomicOps.actor.cpp +++ b/fdbserver/workloads/AtomicOps.actor.cpp @@ -33,7 +33,7 @@ struct AtomicOpsWorkload : TestWorkload { double testDuration, transactionsPerSecond; vector> clients; - uint64_t logsum; // The sum of operations when opType = AddValue + uint64_t lbsum, ubsum; // The lower bound and upper bound sum of operations when opType = AddValue AtomicOpsWorkload(WorkloadContext const& wcx) : TestWorkload(wcx), opNum(0) @@ -48,7 +48,8 @@ struct AtomicOpsWorkload : TestWorkload { apiVersion500 = ((sharedRandomNumber % 10) == 0); TraceEvent("AtomicOpsApiVersion500").detail("ApiVersion500", apiVersion500); - logsum = 0; + lbsum = 0; + ubsum = 0; int64_t randNum = sharedRandomNumber / 10; if(opType == -1) @@ -190,11 +191,13 @@ struct AtomicOpsWorkload : TestWorkload { tr.atomicOp(opsKey, val, self->opType); wait( tr.commit() ); if (self->opType == MutationRef::AddValue) { - self->logsum += intValue; + self->lbsum += intValue; + self->ubsum += intValue; } break; } catch( Error &e ) { if (e.code() == 1021) { + self->ubsum += intValue; TraceEvent(SevWarnAlways, "TxnCommitUnknownResult") .detail("Value", intValue) .detail("LogKey", logDebugKey.first) @@ -211,6 +214,9 @@ struct AtomicOpsWorkload : 
TestWorkload { state ReadYourWritesTransaction tr(cx); Key begin(format("log%08x", g)); Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (log.more) { + TraceEvent(SevError, "LogHitTxnLimits").detail("Result", log.toString()); + } uint64_t sum = 0; for (auto& kv : log) { uint64_t intValue = 0; @@ -233,8 +239,12 @@ struct AtomicOpsWorkload : TestWorkload { try { state ReadYourWritesTransaction tr(cx); Key begin(format("debug%08x", g)); - Standalone log = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); - for (auto& kv : log) { + Standalone debuglog = + wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (debuglog.more) { + TraceEvent(SevError, "DebugLogHitTxnLimits").detail("Result", debuglog.toString()); + } + for (auto& kv : debuglog) { TraceEvent("AtomicOpDebug").detail("Key", kv.key).detail("Val", kv.value); } } catch( Error &e ) { @@ -249,6 +259,9 @@ struct AtomicOpsWorkload : TestWorkload { state ReadYourWritesTransaction tr(cx); Key begin(format("ops%08x", g)); Standalone ops = wait(tr.getRange(KeyRangeRef(begin, strinc(begin)), CLIENT_KNOBS->TOO_MANY)); + if (ops.more) { + TraceEvent(SevError, "OpsHitTxnLimits").detail("Result", ops.toString()); + } uint64_t sum = 0; for (auto& kv : ops) { uint64_t intValue = 0; @@ -397,7 +410,8 @@ struct AtomicOpsWorkload : TestWorkload { .detail("OpResult", opsResult) .detail("OpsResultStr", printable(opsResultStr)) .detail("Size", opsResultStr.size()) - .detail("Sum", self->logsum); + .detail("LowerBoundSum", self->lbsum) + .detail("UpperBoundSum", self->ubsum); wait(self->dumpLogKV(cx, g)); wait(self->dumpDebugKV(cx, g)); wait(self->dumpOpsKV(cx, g)); From 653b18000483c6db9e75b6d176d358fe9ee4cfb3 Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Fri, 8 Nov 2019 23:13:32 +0000 Subject: [PATCH 160/184] make it work if external dir has trailing / --- cmake/AddFdbTest.cmake | 4 ++++ 1 file changed, 4 insertions(+) 
diff --git a/cmake/AddFdbTest.cmake b/cmake/AddFdbTest.cmake index eaa54f96a5..c494e19229 100644 --- a/cmake/AddFdbTest.cmake +++ b/cmake/AddFdbTest.cmake @@ -174,6 +174,10 @@ function(create_test_package) string(LENGTH ${dir} dir_len) foreach(file IN LISTS files) get_filename_component(src_dir ${file} DIRECTORY) + # We need to make sure that ${src_dir} is at least + # as long as ${dir}. Otherwise the later call to + # SUBSTRING will fail + set(src_dir "${src_dir}/") string(SUBSTRING ${src_dir} ${dir_len} -1 dest_dir) string(SUBSTRING ${file} ${dir_len} -1 out_file) list(APPEND external_files ${CMAKE_BINARY_DIR}/packages/${out_file}) From 61558eea04eecc8468ea7ab44fb641d3fce26a89 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 11 Nov 2019 00:46:05 -0800 Subject: [PATCH 161/184] Implemented page preloading on BTree cursor seeks to enable hiding latency on soon-to-be-read sibling pages. Added random scans with various preload sizes to the set performance unit test. ObjectCache now tracks hits, misses, and pages which were preloaded but then never used prior to eviction. BTree pages no longer store flags because height is sufficient. Removed virtual specifier in classes not designed to be further inherited. Removed old prototype code (PrefixTree, IndirectShadowPager, MemoryPager) as some interface changes are incompatible and they are no longer worth maintaining. 
--- fdbserver/CMakeLists.txt | 4 - fdbserver/DeltaTree.h | 78 +- fdbserver/IPager.h | 76 +- fdbserver/IVersionedStore.h | 10 +- fdbserver/IndirectShadowPager.actor.cpp | 960 --------------------- fdbserver/IndirectShadowPager.h | 215 ----- fdbserver/MemoryPager.actor.cpp | 456 ---------- fdbserver/MemoryPager.h | 29 - fdbserver/PrefixTree.h | 1049 ----------------------- fdbserver/VersionedBTree.actor.cpp | 526 ++++++++---- fdbserver/fdbserver.vcxproj | 4 - fdbserver/fdbserver.vcxproj.filters | 4 - 12 files changed, 432 insertions(+), 2979 deletions(-) delete mode 100644 fdbserver/IndirectShadowPager.actor.cpp delete mode 100644 fdbserver/IndirectShadowPager.h delete mode 100644 fdbserver/MemoryPager.actor.cpp delete mode 100644 fdbserver/MemoryPager.h delete mode 100644 fdbserver/PrefixTree.h diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 9301a3245b..d4c5e5f226 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -24,8 +24,6 @@ set(FDBSERVER_SRCS IKeyValueStore.h IPager.h IVersionedStore.h - IndirectShadowPager.actor.cpp - IndirectShadowPager.h KeyValueStoreCompressTestData.actor.cpp KeyValueStoreMemory.actor.cpp KeyValueStoreSQLite.actor.cpp @@ -45,8 +43,6 @@ set(FDBSERVER_SRCS MasterInterface.h MasterProxyServer.actor.cpp masterserver.actor.cpp - MemoryPager.actor.cpp - MemoryPager.h MoveKeys.actor.cpp MoveKeys.actor.h networktest.actor.cpp diff --git a/fdbserver/DeltaTree.h b/fdbserver/DeltaTree.h index ce584f76f2..b1eb53dfff 100644 --- a/fdbserver/DeltaTree.h +++ b/fdbserver/DeltaTree.h @@ -20,13 +20,89 @@ #pragma once -#include "fdbserver/PrefixTree.h" #include "flow/flow.h" #include "flow/Arena.h" #include "fdbclient/FDBTypes.h" #include "fdbserver/Knobs.h" #include +typedef uint64_t Word; +static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) { + int i = 0; + const int wordEnd = cl - sizeof(Word) + 1; + + for(; i < wordEnd; i += sizeof(Word)) { + Word a = *(Word *)ap; + Word b = *(Word 
*)bp; + if(a != b) { + return i + ctzll(a ^ b) / 8; + } + ap += sizeof(Word); + bp += sizeof(Word); + } + + for (; i < cl; i++) { + if (*ap != *bp) { + return i; + } + ++ap; + ++bp; + } + return cl; +} + +static int commonPrefixLength(StringRef a, StringRef b) { + return commonPrefixLength(a.begin(), b.begin(), std::min(a.size(), b.size())); +} + +// This appears to be the fastest version +static int lessOrEqualPowerOfTwo(int n) { + int p; + for (p = 1; p+p <= n; p+=p); + return p; +} + +/* +static int _lessOrEqualPowerOfTwo(uint32_t n) { + if(n == 0) + return n; + int trailing = __builtin_ctz(n); + int leading = __builtin_clz(n); + if(trailing + leading == ((sizeof(n) * 8) - 1)) + return n; + return 1 << ( (sizeof(n) * 8) - leading - 1); +} + +static int __lessOrEqualPowerOfTwo(unsigned int n) { + int p = 1; + for(; p <= n; p <<= 1); + return p >> 1; +} +*/ + +static int perfectSubtreeSplitPoint(int subtree_size) { + // return the inorder index of the root node in a subtree of the given size + // consistent with the resulting binary search tree being "perfect" (having minimal height + // and all missing nodes as far right as possible). + // There has to be a simpler way to do this. + int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1; + return std::min(s * 2 + 1, subtree_size - s - 1); +} + +static int perfectSubtreeSplitPointCached(int subtree_size) { + static uint16_t *points = nullptr; + static const int max = 500; + if(points == nullptr) { + points = new uint16_t[max]; + for(int i = 0; i < max; ++i) + points[i] = perfectSubtreeSplitPoint(i); + } + + if(subtree_size < max) + return points[subtree_size]; + return perfectSubtreeSplitPoint(subtree_size); +} + // Delta Tree is a memory mappable binary tree of T objects such that each node's item is // stored as a Delta which can reproduce the node's T item given the node's greatest // lesser ancestor and the node's least greater ancestor. 
diff --git a/fdbserver/IPager.h b/fdbserver/IPager.h index dc58461e47..8f79d9c57f 100644 --- a/fdbserver/IPager.h +++ b/fdbserver/IPager.h @@ -53,8 +53,9 @@ #define VALGRIND_MAKE_MEM_DEFINED(x, y) #endif -typedef uint32_t LogicalPageID; // uint64_t? -static const LogicalPageID invalidLogicalPageID = std::numeric_limits::max(); +typedef uint32_t LogicalPageID; +typedef uint32_t PhysicalPageID; +#define invalidLogicalPageID std::numeric_limits::max() class IPage { public: @@ -85,12 +86,10 @@ public: class IPagerSnapshot { public: - virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable) = 0; + virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool nohit) = 0; virtual Version getVersion() const = 0; - virtual Key getMetaKey() const { - return Key(); - } + virtual Key getMetaKey() const = 0; virtual ~IPagerSnapshot() {} @@ -98,65 +97,7 @@ public: virtual void delref() = 0; }; -class IPager : public IClosable { -public: - // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. - virtual Reference newPageBuffer() = 0; - - // Returns the usable size of pages returned by the pager (i.e. the size of the page that isn't pager overhead). - // For a given pager instance, separate calls to this function must return the same value. - virtual int getUsablePageSize() = 0; - - virtual StorageBytes getStorageBytes() = 0; - - // Permitted to fail (ASSERT) during recovery. - virtual Reference getReadSnapshot(Version version) = 0; - - // Returns an unused LogicalPageID. - // LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES) do not need to be allocated. - // Permitted to fail (ASSERT) during recovery. - virtual LogicalPageID allocateLogicalPage() = 0; - - // Signals that the page will no longer be used as of the specified version. Versions prior to the specified version must be kept. - // Permitted to fail (ASSERT) during recovery. 
- virtual void freeLogicalPage(LogicalPageID pageID, Version version) = 0; - - // Writes a page with the given LogicalPageID at the specified version. LogicalPageIDs in the range [0, SERVER_KNOBS->PAGER_RESERVED_PAGES) - // can be written without being allocated. All other LogicalPageIDs must be allocated using allocateLogicalPage before writing them. - // - // If updateVersion is 0, we are signalling to the pager that we are reusing the LogicalPageID entry at the current latest version of pageID. - // - // Otherwise, we will add a new entry for LogicalPageID at the specified version. In that case, updateVersion must be larger than any version - // written to this page previously, and it must be larger than any version committed. If referencePageID is given, the latest version of that - // page will be used for the write, which *can* be less than the latest committed version. - // - // Permitted to fail (ASSERT) during recovery. - virtual void writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID = invalidLogicalPageID) = 0; - - // Signals to the pager that no more reads will be performed in the range [begin, end). - // Permitted to fail (ASSERT) during recovery. - virtual void forgetVersions(Version begin, Version end) = 0; - - // Makes durable all writes and any data structures used for recovery. - // Permitted to fail (ASSERT) during recovery. - virtual Future commit() = 0; - - // Returns the latest version of the pager. Permitted to block until recovery is complete, at which point it should always be set immediately. - // Some functions in the IPager interface are permitted to fail (ASSERT) during recovery, so users should wait for getLatestVersion to complete - // before doing anything else. - virtual Future getLatestVersion() = 0; - - // Sets the latest version of the pager. Must be monotonically increasing. - // - // Must be called prior to reading the specified version. 
SOMEDAY: It may be desirable in the future to relax this constraint for performance reasons. - // - // Permitted to fail (ASSERT) during recovery. - virtual void setLatestVersion(Version version) = 0; - -protected: - ~IPager() {} // Destruction should be done using close()/dispose() from the IClosable interface -}; - +// This API is probably customized to the behavior of DWALPager and probably needs some changes to be more generic. class IPager2 : public IClosable { public: // Returns an IPage that can be passed to writePage. The data in the returned IPage might not be zeroed. @@ -189,7 +130,10 @@ public: // The data returned will be the later of // - the most recent committed atomic // - the most recent non-atomic write - virtual Future> readPage(LogicalPageID pageID, bool cacheable) = 0; + // Cacheable indicates that the page should be added to the page cache (if applicable?) as a result of this read. + // NoHit indicates that the read should not be considered a cache hit, such as when preloading pages that are + // considered likely to be needed soon. + virtual Future> readPage(LogicalPageID pageID, bool cacheable = true, bool noHit = false) = 0; // Get a snapshot of the metakey and all pages as of the version v which must be >= getOldestVersion() // Note that snapshots at any version may still see the results of updatePage() calls. 
diff --git a/fdbserver/IVersionedStore.h b/fdbserver/IVersionedStore.h index de4cfd2084..9baf5c4469 100644 --- a/fdbserver/IVersionedStore.h +++ b/fdbserver/IVersionedStore.h @@ -30,10 +30,10 @@ class IStoreCursor { public: virtual Future findEqual(KeyRef key) = 0; - virtual Future findFirstEqualOrGreater(KeyRef key, bool needValue, int prefetchNextBytes) = 0; - virtual Future findLastLessOrEqual(KeyRef key, bool needValue, int prefetchPriorBytes) = 0; - virtual Future next(bool needValue) = 0; - virtual Future prev(bool needValue) = 0; + virtual Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes = 0) = 0; + virtual Future findLastLessOrEqual(KeyRef key, int prefetchBytes = 0) = 0; + virtual Future next() = 0; + virtual Future prev() = 0; virtual bool isValid() = 0; virtual KeyRef getKey() = 0; @@ -41,8 +41,6 @@ public: virtual void addref() = 0; virtual void delref() = 0; - - virtual std::string toString() const = 0; }; class IVersionedStore : public IClosable { diff --git a/fdbserver/IndirectShadowPager.actor.cpp b/fdbserver/IndirectShadowPager.actor.cpp deleted file mode 100644 index 5a525b17af..0000000000 --- a/fdbserver/IndirectShadowPager.actor.cpp +++ /dev/null @@ -1,960 +0,0 @@ -/* - * IndirectShadowPager.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "fdbserver/IndirectShadowPager.h" -#include "fdbserver/Knobs.h" - -#include "flow/UnitTest.h" -#include "flow/actorcompiler.h" -#include "fdbrpc/crc32c.h" - -struct SumType { - bool operator==(const SumType &rhs) const { return crc == rhs.crc; } - uint32_t crc; - std::string toString() { return format("0x%08x", crc); } -}; - -bool checksum(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical, bool write) { - // Calculates and then stores or verifies the checksum at the end of the page. - // If write is true then the checksum is written into the page - // If write is false then the checksum is compared to the in-page sum and - // and error will be thrown if they do not match. - ASSERT(sizeof(SumType) == IndirectShadowPage::PAGE_OVERHEAD_BYTES); - // Adjust pageSize to refer to only usable storage bytes - pageSize -= IndirectShadowPage::PAGE_OVERHEAD_BYTES; - SumType sum; - SumType *pSumInPage = (SumType *)(page + pageSize); - // Write sum directly to page or to sum variable based on mode - SumType *sumOut = write ? pSumInPage : ∑ - sumOut->crc = crc32c_append(logical, page, pageSize); - VALGRIND_MAKE_MEM_DEFINED(sumOut, sizeof(SumType)); - - debug_printf("checksum %s%s logical %d physical %d size %d checksums page %s calculated %s data at %p %s\n", - write ? "write" : "read", - (!write && sum != *pSumInPage) ? " MISMATCH" : "", - logical, physical, pageSize, - write ? 
"NA" : pSumInPage->toString().c_str(), - sumOut->toString().c_str(), page, ""); - - // Verify if not in write mode - if(!write && sum != *pSumInPage) { - TraceEvent (SevError, "IndirectShadowPagerPageChecksumFailure") - .detail("UserPageSize", pageSize) - .detail("Filename", file->getFilename()) - .detail("LogicalPage", logical) - .detail("PhysicalPage", physical) - .detail("ChecksumInPage", pSumInPage->toString()) - .detail("ChecksumCalculated", sum.toString()); - return false; - } - return true; -} - -inline bool checksumRead(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) { - return checksum(file, page, pageSize, logical, physical, false); -} - -inline void checksumWrite(IAsyncFile *file, uint8_t *page, int pageSize, LogicalPageID logical, PhysicalPageID physical) { - checksum(file, page, pageSize, logical, physical, true); -} - -IndirectShadowPage::IndirectShadowPage() : fastAllocated(true) { - data = (uint8_t*)FastAllocator<4096>::allocate(); -} - -IndirectShadowPage::~IndirectShadowPage() { - if(fastAllocated) { - FastAllocator<4096>::release(data); - } - else if(file) { - file->releaseZeroCopy(data, PAGE_BYTES, (int64_t) physicalPageID * PAGE_BYTES); - } -} - -uint8_t const* IndirectShadowPage::begin() const { - return data; -} - -uint8_t* IndirectShadowPage::mutate() { - return data; -} - -int IndirectShadowPage::size() const { - return PAGE_BYTES - PAGE_OVERHEAD_BYTES; -} - -const int IndirectShadowPage::PAGE_BYTES = 4096; -const int IndirectShadowPage::PAGE_OVERHEAD_BYTES = sizeof(SumType); - -IndirectShadowPagerSnapshot::IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version) - : pager(pager), version(version), pagerError(pager->getError()) -{ -} - -Future> IndirectShadowPagerSnapshot::getPhysicalPage(LogicalPageID pageID, bool cacheable) { - if(pagerError.isReady()) - pagerError.get(); - return pager->getPage(Reference::addRef(this), pageID, version); -} - -template -T bigEndian(T val) { - 
static_assert(sizeof(T) <= 8, "Can't compute bigEndian on integers larger than 8 bytes"); - uint64_t b = bigEndian64(val); - return *(T*)((uint8_t*)&b+8-sizeof(T)); -} - -ACTOR Future recover(IndirectShadowPager *pager) { - try { - TraceEvent("PagerRecovering").detail("Filename", pager->pageFileName); - pager->pageTableLog = keyValueStoreMemory(pager->basename, UID(), 1e9, "pagerlog"); - - // TODO: this can be done synchronously with the log recovery - int64_t flags = IAsyncFile::OPEN_READWRITE | IAsyncFile::OPEN_LOCK; - state bool exists = fileExists(pager->pageFileName); - if(!exists) { - flags |= IAsyncFile::OPEN_ATOMIC_WRITE_AND_CREATE | IAsyncFile::OPEN_CREATE; - } - - Reference dataFile = wait(IAsyncFileSystem::filesystem()->open(pager->pageFileName, flags, 0600)); - pager->dataFile = dataFile; - - TraceEvent("PagerOpenedDataFile").detail("Filename", pager->pageFileName); - - if(!exists) { - wait(pager->dataFile->sync()); - } - TraceEvent("PagerSyncdDataFile").detail("Filename", pager->pageFileName); - - state int64_t fileSize = wait(pager->dataFile->size()); - TraceEvent("PagerGotFileSize").detail("Size", fileSize).detail("Filename", pager->pageFileName); - - if(fileSize > 0) { - TraceEvent("PagerRecoveringFromLogs").detail("Filename", pager->pageFileName); - Optional pagesAllocatedValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::PAGES_ALLOCATED_KEY)); - if(pagesAllocatedValue.present()) { - BinaryReader pr(pagesAllocatedValue.get(), Unversioned()); - uint32_t pagesAllocated; - pr >> pagesAllocated; - pager->pagerFile.init(fileSize, pagesAllocated); - - debug_printf("%s: Recovered pages allocated: %d\n", pager->pageFileName.c_str(), pager->pagerFile.pagesAllocated); - ASSERT(pager->pagerFile.pagesAllocated != PagerFile::INVALID_PAGE); - - Optional latestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::LATEST_VERSION_KEY)); - ASSERT(latestVersionValue.present()); - - BinaryReader vr(latestVersionValue.get(), 
Unversioned()); - vr >> pager->latestVersion; - - Optional oldestVersionValue = wait(pager->pageTableLog->readValue(IndirectShadowPager::OLDEST_VERSION_KEY)); - - if(oldestVersionValue.present()) { - BinaryReader vr(oldestVersionValue.get(), Unversioned()); - vr >> pager->oldestVersion; - } - - debug_printf("%s: Recovered version info: earliest v%lld latest v%lld\n", pager->pageFileName.c_str(), pager->oldestVersion, pager->latestVersion); - pager->committedVersion = pager->latestVersion; - - Standalone> tableEntries = wait(pager->pageTableLog->readRange(KeyRangeRef(IndirectShadowPager::TABLE_ENTRY_PREFIX, strinc(IndirectShadowPager::TABLE_ENTRY_PREFIX)))); - - if(tableEntries.size() > 0) { - BinaryReader kr(tableEntries.back().key, Unversioned()); - - uint8_t prefix; - LogicalPageID logicalPageID; - - kr >> prefix; - ASSERT(prefix == IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]); - - kr >> logicalPageID; - logicalPageID = bigEndian(logicalPageID); - - LogicalPageID pageTableSize = std::max(logicalPageID+1, SERVER_KNOBS->PAGER_RESERVED_PAGES); - pager->pageTable.resize(pageTableSize); - debug_printf("%s: Recovered page table size: %d\n", pager->pageFileName.c_str(), pageTableSize); - } - else { - debug_printf("%s: Recovered no page table entries\n", pager->pageFileName.c_str()); - } - - LogicalPageID nextPageID = SERVER_KNOBS->PAGER_RESERVED_PAGES; - std::set allocatedPhysicalPages; - for(auto entry : tableEntries) { - BinaryReader kr(entry.key, Unversioned()); - BinaryReader vr(entry.value, Unversioned()); - - uint8_t prefix; - LogicalPageID logicalPageID; - Version version; - PhysicalPageID physicalPageID; - - kr >> prefix; - ASSERT(prefix == IndirectShadowPager::TABLE_ENTRY_PREFIX.begin()[0]); - - kr >> logicalPageID; - logicalPageID = bigEndian(logicalPageID); - - kr >> version; - version = bigEndian(version); - vr >> physicalPageID; - - ASSERT(version <= pager->latestVersion); - - pager->pageTable[logicalPageID].push_back(std::make_pair(version, 
physicalPageID)); - - if(physicalPageID != PagerFile::INVALID_PAGE) { - allocatedPhysicalPages.insert(physicalPageID); - pager->pagerFile.markPageAllocated(logicalPageID, version, physicalPageID); - } - - while(nextPageID < logicalPageID) { - pager->logicalFreeList.push_back(nextPageID++); - } - if(logicalPageID == nextPageID) { - ++nextPageID; - } - - debug_printf("%s: Recovered page table entry logical %d -> (v%lld, physical %d)\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID); - } - - debug_printf("%s: Building physical free list\n", pager->pageFileName.c_str()); - // TODO: can we do this better? does it require storing extra info in the log? - PhysicalPageID nextPhysicalPageID = 0; - for(auto itr = allocatedPhysicalPages.begin(); itr != allocatedPhysicalPages.end(); ++itr) { - while(nextPhysicalPageID < *itr) { - pager->pagerFile.freePage(nextPhysicalPageID++); - } - ++nextPhysicalPageID; - } - - while(nextPhysicalPageID < pager->pagerFile.pagesAllocated) { - pager->pagerFile.freePage(nextPhysicalPageID++); - } - } - } - - if(pager->pageTable.size() < SERVER_KNOBS->PAGER_RESERVED_PAGES) { - pager->pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES); - } - - pager->pagerFile.finishedMarkingPages(); - pager->pagerFile.startVacuuming(); - - debug_printf("%s: Finished recovery at v%lld\n", pager->pageFileName.c_str(), pager->latestVersion); - TraceEvent("PagerFinishedRecovery").detail("LatestVersion", pager->latestVersion).detail("OldestVersion", pager->oldestVersion).detail("Filename", pager->pageFileName); - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled) { - TraceEvent(SevError, "PagerRecoveryFailed").error(e, true).detail("Filename", pager->pageFileName); - } - throw; - } - - return Void(); -} - -ACTOR Future housekeeper(IndirectShadowPager *pager) { - wait(pager->recovery); - wait(Never()); - loop { - state LogicalPageID pageID = 0; - for(; pageID < pager->pageTable.size(); ++pageID) { - // TODO: pick an appropriate 
rate for this loop and determine the right way to implement it - // Right now, this delays 10ms every 400K pages, which means we have 1s of delay for every - // 40M pages. In total, we introduce 100s delay for a max size 4B page file. - if(pageID % 400000 == 0) { - wait(delay(0.01)); - } - else { - wait(yield()); - } - - auto& pageVersionMap = pager->pageTable[pageID]; - - if(pageVersionMap.size() > 0) { - auto itr = pageVersionMap.begin(); - for(auto prev = itr; prev != pageVersionMap.end() && prev->first < pager->oldestVersion; prev=itr) { - pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second); - ++itr; - if(prev->second != PagerFile::INVALID_PAGE && (itr == pageVersionMap.end() || itr->first <= pager->oldestVersion)) { - pager->freePhysicalPageID(prev->second); - } - if(itr == pageVersionMap.end() || itr->first >= pager->oldestVersion) { - debug_printf("%s: Updating oldest version for logical %u: v%lld\n", pager->pageFileName.c_str(), pageID, pager->oldestVersion); - pager->logPageTableClear(pageID, 0, pager->oldestVersion); - - if(itr != pageVersionMap.end() && itr->first > pager->oldestVersion) { - debug_printf("%s: Erasing pages to prev from pageVersionMap for %d (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr->first, prev->first); - prev->first = pager->oldestVersion; - pager->logPageTableUpdate(pageID, pager->oldestVersion, prev->second); - itr = pageVersionMap.erase(pageVersionMap.begin(), prev); - } - else { - debug_printf("%s: Erasing pages to itr from pageVersionMap for %d (%d) (itr=%lld, prev=%lld)\n", pager->pageFileName.c_str(), pageID, itr == pageVersionMap.end(), itr==pageVersionMap.end() ? 
-1 : itr->first, prev->first); - itr = pageVersionMap.erase(pageVersionMap.begin(), itr); - } - } - } - - for(; itr != pageVersionMap.end(); ++itr) { - pager->pagerFile.markPageAllocated(pageID, itr->first, itr->second); - } - - if(pageVersionMap.size() == 0) { - pager->freeLogicalPageID(pageID); - } - } - } - - pager->pagerFile.finishedMarkingPages(); - } -} - -ACTOR Future forwardError(Future f, Promise target) { - try { - wait(f); - } - catch(Error &e) { - if(e.code() != error_code_actor_cancelled && target.canBeSet()) { - target.sendError(e); - } - - throw e; - } - - return Void(); -} - -IndirectShadowPager::IndirectShadowPager(std::string basename) - : basename(basename), latestVersion(0), committedVersion(0), committing(Void()), oldestVersion(0), pagerFile(this) -{ - pageFileName = basename; - recovery = forwardError(recover(this), errorPromise); - housekeeping = forwardError(housekeeper(this), errorPromise); -} - -StorageBytes IndirectShadowPager::getStorageBytes() { - int64_t free; - int64_t total; - g_network->getDiskBytes(parentDirectory(basename), free, total); - return StorageBytes(free, total, pagerFile.size(), free + IndirectShadowPage::PAGE_BYTES * pagerFile.getFreePages()); -} - -Reference IndirectShadowPager::newPageBuffer() { - return Reference(new IndirectShadowPage()); -} - -int IndirectShadowPager::getUsablePageSize() { - return IndirectShadowPage::PAGE_BYTES - IndirectShadowPage::PAGE_OVERHEAD_BYTES; -} - -Reference IndirectShadowPager::getReadSnapshot(Version version) { - debug_printf("%s: Getting read snapshot v%lld latest v%lld oldest v%lld\n", pageFileName.c_str(), version, latestVersion, oldestVersion); - ASSERT(recovery.isReady()); - ASSERT(version <= latestVersion); - ASSERT(version >= oldestVersion); - - return Reference(new IndirectShadowPagerSnapshot(this, version)); -} - -LogicalPageID IndirectShadowPager::allocateLogicalPage() { - ASSERT(recovery.isReady()); - - LogicalPageID allocatedPage; - if(logicalFreeList.size() > 0) { - 
allocatedPage = logicalFreeList.front(); - logicalFreeList.pop_front(); - } - else { - ASSERT(pageTable.size() < std::numeric_limits::max()); // TODO: different error? - allocatedPage = pageTable.size(); - pageTable.push_back(PageVersionMap()); - } - - ASSERT(allocatedPage >= SERVER_KNOBS->PAGER_RESERVED_PAGES); - debug_printf("%s: op=allocate id=%u\n", pageFileName.c_str(), allocatedPage); - return allocatedPage; -} - -void IndirectShadowPager::freeLogicalPage(LogicalPageID pageID, Version version) { - ASSERT(recovery.isReady()); - ASSERT(committing.isReady()); - - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - ASSERT(!pageVersionMap.empty()); - - // 0 will mean delete as of latest version, similar to write at latest version - if(version == 0) { - version = pageVersionMap.back().first; - } - - auto itr = pageVersionMapLowerBound(pageVersionMap, version); - // TODO: Is this correct, that versions from the past *forward* can be deleted? - for(auto i = itr; i != pageVersionMap.end(); ++i) { - freePhysicalPageID(i->second); - } - - if(itr != pageVersionMap.end()) { - debug_printf("%s: Clearing newest versions for logical %u: v%lld\n", pageFileName.c_str(), pageID, version); - logPageTableClearToEnd(pageID, version); - pageVersionMap.erase(itr, pageVersionMap.end()); - } - - if(pageVersionMap.size() == 0) { - debug_printf("%s: Freeing logical %u (freeLogicalPage)\n", pageFileName.c_str(), pageID); - logicalFreeList.push_back(pageID); - } - else if(pageVersionMap.back().second != PagerFile::INVALID_PAGE) { - pageVersionMap.push_back(std::make_pair(version, PagerFile::INVALID_PAGE)); - logPageTableUpdate(pageID, version, PagerFile::INVALID_PAGE); - } -} - -ACTOR Future waitAndFreePhysicalPageID(IndirectShadowPager *pager, PhysicalPageID pageID, Future canFree) { - wait(canFree); - pager->pagerFile.freePage(pageID); - return Void(); -} - -// TODO: Freeing physical pages must be done *after* committing the page map changes that 
cause the physical page to no longer be used. -// Otherwise, the physical page could be reused by a write followed by a power loss in which case the mapping change would not -// have been committed and so the physical page should still contain its previous data but it's been overwritten. -void IndirectShadowPager::freePhysicalPageID(PhysicalPageID pageID) { - debug_printf("%s: Freeing physical %u\n", pageFileName.c_str(), pageID); - pagerFile.freePage(pageID); -} - -void IndirectShadowPager::writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID) { - ASSERT(recovery.isReady()); - ASSERT(committing.isReady()); - - ASSERT(updateVersion > latestVersion || updateVersion == 0); - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - - ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != PagerFile::INVALID_PAGE); - - // TODO: should this be conditional on the write succeeding? - bool updateExisting = updateVersion == 0; - if(updateExisting) { - // If there is no existing latest version to update then there must be a referencePageID from which to get a latest version - // so get that version and change this to a normal update - if(pageVersionMap.empty()) { - ASSERT(referencePageID != invalidLogicalPageID); - PageVersionMap &rpv = pageTable[referencePageID]; - ASSERT(!rpv.empty()); - updateVersion = rpv.back().first; - updateExisting = false; - } - else { - ASSERT(pageVersionMap.size()); - updateVersion = pageVersionMap.back().first; - } - } - - PhysicalPageID physicalPageID = pagerFile.allocatePage(pageID, updateVersion); - - debug_printf("%s: Writing logical %d v%lld physical %d\n", pageFileName.c_str(), pageID, updateVersion, physicalPageID); - - if(updateExisting) { - // TODO: Physical page cannot be freed now, it must be done after the page mapping change above is committed - //freePhysicalPageID(pageVersionMap.back().second); - pageVersionMap.back().second = 
physicalPageID; - } - else { - ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion); - pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID)); - } - - logPageTableUpdate(pageID, updateVersion, physicalPageID); - - checksumWrite(dataFile.getPtr(), contents->mutate(), IndirectShadowPage::PAGE_BYTES, pageID, physicalPageID); - - Future write = holdWhile(contents, dataFile->write(contents->begin(), IndirectShadowPage::PAGE_BYTES, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES)); - - if(write.isError()) { - if(errorPromise.canBeSet()) { - errorPromise.sendError(write.getError()); - } - throw write.getError(); - } - writeActors.add(forwardError(write, errorPromise)); -} - -void IndirectShadowPager::forgetVersions(Version begin, Version end) { - ASSERT(recovery.isReady()); - ASSERT(begin <= end); - ASSERT(end <= latestVersion); - - // TODO: support forgetting arbitrary ranges - if(begin <= oldestVersion) { - oldestVersion = std::max(end, oldestVersion); - logVersion(OLDEST_VERSION_KEY, oldestVersion); - } -} - -ACTOR Future commitImpl(IndirectShadowPager *pager, Future previousCommit) { - state Future outstandingWrites = pager->writeActors.signalAndCollapse(); - state Version commitVersion = pager->latestVersion; - - wait(previousCommit); - - pager->logVersion(IndirectShadowPager::LATEST_VERSION_KEY, commitVersion); - - // TODO: we need to prevent writes that happen now from being committed in the subsequent log commit - // This is probably best done once we have better control of the log, where we can write a commit entry - // here without syncing the file. 
- - wait(outstandingWrites); - - wait(pager->dataFile->sync()); - wait(pager->pageTableLog->commit()); - - pager->committedVersion = std::max(pager->committedVersion, commitVersion); - - return Void(); -} - -Future IndirectShadowPager::commit() { - ASSERT(recovery.isReady()); - Future f = commitImpl(this, committing); - committing = f; - return committing; -} - -void IndirectShadowPager::setLatestVersion(Version version) { - ASSERT(recovery.isReady()); - latestVersion = version; -} - -ACTOR Future getLatestVersionImpl(IndirectShadowPager *pager) { - wait(pager->recovery); - return pager->latestVersion; -} - -Future IndirectShadowPager::getLatestVersion() { - return getLatestVersionImpl(this); -} - -Future IndirectShadowPager::getError() { - return errorPromise.getFuture(); -} - -Future IndirectShadowPager::onClosed() { - return closed.getFuture(); -} - -ACTOR void shutdown(IndirectShadowPager *pager, bool dispose) { - if(pager->errorPromise.canBeSet()) - pager->errorPromise.sendError(actor_cancelled()); // Ideally this should be shutdown_in_progress - - // Cancel all outstanding reads - auto i = pager->busyPages.begin(); - auto iEnd = pager->busyPages.end(); - - while(i != iEnd) { - // Advance before calling cancel as the rawRead cancel will destroy the map entry it lives in - (i++)->second.read.cancel(); - } - ASSERT(pager->busyPages.empty()); - - wait(ready(pager->writeActors.signal())); - wait(ready(pager->operations.signal())); - wait(ready(pager->committing)); - - pager->housekeeping.cancel(); - pager->pagerFile.shutdown(); - - state Future pageTableClosed = pager->pageTableLog->onClosed(); - if(dispose) { - wait(ready(IAsyncFileSystem::filesystem()->deleteFile(pager->pageFileName, true))); - pager->pageTableLog->dispose(); - } - else { - pager->pageTableLog->close(); - } - - wait(ready(pageTableClosed)); - - pager->closed.send(Void()); - delete pager; -} - -void IndirectShadowPager::dispose() { - shutdown(this, true); -} - -void IndirectShadowPager::close() { 
- shutdown(this, false); -} - -ACTOR Future> rawRead(IndirectShadowPager *pager, LogicalPageID logicalPageID, PhysicalPageID physicalPageID) { - state void *data; - state int len = IndirectShadowPage::PAGE_BYTES; - state bool readSuccess = false; - - try { - wait(pager->dataFile->readZeroCopy(&data, &len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES)); - readSuccess = true; - - if(!checksumRead(pager->dataFile.getPtr(), (uint8_t *)data, len, logicalPageID, physicalPageID)) { - throw checksum_failed(); - } - - pager->busyPages.erase(physicalPageID); - return Reference(new IndirectShadowPage((uint8_t *)data, pager->dataFile, physicalPageID)); - } - catch(Error &e) { - pager->busyPages.erase(physicalPageID); - if(readSuccess || e.code() == error_code_actor_cancelled) { - pager->dataFile->releaseZeroCopy(data, len, (int64_t) physicalPageID * IndirectShadowPage::PAGE_BYTES); - } - throw; - } -} - -Future> getPageImpl(IndirectShadowPager *pager, Reference snapshot, LogicalPageID logicalPageID, Version version) { - ASSERT(logicalPageID < pager->pageTable.size()); - PageVersionMap &pageVersionMap = pager->pageTable[logicalPageID]; - - auto itr = IndirectShadowPager::pageVersionMapUpperBound(pageVersionMap, version); - if(itr == pageVersionMap.begin()) { - debug_printf("%s: Page version map empty! 
op=error id=%u @%lld\n", pager->pageFileName.c_str(), logicalPageID, version); - ASSERT(false); - } - --itr; - PhysicalPageID physicalPageID = itr->second; - ASSERT(physicalPageID != PagerFile::INVALID_PAGE); - - debug_printf("%s: Reading logical %d v%lld physical %d mapSize %lu\n", pager->pageFileName.c_str(), logicalPageID, version, physicalPageID, pageVersionMap.size()); - - IndirectShadowPager::BusyPage &bp = pager->busyPages[physicalPageID]; - if(!bp.read.isValid()) { - Future> get = rawRead(pager, logicalPageID, physicalPageID); - if(!get.isReady()) { - bp.read = get; - } - return get; - } - return bp.read; -} - -Future> IndirectShadowPager::getPage(Reference snapshot, LogicalPageID pageID, Version version) { - if(!recovery.isReady()) { - debug_printf("%s: getPage failure, recovery not ready - op=error id=%u @%lld\n", pageFileName.c_str(), pageID, version); - ASSERT(false); - } - - Future> f = getPageImpl(this, snapshot, pageID, version); - operations.add(forwardError(ready(f), errorPromise)); // For some reason if success is ready() then shutdown hangs when waiting on operations - return f; -} - -PageVersionMap::iterator IndirectShadowPager::pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version version) { - return std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](std::pair p, Version v) { - return p.first < v; - }); -} - -PageVersionMap::iterator IndirectShadowPager::pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version version) { - return std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](Version v, std::pair p) { - return v < p.first; - }); -} - -void IndirectShadowPager::freeLogicalPageID(LogicalPageID pageID) { - if(pageID >= SERVER_KNOBS->PAGER_RESERVED_PAGES) { - debug_printf("%s: Freeing logical %u\n", pageFileName.c_str(), pageID); - logicalFreeList.push_back(pageID); - } -} - -void IndirectShadowPager::logVersion(StringRef versionKey, Version version) { - BinaryWriter 
v(Unversioned()); - v << version; - - pageTableLog->set(KeyValueRef(versionKey, v.toValue())); -} - -void IndirectShadowPager::logPagesAllocated() { - BinaryWriter v(Unversioned()); - v << pagerFile.getPagesAllocated(); - - pageTableLog->set(KeyValueRef(PAGES_ALLOCATED_KEY, v.toValue())); -} - -void IndirectShadowPager::logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) { - BinaryWriter k(Unversioned()); - k << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(version); - - BinaryWriter v(Unversioned()); - v << physicalPageID; - - pageTableLog->set(KeyValueRef(k.toValue(), v.toValue())); -} - -void IndirectShadowPager::logPageTableClearToEnd(LogicalPageID logicalPageID, Version start) { - BinaryWriter b(Unversioned()); - b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start); - - BinaryWriter e(Unversioned()); - e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID); - - pageTableLog->clear(KeyRangeRef(b.toValue(), strinc(e.toValue()))); -} - -void IndirectShadowPager::logPageTableClear(LogicalPageID logicalPageID, Version start, Version end) { - BinaryWriter b(Unversioned()); - b << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(start); - - BinaryWriter e(Unversioned()); - e << TABLE_ENTRY_PREFIX.begin()[0] << bigEndian(logicalPageID) << bigEndian(end); - - pageTableLog->clear(KeyRangeRef(b.toValue(), e.toValue())); -} - -const StringRef IndirectShadowPager::LATEST_VERSION_KEY = LiteralStringRef("\xff/LatestVersion"); -const StringRef IndirectShadowPager::OLDEST_VERSION_KEY = LiteralStringRef("\xff/OldestVersion"); -const StringRef IndirectShadowPager::PAGES_ALLOCATED_KEY = LiteralStringRef("\xff/PagesAllocated"); -const StringRef IndirectShadowPager::TABLE_ENTRY_PREFIX = LiteralStringRef("\x00"); - -ACTOR Future copyPage(IndirectShadowPager *pager, Reference page, LogicalPageID logical, PhysicalPageID from, PhysicalPageID to) { - state 
bool zeroCopied = true; - state int bytes = IndirectShadowPage::PAGE_BYTES; - state void *data = nullptr; - - try { - try { - wait(pager->dataFile->readZeroCopy(&data, &bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES)); - } - catch(Error &e) { - zeroCopied = false; - data = page->mutate(); - int _bytes = wait(pager->dataFile->read(data, page->size(), (int64_t)from * IndirectShadowPage::PAGE_BYTES)); - bytes = _bytes; - } - - ASSERT(bytes == IndirectShadowPage::PAGE_BYTES); - checksumWrite(pager->dataFile.getPtr(), page->mutate(), bytes, logical, to); - wait(pager->dataFile->write(data, bytes, (int64_t)to * IndirectShadowPage::PAGE_BYTES)); - if(zeroCopied) { - pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES); - } - } - catch(Error &e) { - if(zeroCopied) { - pager->dataFile->releaseZeroCopy(data, bytes, (int64_t)from * IndirectShadowPage::PAGE_BYTES); - } - pager->pagerFile.freePage(to); - throw e; - } - - return Void(); -} - -ACTOR Future vacuumer(IndirectShadowPager *pager, PagerFile *pagerFile) { - state Reference page(new IndirectShadowPage()); - - loop { - state double start = now(); - while(!pagerFile->canVacuum()) { - wait(delay(1.0)); - } - - ASSERT(!pagerFile->freePages.empty()); - - if(!pagerFile->vacuumQueue.empty()) { - state PhysicalPageID lastUsedPage = pagerFile->vacuumQueue.rbegin()->first; - PhysicalPageID lastFreePage = *pagerFile->freePages.rbegin(); - debug_printf("%s: Vacuuming: evaluating (free list size=%lu, lastFreePage=%u, lastUsedPage=%u, pagesAllocated=%u)\n", pager->pageFileName.c_str(), pagerFile->freePages.size(), lastFreePage, lastUsedPage, pagerFile->pagesAllocated); - ASSERT(lastFreePage < pagerFile->pagesAllocated); - ASSERT(lastUsedPage < pagerFile->pagesAllocated); - ASSERT(lastFreePage != lastUsedPage); - - if(lastFreePage < lastUsedPage) { - state std::pair logicalPageInfo = pagerFile->vacuumQueue[lastUsedPage]; - state PhysicalPageID newPage = 
pagerFile->allocatePage(logicalPageInfo.first, logicalPageInfo.second); - - debug_printf("%s: Vacuuming: copying page %u to %u\n", pager->pageFileName.c_str(), lastUsedPage, newPage); - wait(copyPage(pager, page, logicalPageInfo.first, lastUsedPage, newPage)); - - auto &pageVersionMap = pager->pageTable[logicalPageInfo.first]; - auto itr = IndirectShadowPager::pageVersionMapLowerBound(pageVersionMap, logicalPageInfo.second); - if(itr != pageVersionMap.end() && itr->second == lastUsedPage) { - itr->second = newPage; - pager->logPageTableUpdate(logicalPageInfo.first, itr->first, newPage); - pagerFile->freePage(lastUsedPage); - } - else { - TEST(true); // page was freed while vacuuming - pagerFile->freePage(newPage); - } - } - } - - PhysicalPageID firstFreePage = pagerFile->vacuumQueue.empty() ? pagerFile->minVacuumQueuePage : (pagerFile->vacuumQueue.rbegin()->first + 1); - ASSERT(pagerFile->pagesAllocated >= firstFreePage); - - uint64_t pagesToErase = 0; - if(pagerFile->freePages.size() >= SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD) { - pagesToErase = std::min(pagerFile->freePages.size() - SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD + 1, pagerFile->pagesAllocated - firstFreePage); - } - - debug_printf("%s: Vacuuming: got %llu pages to erase (freePages=%lu, pagesAllocated=%u, vacuumQueueEmpty=%u, minVacuumQueuePage=%u, firstFreePage=%u)\n", pager->pageFileName.c_str(), pagesToErase, pagerFile->freePages.size(), pagerFile->pagesAllocated, pagerFile->vacuumQueue.empty(), pagerFile->minVacuumQueuePage, firstFreePage); - - if(pagesToErase > 0) { - PhysicalPageID eraseStartPage = pagerFile->pagesAllocated - pagesToErase; - debug_printf("%s: Vacuuming: truncating last %llu pages starting at %u\n", pager->pageFileName.c_str(), pagesToErase, eraseStartPage); - - ASSERT(pagesToErase <= pagerFile->pagesAllocated); - - pagerFile->pagesAllocated = eraseStartPage; - pager->logPagesAllocated(); - - auto freePageItr = pagerFile->freePages.find(eraseStartPage); - ASSERT(freePageItr != 
pagerFile->freePages.end()); - - pagerFile->freePages.erase(freePageItr, pagerFile->freePages.end()); - ASSERT(pagerFile->vacuumQueue.empty() || pagerFile->vacuumQueue.rbegin()->first < eraseStartPage); - - wait(pager->dataFile->truncate((int64_t)pagerFile->pagesAllocated * IndirectShadowPage::PAGE_BYTES)); - } - - wait(delayUntil(start + (double)IndirectShadowPage::PAGE_BYTES / SERVER_KNOBS->VACUUM_BYTES_PER_SECOND)); // TODO: figure out the correct mechanism here - } -} - -PagerFile::PagerFile(IndirectShadowPager *pager) : fileSize(0), pagesAllocated(0), pager(pager), vacuumQueueReady(false), minVacuumQueuePage(0) {} - -PhysicalPageID PagerFile::allocatePage(LogicalPageID logicalPageID, Version version) { - ASSERT((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES <= fileSize); - ASSERT(fileSize % IndirectShadowPage::PAGE_BYTES == 0); - - PhysicalPageID allocatedPage; - if(!freePages.empty()) { - allocatedPage = *freePages.begin(); - freePages.erase(freePages.begin()); - } - else { - if((int64_t)pagesAllocated * IndirectShadowPage::PAGE_BYTES == fileSize) { - fileSize += (1 << 24); - // TODO: extend the file before writing beyond the end. 
- } - - ASSERT(pagesAllocated < INVALID_PAGE); // TODO: we should throw a better error here - allocatedPage = pagesAllocated++; - pager->logPagesAllocated(); - } - - markPageAllocated(logicalPageID, version, allocatedPage); - - debug_printf("%s: Allocated physical %u\n", pager->pageFileName.c_str(), allocatedPage); - return allocatedPage; -} - -void PagerFile::freePage(PhysicalPageID pageID) { - freePages.insert(pageID); - - if(pageID >= minVacuumQueuePage) { - vacuumQueue.erase(pageID); - } -} - -void PagerFile::markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID) { - if(physicalPageID != INVALID_PAGE && physicalPageID >= minVacuumQueuePage) { - vacuumQueue[physicalPageID] = std::make_pair(logicalPageID, version); - } -} - -void PagerFile::finishedMarkingPages() { - if(minVacuumQueuePage >= pagesAllocated) { - minVacuumQueuePage = pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE ? pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE : 0; - vacuumQueueReady = false; - } - else { - if(!vacuumQueueReady) { - vacuumQueueReady = true; - } - if(pagesAllocated > SERVER_KNOBS->VACUUM_QUEUE_SIZE && minVacuumQueuePage < pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE) { - minVacuumQueuePage = pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE; - auto itr = vacuumQueue.lower_bound(minVacuumQueuePage); - vacuumQueue.erase(vacuumQueue.begin(), itr); - } - } -} - -uint64_t PagerFile::size() { - return fileSize; -} - -uint32_t PagerFile::getPagesAllocated() { - return pagesAllocated; -} - -uint32_t PagerFile::getFreePages() { - return freePages.size(); -} - -void PagerFile::init(uint64_t fileSize, uint32_t pagesAllocated) { - this->fileSize = fileSize; - this->pagesAllocated = pagesAllocated; - this->minVacuumQueuePage = pagesAllocated >= SERVER_KNOBS->VACUUM_QUEUE_SIZE ? 
pagesAllocated - SERVER_KNOBS->VACUUM_QUEUE_SIZE : 0; -} - -void PagerFile::startVacuuming() { - vacuuming = Never(); //vacuumer(pager, this); -} - -void PagerFile::shutdown() { - vacuuming.cancel(); -} - -bool PagerFile::canVacuum() { - if(freePages.size() < SERVER_KNOBS->FREE_PAGE_VACUUM_THRESHOLD // Not enough free pages - || minVacuumQueuePage >= pagesAllocated // We finished processing all pages in the vacuum queue - || !vacuumQueueReady) // Populating vacuum queue - { - debug_printf("%s: Vacuuming: waiting for vacuumable pages (free list size=%lu, minVacuumQueuePage=%u, pages allocated=%u, vacuumQueueReady=%d)\n", pager->pageFileName.c_str(), freePages.size(), minVacuumQueuePage, pagesAllocated, vacuumQueueReady); - return false; - } - - return true; -} - -const PhysicalPageID PagerFile::INVALID_PAGE = std::numeric_limits::max(); - -extern Future simplePagerTest(IPager* const& pager); - -TEST_CASE("/fdbserver/indirectshadowpager/simple") { - state IPager *pager = new IndirectShadowPager("unittest_pageFile"); - - wait(simplePagerTest(pager)); - - Future closedFuture = pager->onClosed(); - pager->close(); - wait(closedFuture); - - return Void(); -} diff --git a/fdbserver/IndirectShadowPager.h b/fdbserver/IndirectShadowPager.h deleted file mode 100644 index 1b097df639..0000000000 --- a/fdbserver/IndirectShadowPager.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * IndirectShadowPager.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FDBSERVER_INDIRECTSHADOWPAGER_H -#define FDBSERVER_INDIRECTSHADOWPAGER_H -#pragma once - -#include "fdbserver/IKeyValueStore.h" -#include "fdbserver/IPager.h" - -#include "flow/ActorCollection.h" -#include "fdbclient/Notified.h" - -#include "fdbrpc/IAsyncFile.h" - -typedef uint32_t PhysicalPageID; -typedef std::vector> PageVersionMap; -typedef std::vector LogicalPageTable; - -class IndirectShadowPager; - -class IndirectShadowPage : public IPage, ReferenceCounted { -public: - IndirectShadowPage(); - IndirectShadowPage(uint8_t *data, Reference file, PhysicalPageID pageID) - : file(file), physicalPageID(pageID), fastAllocated(false), data(data) {} - virtual ~IndirectShadowPage(); - - virtual void addref() const { - ReferenceCounted::addref(); - } - - virtual void delref() const { - ReferenceCounted::delref(); - } - - virtual int size() const; - virtual uint8_t const* begin() const; - virtual uint8_t* mutate(); - -//private: - static const int PAGE_BYTES; - static const int PAGE_OVERHEAD_BYTES; - -private: - Reference file; - PhysicalPageID physicalPageID; - bool fastAllocated; - uint8_t *data; -}; - -class IndirectShadowPagerSnapshot : public IPagerSnapshot, ReferenceCounted { -public: - IndirectShadowPagerSnapshot(IndirectShadowPager *pager, Version version); - - virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable); - - virtual Version getVersion() const { - return version; - } - - virtual ~IndirectShadowPagerSnapshot() { - } - - virtual void addref() { - ReferenceCounted::addref(); - } - - virtual void delref() { - 
ReferenceCounted::delref(); - } - -private: - IndirectShadowPager *pager; - Version version; - Future pagerError; -}; - -class PagerFile { -public: - PagerFile(IndirectShadowPager *pager); - - PhysicalPageID allocatePage(LogicalPageID logicalPageID, Version version); - void freePage(PhysicalPageID physicalPageID); - void markPageAllocated(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID); - - void finishedMarkingPages(); - - uint64_t size(); - uint32_t getPagesAllocated(); - uint32_t getFreePages(); - - void init(uint64_t fileSize, uint32_t pagesAllocated); - void startVacuuming(); - void shutdown(); - -//private: - Future vacuuming; - IndirectShadowPager *pager; - - uint32_t pagesAllocated; - uint64_t fileSize; - - std::set freePages; - - PhysicalPageID minVacuumQueuePage; - bool vacuumQueueReady; - std::map> vacuumQueue; - - bool canVacuum(); - - static const PhysicalPageID INVALID_PAGE; -}; - -class IndirectShadowPager : public IPager { -public: - IndirectShadowPager(std::string basename); - virtual ~IndirectShadowPager() { - } - - virtual Reference newPageBuffer(); - virtual int getUsablePageSize(); - - virtual Reference getReadSnapshot(Version version); - - virtual LogicalPageID allocateLogicalPage(); - virtual void freeLogicalPage(LogicalPageID pageID, Version version); - virtual void writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID); - virtual void forgetVersions(Version begin, Version end); - virtual Future commit(); - - virtual void setLatestVersion(Version version); - virtual Future getLatestVersion(); - - virtual StorageBytes getStorageBytes(); - - virtual Future getError(); - virtual Future onClosed(); - virtual void dispose(); - virtual void close(); - - Future> getPage(Reference snapshot, LogicalPageID pageID, Version version); - -//private: - std::string basename; - std::string pageFileName; - - Version latestVersion; - Version committedVersion; - - LogicalPageTable 
pageTable; - IKeyValueStore *pageTableLog; - - Reference dataFile; - Future recovery; - - Future housekeeping; - Future vacuuming; - Version oldestVersion; - - // TODO: This structure maybe isn't needed - struct BusyPage { - Future> read; - }; - - typedef std::map BusyPageMapT; - BusyPageMapT busyPages; - - SignalableActorCollection operations; - SignalableActorCollection writeActors; - Future committing; - - Promise closed; - Promise errorPromise; - - std::deque logicalFreeList; - PagerFile pagerFile; - - static PageVersionMap::iterator pageVersionMapLowerBound(PageVersionMap &pageVersionMap, Version v); - static PageVersionMap::iterator pageVersionMapUpperBound(PageVersionMap &pageVersionMap, Version v); - - void freeLogicalPageID(LogicalPageID pageID); - void freePhysicalPageID(PhysicalPageID pageID); - - void logVersion(StringRef versionKey, Version version); - void logPagesAllocated(); - void logPageTableUpdate(LogicalPageID logicalPageID, Version version, PhysicalPageID physicalPageID); - void logPageTableClearToEnd(LogicalPageID logicalPageID, Version start); - void logPageTableClear(LogicalPageID logicalPageID, Version start, Version end); - - static const StringRef LATEST_VERSION_KEY; - static const StringRef OLDEST_VERSION_KEY; - static const StringRef PAGES_ALLOCATED_KEY; - static const StringRef TABLE_ENTRY_PREFIX; - -}; - -#endif diff --git a/fdbserver/MemoryPager.actor.cpp b/fdbserver/MemoryPager.actor.cpp deleted file mode 100644 index 9e6474dd01..0000000000 --- a/fdbserver/MemoryPager.actor.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* - * MemoryPager.actor.cpp - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include "fdbserver/MemoryPager.h" -#include "fdbserver/Knobs.h" - -#include "flow/Arena.h" -#include "flow/UnitTest.h" -#include "flow/actorcompiler.h" - -typedef uint8_t* PhysicalPageID; -typedef std::vector> PageVersionMap; -typedef std::vector LogicalPageTable; - -class MemoryPager; - -class MemoryPage : public IPage, ReferenceCounted { -public: - MemoryPage(); - MemoryPage(uint8_t *data); - virtual ~MemoryPage(); - - virtual void addref() const { - ReferenceCounted::addref(); - } - - virtual void delref() const { - ReferenceCounted::delref(); - } - - virtual int size() const; - virtual uint8_t const* begin() const; - virtual uint8_t* mutate(); - -private: - friend class MemoryPager; - uint8_t *data; - bool allocated; - - static const int PAGE_BYTES; -}; - -class MemoryPagerSnapshot : public IPagerSnapshot, ReferenceCounted { -public: - MemoryPagerSnapshot(MemoryPager *pager, Version version) : pager(pager), version(version) {} - virtual Future> getPhysicalPage(LogicalPageID pageID, bool cacheable); - virtual Version getVersion() const { - return version; - } - - virtual void addref() { - ReferenceCounted::addref(); - } - - virtual void delref() { - ReferenceCounted::delref(); - } - -private: - MemoryPager *pager; - Version version; -}; - -class MemoryPager : public IPager, ReferenceCounted { -public: - MemoryPager(); - - virtual Reference newPageBuffer(); - virtual int getUsablePageSize(); - - virtual Reference getReadSnapshot(Version version); - - virtual LogicalPageID allocateLogicalPage(); - virtual void 
freeLogicalPage(LogicalPageID pageID, Version version); - virtual void writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID); - virtual void forgetVersions(Version begin, Version end); - virtual Future commit(); - - virtual StorageBytes getStorageBytes() { - // TODO: Get actual values for used and free memory - return StorageBytes(); - } - - virtual void setLatestVersion(Version version); - virtual Future getLatestVersion(); - - virtual Future getError(); - virtual Future onClosed(); - virtual void dispose(); - virtual void close(); - - virtual Reference getPage(LogicalPageID pageID, Version version); - -private: - Version latestVersion; - Version committedVersion; - Standalone>> data; - LogicalPageTable pageTable; - - Promise closed; - - std::vector freeList; // TODO: is this good enough for now? - - PhysicalPageID allocatePage(Reference contents); - void extendData(); - - static const PhysicalPageID INVALID_PAGE; -}; - -IPager * createMemoryPager() { - return new MemoryPager(); -} - -MemoryPage::MemoryPage() : allocated(true) { - data = (uint8_t*)FastAllocator<4096>::allocate(); -} - -MemoryPage::MemoryPage(uint8_t *data) : data(data), allocated(false) {} - -MemoryPage::~MemoryPage() { - if(allocated) { - FastAllocator<4096>::release(data); - } -} - -uint8_t const* MemoryPage::begin() const { - return data; -} - -uint8_t* MemoryPage::mutate() { - return data; -} - -int MemoryPage::size() const { - return PAGE_BYTES; -} - -const int MemoryPage::PAGE_BYTES = 4096; - -Future> MemoryPagerSnapshot::getPhysicalPage(LogicalPageID pageID, bool cacheable) { - return pager->getPage(pageID, version); -} - -MemoryPager::MemoryPager() : latestVersion(0), committedVersion(0) { - extendData(); - pageTable.resize(SERVER_KNOBS->PAGER_RESERVED_PAGES); -} - -Reference MemoryPager::newPageBuffer() { - return Reference(new MemoryPage()); -} - -int MemoryPager::getUsablePageSize() { - return MemoryPage::PAGE_BYTES; -} - -Reference 
MemoryPager::getReadSnapshot(Version version) { - ASSERT(version <= latestVersion); - return Reference(new MemoryPagerSnapshot(this, version)); -} - -LogicalPageID MemoryPager::allocateLogicalPage() { - ASSERT(pageTable.size() >= SERVER_KNOBS->PAGER_RESERVED_PAGES); - pageTable.push_back(PageVersionMap()); - return pageTable.size() - 1; -} - -void MemoryPager::freeLogicalPage(LogicalPageID pageID, Version version) { - ASSERT(pageID < pageTable.size()); - - PageVersionMap &pageVersionMap = pageTable[pageID]; - ASSERT(!pageVersionMap.empty()); - - auto itr = std::lower_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](std::pair p, Version v) { - return p.first < v; - }); - - pageVersionMap.erase(itr, pageVersionMap.end()); - if(pageVersionMap.size() > 0 && pageVersionMap.back().second != INVALID_PAGE) { - pageVersionMap.push_back(std::make_pair(version, INVALID_PAGE)); - } -} - -void MemoryPager::writePage(LogicalPageID pageID, Reference contents, Version updateVersion, LogicalPageID referencePageID) { - ASSERT(updateVersion > latestVersion || updateVersion == 0); - ASSERT(pageID < pageTable.size()); - - if(referencePageID != invalidLogicalPageID) { - PageVersionMap &rpv = pageTable[referencePageID]; - ASSERT(!rpv.empty()); - updateVersion = rpv.back().first; - } - - PageVersionMap &pageVersionMap = pageTable[pageID]; - - ASSERT(updateVersion >= committedVersion || updateVersion == 0); - PhysicalPageID physicalPageID = allocatePage(contents); - - ASSERT(pageVersionMap.empty() || pageVersionMap.back().second != INVALID_PAGE); - - if(updateVersion == 0) { - ASSERT(pageVersionMap.size()); - updateVersion = pageVersionMap.back().first; - pageVersionMap.back().second = physicalPageID; - // TODO: what to do with old page? 
- } - else { - ASSERT(pageVersionMap.empty() || pageVersionMap.back().first < updateVersion); - pageVersionMap.push_back(std::make_pair(updateVersion, physicalPageID)); - } - -} - -void MemoryPager::forgetVersions(Version begin, Version end) { - ASSERT(begin <= end); - ASSERT(end <= latestVersion); - // TODO -} - -Future MemoryPager::commit() { - ASSERT(committedVersion < latestVersion); - committedVersion = latestVersion; - return Void(); -} - -void MemoryPager::setLatestVersion(Version version) { - ASSERT(version > latestVersion); - latestVersion = version; -} - -Future MemoryPager::getLatestVersion() { - return latestVersion; -} - -Reference MemoryPager::getPage(LogicalPageID pageID, Version version) { - ASSERT(pageID < pageTable.size()); - PageVersionMap const& pageVersionMap = pageTable[pageID]; - - auto itr = std::upper_bound(pageVersionMap.begin(), pageVersionMap.end(), version, [](Version v, std::pair p) { - return v < p.first; - }); - - if(itr == pageVersionMap.begin()) { - return Reference(); // TODO: should this be an error? - } - - --itr; - - ASSERT(itr->second != INVALID_PAGE); - return Reference(new MemoryPage(itr->second)); // TODO: Page memory owned by the pager. Change this? 
-} - -Future MemoryPager::getError() { - return Void(); -} - -Future MemoryPager::onClosed() { - return closed.getFuture(); -} - -void MemoryPager::dispose() { - closed.send(Void()); - delete this; -} - -void MemoryPager::close() { - dispose(); -} - -PhysicalPageID MemoryPager::allocatePage(Reference contents) { - if(freeList.size()) { - PhysicalPageID pageID = freeList.back(); - freeList.pop_back(); - - memcpy(pageID, contents->begin(), contents->size()); - return pageID; - } - else { - ASSERT(data.size() && data.back().capacity() - data.back().size() >= contents->size()); - PhysicalPageID pageID = data.back().end(); - - data.back().append(data.arena(), contents->begin(), contents->size()); - if(data.back().size() == data.back().capacity()) { - extendData(); - } - else { - ASSERT(data.back().size() <= data.back().capacity() - 4096); - } - - return pageID; - } -} - -void MemoryPager::extendData() { - if(data.size() > 1000) { // TODO: is this an ok way to handle large data size? - throw io_error(); - } - - VectorRef d; - d.reserve(data.arena(), 1 << 22); - data.push_back(data.arena(), d); -} - -// TODO: these tests are not MemoryPager specific, we should make them more general - -void fillPage(Reference page, LogicalPageID pageID, Version version) { - ASSERT(page->size() > sizeof(LogicalPageID) + sizeof(Version)); - - memset(page->mutate(), 0, page->size()); - memcpy(page->mutate(), (void*)&pageID, sizeof(LogicalPageID)); - memcpy(page->mutate() + sizeof(LogicalPageID), (void*)&version, sizeof(Version)); -} - -bool validatePage(Reference page, LogicalPageID pageID, Version version) { - bool valid = true; - - LogicalPageID readPageID = *(LogicalPageID*)page->begin(); - if(readPageID != pageID) { - fprintf(stderr, "Invalid PageID detected: %u (expected %u)\n", readPageID, pageID); - valid = false; - } - - Version readVersion = *(Version*)(page->begin()+sizeof(LogicalPageID)); - if(readVersion != version) { - fprintf(stderr, "Invalid Version detected on page %u: %" 
PRId64 "(expected %" PRId64 ")\n", pageID, readVersion, version); - valid = false; - } - - return valid; -} - -void writePage(IPager *pager, Reference page, LogicalPageID pageID, Version version, bool updateVersion=true) { - fillPage(page, pageID, version); - pager->writePage(pageID, page, updateVersion ? version : 0); -} - -ACTOR Future commit(IPager *pager) { - static int commitNum = 1; - state int myCommit = commitNum++; - - debug_printf("Commit%d\n", myCommit); - wait(pager->commit()); - debug_printf("FinishedCommit%d\n", myCommit); - return Void(); -} - -ACTOR Future read(IPager *pager, LogicalPageID pageID, Version version, Version expectedVersion=-1) { - static int readNum = 1; - state int myRead = readNum++; - state Reference readSnapshot = pager->getReadSnapshot(version); - debug_printf("Read%d\n", myRead); - Reference readPage = wait(readSnapshot->getPhysicalPage(pageID, true)); - debug_printf("FinishedRead%d\n", myRead); - ASSERT(validatePage(readPage, pageID, expectedVersion >= 0 ? 
expectedVersion : version)); - return Void(); -} - -ACTOR Future simplePagerTest(IPager *pager) { - state Reference page = pager->newPageBuffer(); - - Version latestVersion = wait(pager->getLatestVersion()); - debug_printf("Got latest version: %lld\n", latestVersion); - - state Version version = latestVersion+1; - state Version v1 = version; - - state LogicalPageID pageID1 = pager->allocateLogicalPage(); - - writePage(pager, page, pageID1, v1); - pager->setLatestVersion(v1); - wait(commit(pager)); - - state LogicalPageID pageID2 = pager->allocateLogicalPage(); - - state Version v2 = ++version; - - writePage(pager, page, pageID1, v2); - writePage(pager, page, pageID2, v2); - pager->setLatestVersion(v2); - wait(commit(pager)); - - wait(read(pager, pageID1, v2)); - wait(read(pager, pageID1, v1)); - - state Version v3 = ++version; - writePage(pager, page, pageID1, v3, false); - pager->setLatestVersion(v3); - - wait(read(pager, pageID1, v2, v3)); - wait(read(pager, pageID1, v3, v3)); - - state LogicalPageID pageID3 = pager->allocateLogicalPage(); - - state Version v4 = ++version; - writePage(pager, page, pageID2, v4); - writePage(pager, page, pageID3, v4); - pager->setLatestVersion(v4); - wait(commit(pager)); - - wait(read(pager, pageID2, v4, v4)); - - state Version v5 = ++version; - writePage(pager, page, pageID2, v5); - - state LogicalPageID pageID4 = pager->allocateLogicalPage(); - writePage(pager, page, pageID4, v5); - - state Version v6 = ++version; - pager->freeLogicalPage(pageID2, v5); - pager->freeLogicalPage(pageID3, v3); - pager->setLatestVersion(v6); - wait(commit(pager)); - - pager->forgetVersions(0, v4); - wait(commit(pager)); - - wait(delay(3.0)); - - wait(commit(pager)); - - return Void(); -} - -/* -TEST_CASE("/fdbserver/memorypager/simple") { - state IPager *pager = new MemoryPager(); - - wait(simplePagerTest(pager)); - - Future closedFuture = pager->onClosed(); - pager->dispose(); - - wait(closedFuture); - return Void(); -} -*/ - -const PhysicalPageID 
MemoryPager::INVALID_PAGE = nullptr; diff --git a/fdbserver/MemoryPager.h b/fdbserver/MemoryPager.h deleted file mode 100644 index 359c443de7..0000000000 --- a/fdbserver/MemoryPager.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * MemoryPager.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef FDBSERVER_MEMORYPAGER_H -#define FDBSERVER_MEMORYPAGER_H -#pragma once - -#include "fdbserver/IPager.h" - -IPager * createMemoryPager(); - -#endif \ No newline at end of file diff --git a/fdbserver/PrefixTree.h b/fdbserver/PrefixTree.h deleted file mode 100644 index 2f67c20ccd..0000000000 --- a/fdbserver/PrefixTree.h +++ /dev/null @@ -1,1049 +0,0 @@ -/* - * PrefixTree.h - * - * This source file is part of the FoundationDB open source project - * - * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "flow/flow.h" -#include "flow/Arena.h" -#include "fdbclient/FDBTypes.h" -#include "fdbserver/Knobs.h" -#include - -typedef uint64_t Word; -static inline int commonPrefixLength(uint8_t const* ap, uint8_t const* bp, int cl) { - int i = 0; - const int wordEnd = cl - sizeof(Word) + 1; - - for(; i < wordEnd; i += sizeof(Word)) { - Word a = *(Word *)ap; - Word b = *(Word *)bp; - if(a != b) { - return i + ctzll(a ^ b) / 8; - } - ap += sizeof(Word); - bp += sizeof(Word); - } - - for (; i < cl; i++) { - if (*ap != *bp) { - return i; - } - ++ap; - ++bp; - } - return cl; -} - -static int commonPrefixLength(StringRef a, StringRef b) { - return commonPrefixLength(a.begin(), b.begin(), std::min(a.size(), b.size())); -} - -// This appears to be the fastest version -static int lessOrEqualPowerOfTwo(int n) { - int p; - for (p = 1; p+p <= n; p+=p); - return p; -} - -/* -static int _lessOrEqualPowerOfTwo(uint32_t n) { - if(n == 0) - return n; - int trailing = __builtin_ctz(n); - int leading = __builtin_clz(n); - if(trailing + leading == ((sizeof(n) * 8) - 1)) - return n; - return 1 << ( (sizeof(n) * 8) - leading - 1); -} - -static int __lessOrEqualPowerOfTwo(unsigned int n) { - int p = 1; - for(; p <= n; p <<= 1); - return p >> 1; -} -*/ - -static int perfectSubtreeSplitPoint(int subtree_size) { - // return the inorder index of the root node in a subtree of the given size - // consistent with the resulting binary search tree being "perfect" (having minimal height - // and all missing nodes as far right as possible). - // There has to be a simpler way to do this. 
- int s = lessOrEqualPowerOfTwo((subtree_size - 1) / 2 + 1) - 1; - return std::min(s * 2 + 1, subtree_size - s - 1); -} - -static int perfectSubtreeSplitPointCached(int subtree_size) { - static uint16_t *points = nullptr; - static const int max = 500; - if(points == nullptr) { - points = new uint16_t[max]; - for(int i = 0; i < max; ++i) - points[i] = perfectSubtreeSplitPoint(i); - } - - if(subtree_size < max) - return points[subtree_size]; - return perfectSubtreeSplitPoint(subtree_size); -} - -struct PrefixTree { - // TODO: Make PrefixTree use a more complex record type with a multi column key - typedef KeyValueRef EntryRef; - typedef Standalone Entry; - - static int MaximumTreeSize() { - return std::numeric_limits::max(); - }; - - struct Node { - uint8_t flags; - -/* - * Node fields - * - * Logically, a node has the following things - * - Flags describing what is in the node - * - Optional left child - * - Optional right child - * - Prefix string, described by a length and a source (which is the most recent left or right ancestor) - * - Optional split string, which contains any bytes after prefix which are needed to make a branching decision - * - Optional suffix string, containing any remaining key bytes after the split string - * - Optional value string - * - * The physical layout places the left child subtree immediately after the split string so that it is likely - * that the bytes read to make a branching decision and then choosing left (as should happen half of the time) - * will have a high cache hit rate. - * - * If necessary, the flags byte could be an enumeration into a set of possible options, since not all options - * combinations are needed. For example, - * - * - The tree is balanced and filled from the left at the last level, so a node cannot have only a right child. - * - If there are no children, there is no point in splitting any key bytes after the prefix into separate strings. 
- * - If there is exactly one child (left) then the key bytes after the prefix can all go in the split string. The - * traversal decision is to either stop or go left and one of those options (stop) will still have good memory - * locality. - * - * 8 valid/necessary option combinations for presense of (Left, Right, Split, Suffix) out of 16 possibilities - * - * L R Split Suffix - * - * N N N N # No children, key has no bytes after prefix - * N N Y N # No children, key has bytes after prefix - * Y N N N # One child, key has no bytes after prefix - * Y N Y N # One child, key has bytes after prefix - * Y Y N N # Two children, key has no bytes after prefix - * Y Y N Y # Two children, branch decision can be made using only prefix bytes but there are more key bytes after - * Y Y Y N # Two children, branch decision requires all key bytes after prefix - * Y Y Y Y # Two children, branch decision requires some but not all bytes after prefix - * - * This can be represent with just 3 bits, if necessary, but for now there is space in the flags byte for all 4. 
- * - * Flag Bits - * - * prefix borrow from next - * true - borrow from the closest ancestor greater than this node - * false - borrow from the closest ancestor less than this node - * large lengths = use 2 byte ints instead of 1 byte for prefix, split, suffix, and value lengths - * (TODO: It might be better to just not use a suffix at all when large is lengths is set) - * left child present - * right child present - * split string present - * suffix string present - * value string present - * - * Serialized format: - * All lengths are in the header, which has variable size - * - * flags 1 byte - * prefix length 1-2 bytes based on large lengths flag - * split length 0-2 bytes based on split string present flag - * suffix length 0-2 bytes based on suffix string present and large lengths flags - * value length 0-1 bytes based on value string present and large lengths flag - * left length 0 or 2 bytes depending on left child present - * split 0+ bytes - * left child 0+ bytes - * suffix 0+ bytes - * value 0+ bytes - * right child 0+ bytes - * - */ - enum EFlags { - USE_LARGE_LENGTHS = 1 << 0, - PREFIX_SOURCE_NEXT = 1 << 1, - HAS_LEFT_CHILD = 1 << 2, - HAS_RIGHT_CHILD = 1 << 3, - HAS_SPLIT = 1 << 4, - HAS_SUFFIX = 1 << 5, - HAS_VALUE = 1 << 6 - }; - - // Stores decoded offsets (from beginning) of Node components - struct Parser { - Parser() {} - Parser(const Node *n) { - init(n); - } - - const Node *node; - - typedef uint16_t OffsetT; - OffsetT headerLen; - OffsetT prefixLen; - OffsetT leftPos; - OffsetT suffixPos; - OffsetT valuePos; - OffsetT rightPos; - - StringRef splitString() const { - return StringRef((const uint8_t *)node + headerLen, leftPos); - } - StringRef suffixString() const { - return StringRef((const uint8_t *)node + headerLen + suffixPos, valuePos - suffixPos); - } - StringRef valueString() const { - return StringRef((const uint8_t *)node + headerLen + valuePos, rightPos - valuePos); - } - const Node *leftChild() const { - if(node->flags & 
HAS_LEFT_CHILD) - return (const Node *)((const uint8_t *)node + headerLen + leftPos); - return nullptr; - } - const Node *rightChild() const { - if(node->flags & HAS_RIGHT_CHILD) - return (const Node *)((const uint8_t *)node + headerLen + rightPos); - return nullptr; - } - int keyLen() const { - int len = prefixLen + leftPos + (valuePos - suffixPos); - ASSERT(len >= 0); - return len; - } - - void init(const Node *n) { - node = n; - union { - const uint8_t *p8; - const uint16_t *p16; - }; - p8 = (const uint8_t *)&n->flags + 1; - - int flags = n->flags; - bool large = flags & USE_LARGE_LENGTHS; - - prefixLen = large ? *p16++ : *p8++; - - if(flags & HAS_SPLIT) - leftPos = large ? *p16++ : *p8++; - else - leftPos = 0; - suffixPos = leftPos; - if(flags & HAS_LEFT_CHILD) - suffixPos += *p16++; - - valuePos = suffixPos; - if(flags & HAS_SUFFIX) - valuePos += (large ? *p16++ : *p8++); - - rightPos = valuePos; - if(flags & HAS_VALUE) - rightPos += (large ? *p16++ : *p8++); - - int header = 2; // flags byte, first prefix len byte - if(large) - ++header; // second prefix len byte - if(flags & HAS_SPLIT) - header += large ? 2 : 1; - if(flags & HAS_LEFT_CHILD) - header += 2; - if(flags & HAS_SUFFIX) - header += large ? 2 : 1; - if(flags & HAS_VALUE) - header += large ? 2 : 1; - headerLen = header; - } - }; - - static inline int getMaxOverhead(int index, int keySize, int valueSize) { - bool large = keySize > 255 || valueSize > 255; - int overhead = 1 + (large ? 2 : 1); // flags and prefix len - // Value length size if present - if(valueSize > 0) - overhead += large ? 2 : 1; - overhead += large ? 6 : 3; // Worst case scenario for value, split and suffix lengths - if((index & 0x01) != 0) - overhead += 2; // Left child length, one less than half of nodes will have one. 
- return overhead; - } - - public: - - // Methods for decoding specific Node members on-demand - inline int getPrefixLen() const { - return Parser(this).prefixLen; - } - - inline StringRef getSplitString() const { - return Parser(this).splitString(); - } - - inline StringRef getSuffixString() const { - return Parser(this).suffixString(); - } - - inline StringRef getValueString() const { - return Parser(this).valueString(); - } - - inline const Node * getLeftChild() const { - return Parser(this).leftChild(); - } - - inline const Node * getRightChild() const { - return Parser(this).rightChild(); - } - - inline int getKeySize() const { - return Parser(this).keyLen(); - } - }; - -#pragma pack(push,1) - uint16_t size; // size in bytes - Node root; -#pragma pack(pop) - - static inline int GetHeaderSize() { - return sizeof(PrefixTree) - sizeof(root); - } - -private: - struct PathEntry { - const Node *node; - Node::Parser parser; - - // Key may or may not point to the space within keyBuffer. - // Key will always contain at least the prefix bytes borrowed by node - // KeyBuffer will always be large enough to hold the entire reconstituted key for node - // - // These are mutable because getting key bytes from this PathEntry can change these - // but they're really just a read cache for reconstituted key bytes. - mutable StringRef key; - mutable Standalone> keyBuffer; - - // Path entry was reached by going left from the previous node - bool nodeIsLeftChild; - // number of consecutive moves in same direction - int moves; - - PathEntry() : node(nullptr) { - } - PathEntry(const PathEntry &rhs) { - *this = rhs; - } - - // Initialize the key byte buffer to hold bytes of a new node. Use a new arena - // if the old arena is being held by any users. 
- void initKeyBufferSpace() { - if(node != nullptr) { - int size = parser.keyLen(); - if(keyBuffer.arena().impl && !keyBuffer.arena().impl->isSoleOwnerUnsafe()) { - keyBuffer = Standalone>(); - } - keyBuffer.reserve(keyBuffer.arena(), size); - } - } - - PathEntry & operator= (const PathEntry &rhs) { - node = rhs.node; - parser = rhs.parser; - nodeIsLeftChild = rhs.nodeIsLeftChild; - moves = rhs.moves; - // New key buffer must be able to hold full reconstituted key, not just the - // part of it referenced by rhs.key (which may not be the whole thing) - initKeyBufferSpace(); - if(node != nullptr && rhs.key.size() > 0) { - // Copy rhs.key into keyBuffer and set key to the destination bytes - memcpy(keyBuffer.begin(), rhs.key.begin(), rhs.key.size()); - key = StringRef(keyBuffer.begin(), rhs.key.size()); - } - else { - key = rhs.key; - } - return *this; - } - - void init(StringRef s) { - node = nullptr; - key = s; - } - - void init(const Node *_node, const PathEntry *prefixSource, bool isLeft, int numMoves) { - node = _node; - parser.init(node); - nodeIsLeftChild = isLeft; - moves = numMoves; - - // keyBuffer will be large enough to hold the full reconstituted key but initially - // key will be a reference returned from prefixSource->getKeyRef() - // See comments near keyBuffer and key for more info. - initKeyBufferSpace(); - key = prefixSource->getKeyRef(parser.prefixLen); - } - - inline bool valid() const { - return node != nullptr; - } - - int compareToKey(StringRef s) const { - // Key has at least this node's borrowed prefix bytes in it. 
- // If s is shorter than key, we only need to compare it to key - if(s.size() < key.size()) - return s.compare(key); - - int cmp = s.substr(0, key.size()).compare(key); - if(cmp != 0) - return cmp; - - // The borrowed prefix bytes and possibly more have already been compared and were equal - int comparedLen = key.size(); - s = s.substr(comparedLen); - StringRef split = parser.splitString(); - int splitSizeOriginal = split.size(); - int splitStart = comparedLen - parser.prefixLen; - if(splitStart < split.size()) { - split = split.substr(splitStart); - if(s.size() < split.size()) - return s.compare(split); - cmp = s.substr(0, split.size()).compare(split); - if(cmp != 0) - return cmp; - s = s.substr(split.size()); - comparedLen += split.size(); - } - - int suffixStart = comparedLen - (parser.prefixLen + splitSizeOriginal); - StringRef suffix = parser.suffixString(); - ASSERT(suffixStart >= 0 && suffixStart <= suffix.size()); - return s.compare(suffix.substr(suffixStart)); - } - - // Make sure that key refers to bytes in keyBuffer, copying if necessary - void ensureKeyInBuffer() const { - if(key.begin() != keyBuffer.begin()) { - memcpy(keyBuffer.begin(), key.begin(), key.size()); - key = StringRef(keyBuffer.begin(), key.size()); - } - } - - // Get the borrowed prefix string. Key must contain all of those bytes but it could contain more. - StringRef getPrefix() const { - if(node == nullptr) - return key; - return key.substr(0, parser.prefixLen); - } - - // Return a reference to the first size bytes of the key. - // - // If size <= key's size then a substring of key will be returned, but if alwaysUseKeyBuffer - // is true then before returning the existing value of key (not just the first size bytes) - // will be copied into keyBuffer and key will be updated to point there. 
- // - // If size is greater than key's size, then key will be moved into keyBuffer if it is not already there - // and the remaining needed bytes will be copied into keyBuffer from the split and suffix strings. - KeyRef getKeyRef(int size = -1, bool alwaysUseKeyBuffer = false) const { - if(size < 0) - size = parser.keyLen(); - - // If size is less than key then return a substring of it, possibly after moving it to the keyBuffer. - if(size <= key.size()) { - if(alwaysUseKeyBuffer) - ensureKeyInBuffer(); - return key.substr(0, size); - } - - ASSERT(node != nullptr); - ensureKeyInBuffer(); - - // The borrowed prefix bytes and possibly more must already be in key - int writtenLen = key.size(); - StringRef split = parser.splitString(); - StringRef suffix = parser.suffixString(); - int splitStart = writtenLen - parser.prefixLen; - if(splitStart < split.size()) { - int splitLen = std::min(split.size() - splitStart, size - writtenLen); - memcpy(mutateString(key) + writtenLen, split.begin() + splitStart, splitLen); - writtenLen += splitLen; - } - int suffixStart = writtenLen - parser.prefixLen - split.size(); - if(suffixStart < suffix.size()) { - int suffixLen = std::min(suffix.size() - suffixStart, size - writtenLen); - memcpy(mutateString(key) + writtenLen, suffix.begin() + suffixStart, suffixLen); - writtenLen += suffixLen; - } - ASSERT(writtenLen == size); - key = StringRef(key.begin(), size); - return key; - } - - // Return keyRef(size) and the arena that keyBuffer resides in. - Key getKey(int size = -1) const { - StringRef k = getKeyRef(size, true); - return Key(k, keyBuffer.arena()); - } - }; - -public: - // Cursor provides a way to seek into a PrefixTree and iterate over its content - // Seek and move methods can return false can return false if they fail to achieve the desired effect - // but a cursor will remain 'valid' as long as the tree is not empty. 
- // - // It coalesces prefix bytes into a contiguous buffer for each node along the traversal - // path to make iteration faster. - struct Cursor { - Cursor() : pathLen(0) { - } - - Cursor(const Node *root, StringRef prevAncestor, StringRef nextAncestor) { - init(root, prevAncestor, nextAncestor); - } - - static const int initialPathLen = 3; - static const int initialPathCapacity = 20; - // This is a separate function so that Cursors can be reused to search different PrefixTrees - // which avoids cursor destruction and creation which involves unnecessary memory churn. - // The root node is arbitrarily assumed to be a right child of prevAncestor which itself is a left child of nextAncestor - void init(const Node *root, StringRef prevAncestor, StringRef nextAncestor) { - if(path.size() < initialPathCapacity) - path.resize(initialPathCapacity); - pathLen = initialPathLen; - path[0].init(nextAncestor); - path[1].init(prevAncestor); - path[2].init(root, &path[root->flags & Node::PREFIX_SOURCE_NEXT ? 0 : 1], false, 1); - } - - bool operator == (const Cursor &rhs) const { - return pathBack().node == rhs.pathBack().node; - } - - StringRef leftParentBoundary; - StringRef rightParentBoundary; - std::vector path; - // pathLen is the number of elements in path which are in use. This is to prevent constantly destroying - // and constructing PathEntry objects which would unnecessarily churn through memory in Arena for storing - // coalesced prefixes. - int pathLen; - - bool valid() const { - return pathLen != 0 && pathBack().valid(); - } - - // Get a reference to the current key which is valid until the Cursor is moved. - KeyRef getKeyRef() const { - return pathBack().getKeyRef(); - } - - // Get a Standalone for the current key which will still be valid after the Cursor is moved. - Key getKey() const { - return pathBack().getKey(); - } - - // Get a reference to the current value which is valid as long as the Cursor's page memory exists. 
- ValueRef getValueRef() const { - return pathBack().parser.valueString(); - } - - // Get a key/value reference that is valid until the Cursor is moved. - EntryRef getKVRef() const { - return EntryRef(getKeyRef(), getValueRef()); - } - - // Returns a standalone EntryRef where both key and value exist in the standalone's arena, - // unless copyValue is false in which case the value will be a reference into tree memory. - Entry getKV(bool copyValue = true) const { - Key k = getKey(); - ValueRef v = getValueRef(); - if(copyValue) - v = ValueRef(k.arena(), getValueRef()); - return Entry(EntryRef(k, v), k.arena()); - } - - // Moves the cursor to the node with the greatest key less than or equal to s. If successful, - // returns true, otherwise returns false and the cursor will be at the node with the next key - // greater than s. - bool seekLessThanOrEqual(StringRef s) { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - // TODO: Track position of difference and use prefix reuse bytes and prefix sources - // to skip comparison of some prefix bytes when possible - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - _mm_prefetch((const char*)right, _MM_HINT_T0); - - int cmp = p.compareToKey(s); - if(cmp == 0) - return true; - - if(cmp < 0) { - // Try to traverse left - const Node *left = p.parser.leftChild(); - if(left == nullptr) { - // If we're at the root, cursor should now be before the first element - if(pathLen == initialPathLen) { - return false; - } - - if(p.nodeIsLeftChild) { - // If we only went left, cursor should now be before the first element - if((p.moves + initialPathLen) == pathLen) { - return false; - } - - // Otherwise, go to the parent of the last right child traversed, - // which is the last node from which we went right - popPath(p.moves + 1); - return true; - } - - // p.directionLeft is false, so p.node is a right child, so go to its parent. 
- popPath(1); - return true; - } - - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - else { - // Try to traverse right - if(right == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - } - } - - inline const PathEntry &pathBack() const { - return path[pathLen - 1]; - } - - inline PathEntry &pathBack() { - return path[pathLen - 1]; - } - - inline void pushPath(const Node *node, const PathEntry *borrowSource, bool left, int moves) { - ++pathLen; - if(path.size() < pathLen) { - path.resize(pathLen); - } - pathBack().init(node, borrowSource, left, moves); - } - - inline void popPath(int n) { - pathLen -= n; - } - - std::string pathToString() const { - std::string s; - for(int i = 0; i < pathLen; ++i) { - s += format("(%d: ", i); - const Node *node = path[i].node; - if(node != nullptr) { - s += "childDir="; - s += (path[i].nodeIsLeftChild ? "left " : "right "); - } - s += format("prefix='%s'", path[i].getPrefix().toHexString(20).c_str()); - if(node != nullptr) { - s += format(" split='%s' suffix='%s' value='%s'", node->getSplitString().toHexString(20).c_str(), node->getSuffixString().toHexString(20).c_str(), node->getValueString().toHexString(20).c_str()); - } - else - s += ") "; - } - return s; - } - - bool moveFirst() { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - while(1) { - const PathEntry &p = pathBack(); - const Node *left = p.parser.leftChild(); - - if(left == nullptr) - break; - - // TODO: This can be simpler since it only goes left - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? 
&p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - - return true; - } - - bool moveLast() { - if(pathLen == 0) - return false; - - pathLen = initialPathLen; - - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - - if(right == nullptr) - break; - - // TODO: This can be simpler since it only goes right - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - - return true; - } - - bool moveNext() { - const PathEntry &p = pathBack(); - - // If p isn't valid - if(!p.valid()) { - return false; - } - - const Node *right = p.parser.rightChild(); - - // If we can't go right, then go upward to the parent of the last left child - if(right == nullptr) { - // If current node was a left child then pop one node and we're done - if(p.nodeIsLeftChild) { - popPath(1); - return true; - } - - // Current node is a right child. - // If we are at the rightmost tree node return false and don't move. - if(p.moves + initialPathLen - 1 == pathLen) { - return false; - } - - // Truncate path to the parent of the last left child - popPath(p.moves + 1); - return true; - } - - // Go right - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - - // Go left as far as possible - while(1) { - const PathEntry &p = pathBack(); - const Node *left = p.parser.leftChild(); - if(left == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? 
&p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - } - } - - bool movePrev() { - const PathEntry &p = pathBack(); - - // If p isn't valid - if(!p.valid()) { - return false; - } - - const Node *left = p.parser.leftChild(); - - // If we can't go left, then go upward to the parent of the last right child - if(left == nullptr) { - // If current node was a right child - if(!p.nodeIsLeftChild) { - // If we are at the root then don't move and return false. - if(pathLen == initialPathLen) - return false; - - // Otherwise, pop one node from the path and return true. - popPath(1); - return true; - } - - // Current node is a left child. - // If we are at the leftmost tree node then return false and don't move. - if(p.moves + 3 == pathLen) { - return false; - } - - // Truncate path to the parent of the last right child - popPath(p.moves + 1); - return true; - } - - // Go left - int newMoves = p.nodeIsLeftChild ? p.moves + 1 : 1; - const PathEntry *borrowSource = (left->flags & Node::PREFIX_SOURCE_NEXT) ? &p : &p - newMoves; - pushPath(left, borrowSource, true, newMoves); - - // Go right as far as possible - while(1) { - const PathEntry &p = pathBack(); - const Node *right = p.parser.rightChild(); - if(right == nullptr) { - return true; - } - - int newMoves = p.nodeIsLeftChild ? 1 : p.moves + 1; - const PathEntry *borrowSource = (right->flags & Node::PREFIX_SOURCE_NEXT) ? &p - newMoves : &p; - pushPath(right, borrowSource, false, newMoves); - } - } - - }; - - Cursor getCursor(StringRef prevAncestor, StringRef nextAncestor) const { - return (size != 0) ? 
Cursor(&root, prevAncestor, nextAncestor) : Cursor(); - } - - static std::string escapeForDOT(StringRef s) { - std::string r = "\""; - for(char c : s) { - if(c == '\n') - r += "\\n"; - else if(isprint(c) && c != '"') - r += c; - else - r += format("{%02X}", c); - } - return r + '"'; - } - - std::string toDOT(StringRef prevAncestor, StringRef nextAncestor) const { - auto c = getCursor(prevAncestor, nextAncestor); - c.moveFirst(); - - std::string r; - r += format("digraph PrefixTree%p {\n", this); - - do { - const PathEntry &p = c.pathBack(); - const Node *n = p.node; - const Node *left = p.parser.leftChild(); - const Node *right = p.parser.rightChild(); - - std::string label = escapeForDOT(format("PrefixSource: %s\nPrefix: [%s]\nSplit: %s\nSuffix: %s", - n->flags & Node::PREFIX_SOURCE_NEXT ? "Left" : "Right", - p.getPrefix().toString().c_str(), - p.parser.splitString().toString().c_str(), - p.parser.suffixString().toString().c_str() - )); - - r += format("node%p [ label = %s ];\nnode%p -> { %s %s };\n", n, label.c_str(), n, - left ? format("node%p", left).c_str() : "", - right ? 
format("node%p", right).c_str() : "" - ); - - } while(c.moveNext()); - - r += "}\n"; - - return r; - } - - // Returns number of bytes written - int build(const EntryRef *begin, const EntryRef *end, StringRef prevAncestor, StringRef nextAncestor) { - // The boundary leading to the new page acts as the last time we branched right - if(begin == end) { - size = 0; - } - else { - size = sizeof(size) + build(root, begin, end, nextAncestor, prevAncestor); - } - ASSERT(size <= MaximumTreeSize()); - return size; - } - -private: - static uint16_t build(Node &root, const EntryRef *begin, const EntryRef *end, const StringRef &nextAncestor, const StringRef &prevAncestor) { - ASSERT(end != begin); - - int count = end - begin; - - // Find key to be stored in root - int mid = perfectSubtreeSplitPointCached(count); - const StringRef &key = begin[mid].key; - const StringRef &val = begin[mid].value; - - // Since key must be between lastLeft and lastRight, any common prefix they share must be shared by key - // so rather than comparing all of key to each one separately we can just compare lastLeft and lastRight - // to each other and then skip over the resulting length in key - int nextPrevCommon = commonPrefixLength(nextAncestor.begin(), prevAncestor.begin(), std::min(nextAncestor.size(), prevAncestor.size())); - - // Pointer to remainder of key after the left/right common bytes - const uint8_t *keyExt = key.begin() + nextPrevCommon; - - // Find out how many bytes beyond leftRightCommon key has with each last left/right string separately - int extNext = commonPrefixLength(keyExt, nextAncestor.begin() + nextPrevCommon, std::min(key.size(), nextAncestor.size()) - nextPrevCommon); - int extPrev = commonPrefixLength(keyExt, prevAncestor.begin() + nextPrevCommon, std::min(key.size(), prevAncestor.size()) - nextPrevCommon); - - // Use the longer result - bool prefixSourceNext = extNext > extPrev; - - int prefixLen = nextPrevCommon + (prefixSourceNext ? 
extNext : extPrev); - - int splitLen; // Bytes after prefix required to make traversal decision - int suffixLen; // Remainder of key bytes after split key portion - - //printf("build: '%s'\n prefixLen %d prefixSourceNext %d\n", key.toHexString(20).c_str(), prefixLen, prefixSourceNext); - - // 2 entries or less means no right child, so just put all remaining key bytes into split string. - if(count < 3) { - splitLen = key.size() - prefixLen; - suffixLen = 0; - } - else { - // There are 2 children - // Avoid using the suffix at all if the remainder is small enough. - splitLen = key.size() - prefixLen; - if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_LIMIT) { - suffixLen = 0; - } - else { - // Remainder of the key was not small enough to put entirely before the left child, so find the actual required to make the branch decision - const StringRef &prevKey = begin[mid - 1].key; - splitLen = commonPrefixLength(key.begin(), prevKey.begin(), std::min(key.size(), prevKey.size())) + 1 - prefixLen; - - // Put at least the minimum immediate byte count in the split key (before the left child) - if(splitLen < SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN) - splitLen = std::min(key.size() - prefixLen, SERVER_KNOBS->PREFIX_TREE_IMMEDIATE_KEY_SIZE_MIN); - - suffixLen = key.size() - splitLen - prefixLen; - } - } - - // We now know enough about the fields present and their lengths to set the flag bits and write a header - // If any int is more than 8 bits then use large ints - bool large = prefixLen > 255 || splitLen > 255 || suffixLen > 255 || val.size() > 255; - root.flags = large ? 
Node::USE_LARGE_LENGTHS : 0; - - if(prefixSourceNext) - root.flags |= Node::PREFIX_SOURCE_NEXT; - - union { - uint8_t *p8; - uint16_t *p16; - }; - p8 = &root.flags + 1; - - if(large) - *p16++ = prefixLen; - else - *p8++ = prefixLen; - - if(splitLen > 0) { - root.flags |= Node::HAS_SPLIT; - if(large) - *p16++ = splitLen; - else - *p8++ = splitLen; - } - - uint16_t *pLeftLen = p16; - if(count > 1) { - ++p16; - } - - if(suffixLen > 0) { - root.flags |= Node::HAS_SUFFIX; - if(large) - *p16++ = suffixLen; - else - *p8++ = suffixLen; - } - - if(val.size() > 0) { - root.flags |= Node::HAS_VALUE; - if(large) - *p16++ = val.size(); - else - *p8++ = val.size(); - } - - // Header is written, now write strings and children in order. - const uint8_t *keyPtr = key.begin() + prefixLen; - - // Serialize split bytes - if(splitLen > 0) { - memcpy(p8, keyPtr, splitLen); - p8 += splitLen; - keyPtr += splitLen; - } - - // Serialize left child - if(count > 1) { - root.flags |= Node::HAS_LEFT_CHILD; - int leftLen = build(*(Node *)(p8), begin, begin + mid, key, prevAncestor); - *pLeftLen = leftLen; - p8 += leftLen; - } - - // Serialize suffix bytes - if(suffixLen > 0) { - memcpy(p8, keyPtr, suffixLen); - p8 += suffixLen; - } - - // Serialize value bytes - if(val.size() > 0) { - memcpy(p8, val.begin(), val.size()); - p8 += val.size(); - } - - // Serialize right child - if(count > 2) { - root.flags |= Node::HAS_RIGHT_CHILD; - int rightLen = build(*(Node *)(p8), begin + mid + 1, end, nextAncestor, key); - p8 += rightLen; - } - -/* -printf("\nBuilt: key '%s' c %d p %d spl %d suf %d\nRaw: %s\n", key.toString().c_str(), count, prefixLen, splitLen, suffixLen, StringRef(&root.flags, p8 - &root.flags).toHexString(20).c_str()); -Node::Parser p(&root); -printf("parser: headerLen %d prefixLen %d leftPos %d rightPos %d split %s suffix %s val %s\n", - p.headerLen, p.prefixLen, p.leftPos, p.rightPos, p.splitString().toString().c_str(), p.suffixString().toString().c_str(), 
p.valueString().toString().c_str()); -*/ - return p8 - (uint8_t *)&root; - } -}; diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 22ca40784e..945cc7c726 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -29,8 +29,6 @@ #include "fdbrpc/IAsyncFile.h" #include "fdbrpc/crc32c.h" #include "flow/ActorCollection.h" -#include "fdbserver/MemoryPager.h" -#include "fdbserver/IndirectShadowPager.h" #include #include #include "fdbclient/CommitTransaction.h" @@ -738,15 +736,22 @@ private: // Future onEvictable() const; // ready when entry can be evicted // indicating if it is safe to evict. template -class ObjectCache { +class ObjectCache : NonCopyable { struct Entry : public boost::intrusive::list_base_hook<> { + Entry() : hits(0) { + } IndexType index; ObjectType item; + int hits; }; public: - ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit) { + ObjectCache(int sizeLimit = 0) : sizeLimit(sizeLimit), cacheHits(0), cacheMisses(0), noHitEvictions(0) { + } + + void setSizeLimit(int n) { + sizeLimit = n; } // Get the object for i if it exists, else return nullptr. @@ -754,6 +759,7 @@ public: ObjectType * getIfExists(const IndexType &index) { auto i = cache.find(index); if(i != cache.end()) { + ++i->second.hits; return &i->second.item; } return nullptr; @@ -761,26 +767,36 @@ public: // Get the object for i or create a new one. // After a get(), the object for i is the last in evictionOrder. 
- ObjectType & get(const IndexType &index) { + ObjectType & get(const IndexType &index, bool noHit = false) { Entry &entry = cache[index]; // If entry is linked into evictionOrder then move it to the back of the order if(entry.is_linked()) { + if(!noHit) { + ++entry.hits; + ++cacheHits; + } // Move the entry to the back of the eviction order evictionOrder.erase(evictionOrder.iterator_to(entry)); evictionOrder.push_back(entry); } else { + ++cacheMisses; // Finish initializing entry entry.index = index; + entry.hits = noHit ? 0 : 1; // Insert the newly created Entry at the back of the eviction order evictionOrder.push_back(entry); // If the cache is too big, try to evict the first Entry in the eviction order if(cache.size() > sizeLimit) { Entry &toEvict = evictionOrder.front(); + debug_printf("Trying to evict %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); // Don't evict the entry that was just added as then we can't return a reference to it. if(toEvict.index != index && toEvict.item.evictable()) { + if(toEvict.hits == 0) { + ++noHitEvictions; + } debug_printf("Evicting %s to make room for %s\n", toString(toEvict.index).c_str(), toString(index).c_str()); evictionOrder.pop_front(); cache.erase(toEvict.index); @@ -827,12 +843,14 @@ public: } private: - int sizeLimit; + int64_t sizeLimit; + int64_t cacheHits; + int64_t cacheMisses; + int64_t noHitEvictions; // TODO: Use boost intrusive unordered set instead, with a comparator that only considers entry.index std::unordered_map cache; boost::intrusive::list evictionOrder; - }; ACTOR template Future forwardError(Future f, Promise target) { @@ -900,7 +918,7 @@ public: // If the file already exists, pageSize might be different than desiredPageSize // Use pageCacheSizeBytes == 0 for default - DWALPager(int desiredPageSize, std::string filename, int pageCacheSizeBytes) + DWALPager(int desiredPageSize, std::string filename, int64_t pageCacheSizeBytes) : desiredPageSize(desiredPageSize), 
filename(filename), pHeader(nullptr), pageCacheBytes(pageCacheSizeBytes) { if(pageCacheBytes == 0) { @@ -919,8 +937,7 @@ public: if(pHeader != nullptr) { pHeader->pageSize = logicalPageSize; } - ASSERT(pageCache.count() == 0); - pageCache = PageCacheT(pageCacheBytes / physicalPageSize); + pageCache.setSizeLimit(pageCacheBytes / physicalPageSize); } void updateCommittedHeader() { @@ -1139,8 +1156,8 @@ public: } void updatePage(LogicalPageID pageID, Reference data) override { - // Get the cache entry for this page - PageCacheEntry &cacheEntry = pageCache.get(pageID); + // Get the cache entry for this page, without counting it as a cache hit as we're replacing its contents now + PageCacheEntry &cacheEntry = pageCache.get(pageID, true); debug_printf("DWALPager(%s) op=write %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); // If the page is still being read then it's not also being written because a write places @@ -1253,7 +1270,7 @@ public: } // Reads the most recent version of pageID either committed or written using updatePage() - Future> readPage(LogicalPageID pageID, bool cacheable) override { + Future> readPage(LogicalPageID pageID, bool cacheable, bool noHit = false) override { // Use cached page if present, without triggering a cache hit. 
// Otherwise, read the page and return it but don't add it to the cache if(!cacheable) { @@ -1268,8 +1285,8 @@ public: return forwardError(readPhysicalPage(this, (PhysicalPageID)pageID), errorPromise); } - PageCacheEntry &cacheEntry = pageCache.get(pageID); - debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing()); + PageCacheEntry &cacheEntry = pageCache.get(pageID, noHit); + debug_printf("DWALPager(%s) op=read %s cached=%d reading=%d writing=%d noHit=%d\n", filename.c_str(), toString(pageID).c_str(), cacheEntry.initialized(), cacheEntry.initialized() && cacheEntry.reading(), cacheEntry.initialized() && cacheEntry.writing(), noHit); if(!cacheEntry.initialized()) { debug_printf("DWALPager(%s) issuing actual read of %s\n", filename.c_str(), toString(pageID).c_str()); @@ -1281,7 +1298,7 @@ public: return cacheEntry.readFuture; } - Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable) { + Future> readPageAtVersion(LogicalPageID pageID, Version v, bool cacheable, bool noHit) { auto i = remappedPages.find(pageID); if(i != remappedPages.end()) { @@ -1296,7 +1313,7 @@ public: debug_printf("DWALPager(%s) read %s @%" PRId64 " (not remapped)\n", filename.c_str(), toString(pageID).c_str(), v); } - return readPage(pageID, cacheable); + return readPage(pageID, cacheable, noHit); } // Get snapshot as of the most recent committed version of the pager @@ -1451,7 +1468,6 @@ public: } Key getMetaKey() const override { - ASSERT(recoverFuture.isReady()); return pHeader->getMetaKey(); } @@ -1691,11 +1707,11 @@ public: virtual ~DWALPagerSnapshot() { } - Future> getPhysicalPage(LogicalPageID pageID, bool cacheable) override { + Future> getPhysicalPage(LogicalPageID pageID, bool cacheable, bool noHit) override { if(expired.isError()) { throw expired.getError(); } - return 
map(pager->readPageAtVersion(pageID, version, cacheable), [=](Reference p) { + return map(pager->readPageAtVersion(pageID, version, cacheable, noHit), [=](Reference p) { return Reference(p); }); } @@ -2448,9 +2464,6 @@ struct RedwoodRecordRef { }; struct BTreePage { - - enum EPageFlags { IS_LEAF = 1}; - typedef DeltaTree BinaryTree; typedef DeltaTree ValueTree; @@ -2458,7 +2471,6 @@ struct BTreePage { #pragma pack(push,1) struct { uint16_t formatVersion; - uint8_t flags; uint8_t height; uint16_t itemCount; uint32_t kvBytes; @@ -2471,7 +2483,7 @@ struct BTreePage { } bool isLeaf() const { - return flags & IS_LEAF; + return height == 1; } BinaryTree & tree() { @@ -2488,8 +2500,8 @@ struct BTreePage { std::string toString(bool write, BTreePageID id, Version ver, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound) const { std::string r; - r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p flags=0x%X count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", - write ? "write" : "read", ::toString(id).c_str(), ver, this, (int)flags, (int)itemCount, (int)kvBytes, + r += format("BTreePage op=%s %s @%" PRId64 " ptr=%p height=%d count=%d kvBytes=%d\n lowerBound: %s\n upperBound: %s\n", + write ? "write" : "read", ::toString(id).c_str(), ver, this, height, (int)itemCount, (int)kvBytes, lowerBound->toString().c_str(), upperBound->toString().c_str()); try { if(itemCount > 0) { @@ -2534,7 +2546,6 @@ struct BTreePage { static void makeEmptyRoot(Reference page) { BTreePage *btpage = (BTreePage *)page->begin(); btpage->formatVersion = BTreePage::FORMAT_VERSION; - btpage->flags = BTreePage::IS_LEAF; btpage->height = 1; btpage->kvBytes = 0; btpage->itemCount = 0; @@ -2663,6 +2674,7 @@ public: struct Counts { Counts() { memset(this, 0, sizeof(Counts)); + startTime = g_network ? 
now() : 0; } void clear() { @@ -2671,6 +2683,8 @@ public: int64_t pageReads; int64_t extPageReads; + int64_t pagePreloads; + int64_t extPagePreloads; int64_t setBytes; int64_t pageWrites; int64_t extPageWrites; @@ -2681,13 +2695,22 @@ public: int64_t getRanges; int64_t commitToPage; int64_t commitToPageStart; + double startTime; std::string toString(bool clearAfter = false) { - std::string s = format("set=%" PRId64 " clear=%" PRId64 " get=%" PRId64 " getRange=%" PRId64 " commit=%" PRId64 " pageRead=%" PRId64 " extPageRead=%" PRId64 " pageWrite=%" PRId64 " extPageWrite=%" PRId64 " commitPage=%" PRId64 " commitPageStart=%" PRId64 "", - sets, clears, gets, getRanges, commits, pageReads, extPageReads, pageWrites, extPageWrites, commitToPage, commitToPageStart); + const char *labels[] = {"set", "clear", "get", "getRange", "commit", "pageReads", "extPageRead", "pagePreloads", "extPagePreloads", "pageWrite", "extPageWrite", "commitPage", "commitPageStart"}; + const int64_t values[] = {sets, clears, gets, getRanges, commits, pageReads, extPageReads, pagePreloads, extPagePreloads, pageWrites, extPageWrites, commitToPage, commitToPageStart}; + + double elapsed = now() - startTime; + std::string s; + for(int i = 0; i < sizeof(values) / sizeof(int64_t); ++i) { + s += format("%s=%" PRId64 " (%d/s) ", labels[i], values[i], int(values[i] / elapsed)); + } + if(clearAfter) { clear(); } + return s; } }; @@ -2697,11 +2720,11 @@ public: // All async opts on the btree are based on pager reads, writes, and commits, so // we can mostly forward these next few functions to the pager - virtual Future getError() { + Future getError() { return m_pager->getError(); } - virtual Future onClosed() { + Future onClosed() { return m_pager->onClosed(); } @@ -2714,24 +2737,24 @@ public: pager->close(); } - virtual void dispose() { + void dispose() { return close_impl(true); } - virtual void close() { + void close() { return close_impl(false); } - virtual KeyValueStoreType getType() NOT_IMPLEMENTED - 
virtual bool supportsMutation(int op) NOT_IMPLEMENTED - virtual StorageBytes getStorageBytes() { + KeyValueStoreType getType() NOT_IMPLEMENTED + bool supportsMutation(int op) NOT_IMPLEMENTED + StorageBytes getStorageBytes() { return m_pager->getStorageBytes(); } // Writes are provided in an ordered stream. // A write is considered part of (a change leading to) the version determined by the previous call to setWriteVersion() // A write shall not become durable until the following call to commit() begins, and shall be durable once the following call to commit() returns - virtual void set(KeyValueRef keyValue) { + void set(KeyValueRef keyValue) { ++counts.sets; SingleKeyMutationsByVersion &changes = insertMutationBoundary(keyValue.key)->second.startKeyMutations; @@ -2750,7 +2773,7 @@ public: } } } - virtual void clear(KeyRangeRef range) { + void clear(KeyRangeRef range) { ++counts.clears; MutationBufferT::iterator iBegin = insertMutationBoundary(range.begin); MutationBufferT::iterator iEnd = insertMutationBoundary(range.end); @@ -2782,17 +2805,17 @@ public: } } - virtual void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED + void mutate(int op, StringRef param1, StringRef param2) NOT_IMPLEMENTED - virtual void setOldestVersion(Version v) { + void setOldestVersion(Version v) { m_newOldestVersion = v; } - virtual Version getOldestVersion() { + Version getOldestVersion() { return m_pager->getOldestVersion(); } - virtual Version getLatestVersion() { + Version getLatestVersion() { if(m_writeVersion != invalidVersion) return m_writeVersion; return m_pager->getLatestVersion(); @@ -2931,12 +2954,7 @@ public: m_latestCommit.cancel(); } - // readAtVersion() may only be called on a committed v which has previously been passed to setWriteVersion() and never previously passed - // to setOldestVersion. The returned results when violating this precondition are unspecified; the store is not required to be able to detect violations. 
- // The returned read cursor provides a consistent snapshot of the versioned store, corresponding to all the writes done with write versions less - // than or equal to the given version. - // v must be a committed version. - virtual Reference readAtVersion(Version v) { + Reference readAtVersion(Version v) { // Only committed versions can be read. Version recordVersion = singleVersion ? 0 : v; ASSERT(v <= m_lastCommittedVersion); @@ -2944,13 +2962,15 @@ public: ASSERT(v == m_lastCommittedVersion); } Reference snapshot = m_pager->getReadSnapshot(v); - Key m = snapshot->getMetaKey(); + + // Snapshot will continue to hold the metakey value memory + KeyRef m = snapshot->getMetaKey(); return Reference(new Cursor(snapshot, ((MetaKey *)m.begin())->root.get(), recordVersion)); } // Must be nondecreasing - virtual void setWriteVersion(Version v) { + void setWriteVersion(Version v) { ASSERT(v > m_lastCommittedVersion); // If there was no current mutation buffer, create one in the buffer map and update m_pBuffer if(m_pBuffer == nullptr) { @@ -2972,7 +2992,7 @@ public: m_writeVersion = v; } - virtual Future commit() { + Future commit() { if(m_pBuffer == nullptr) return m_latestCommit; return commit_impl(this); @@ -3334,7 +3354,7 @@ private: } // Writes entries to 1 or more pages and return a vector of boundary keys with their IPage(s) - ACTOR static Future>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, uint8_t newFlags, int height, Version v, BTreePageID previousID) { + ACTOR static Future>> writePages(VersionedBTree *self, bool minimalBoundaries, const RedwoodRecordRef *lowerBound, const RedwoodRecordRef *upperBound, VectorRef entries, int height, Version v, BTreePageID previousID) { ASSERT(entries.size() > 0); state Standalone> records; @@ -3451,7 +3471,6 @@ private: } btPage->formatVersion = BTreePage::FORMAT_VERSION; - btPage->flags = newFlags; btPage->height = height; 
btPage->kvBytes = kvBytes; btPage->itemCount = i - start; @@ -3544,7 +3563,7 @@ private: // While there are multiple child pages for this version we must write new tree levels. while(records.size() > 1) { self->m_header.height = ++height; - Standalone> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, 0, height, version, BTreePageID())); + Standalone> newRecords = wait(writePages(self, false, &dbBegin, &dbEnd, records, height, version, BTreePageID())); debug_printf("Wrote a new root level at version %" PRId64 " height %d size %lu pages\n", version, height, newRecords.size()); records = newRecords; } @@ -3552,7 +3571,7 @@ private: return records; } - class SuperPage : public IPage, ReferenceCounted { + class SuperPage : public IPage, ReferenceCounted, public FastAllocated{ public: SuperPage(std::vector> pages) { int blockSize = pages.front()->size(); @@ -3570,23 +3589,23 @@ private: delete [] m_data; } - virtual void addref() const { + void addref() const { ReferenceCounted::addref(); } - virtual void delref() const { + void delref() const { ReferenceCounted::delref(); } - virtual int size() const { + int size() const { return m_size; } - virtual uint8_t const* begin() const { + uint8_t const* begin() const { return m_data; } - virtual uint8_t* mutate() { + uint8_t* mutate() { return m_data; } @@ -3609,14 +3628,15 @@ private: ++counts.pageReads; if(id.size() == 1) { - wait(store(page, snapshot->getPhysicalPage(id.front(), !forLazyDelete))); + Reference p = wait(snapshot->getPhysicalPage(id.front(), !forLazyDelete, false)); + page = p; } else { ASSERT(!id.empty()); counts.extPageReads += (id.size() - 1); std::vector>> reads; for(auto &pageID : id) { - reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete)); + reads.push_back(snapshot->getPhysicalPage(pageID, !forLazyDelete, false)); } std::vector> pages = wait(getAll(reads)); // TODO: Cache reconstituted super pages somehow, perhaps with help from the Pager. 
@@ -3640,6 +3660,15 @@ private: return page; } + static void preLoadPage(IPagerSnapshot *snapshot, BTreePageID id) { + ++counts.pagePreloads; + counts.extPagePreloads += (id.size() - 1); + + for(auto pageID : id) { + snapshot->getPhysicalPage(pageID, true, true); + } + } + void freeBtreePage(BTreePageID btPageID, Version v) { // Free individual pages at v for(LogicalPageID id : btPageID) { @@ -3778,6 +3807,7 @@ private: self->counts.commitToPage++; state Reference rawPage = wait(readPage(snapshot, rootID, decodeLowerBound, decodeUpperBound)); state BTreePage *page = (BTreePage *) rawPage->begin(); + ASSERT(isLeaf == page->isLeaf()); debug_printf("%s commitSubtree(): %s\n", context.c_str(), page->toString(false, rootID, snapshot->getVersion(), decodeLowerBound, decodeUpperBound).c_str()); state BTreePage::BinaryTree::Cursor cursor = getReader(rawPage)->getCursor(); @@ -3786,8 +3816,7 @@ private: state Version writeVersion; // Leaf Page - if(page->flags & BTreePage::IS_LEAF) { - ASSERT(isLeaf); + if(isLeaf) { state Standalone> merged; debug_printf("%s Leaf page, merging changes.\n", context.c_str()); @@ -3958,7 +3987,7 @@ private: return results; } - state Standalone> entries = wait(writePages(self, true, lowerBound, upperBound, merged, BTreePage::IS_LEAF, page->height, writeVersion, rootID)); + state Standalone> entries = wait(writePages(self, true, lowerBound, upperBound, merged, page->height, writeVersion, rootID)); results.arena().dependsOn(entries.arena()); results.push_back(results.arena(), VersionAndChildrenRef(writeVersion, entries, *upperBound)); debug_printf("%s Merge complete, returning %s\n", context.c_str(), toString(results).c_str()); @@ -4084,7 +4113,7 @@ private: ASSERT(pageBuilder.lastUpperBound == *upperBound); - Standalone> childEntries = wait(holdWhile(pageBuilder.entries, writePages(self, false, lowerBound, upperBound, pageBuilder.entries, 0, page->height, writeVersion, rootID))); + Standalone> childEntries = wait(holdWhile(pageBuilder.entries, 
writePages(self, false, lowerBound, upperBound, pageBuilder.entries, page->height, writeVersion, rootID))); results.arena().dependsOn(childEntries.arena()); results.push_back(results.arena(), VersionAndChildrenRef(0, childEntries, *upperBound)); @@ -4218,23 +4247,39 @@ private: return Reference(new PageCursor(*this)); } + const BTreePage * btPage() const { + return (const BTreePage *)page->begin(); + } + // Multiple InternalCursors can share a Page BTreePage::BinaryTree::Reader & getReader() const { return *(BTreePage::BinaryTree::Reader *)page->userData; } bool isLeaf() const { - const BTreePage *p = ((const BTreePage *)page->begin()); - return p->isLeaf(); + return btPage()->isLeaf(); } - Future> getChild(Reference pager) { + Future> getChild(Reference pager, int readAheadBytes = 0) { ASSERT(!isLeaf()); BTreePage::BinaryTree::Cursor next = cursor; next.moveNext(); const RedwoodRecordRef &rec = cursor.get(); BTreePageID id = rec.getChildPage(); Future> child = readPage(pager, id, &rec, &next.getOrUpperBound()); + + // Read ahead siblings at level 2 + if(readAheadBytes > 0 && btPage()->height == 2 && next.valid()) { + do { + debug_printf("preloading %s %d bytes left\n", ::toString(next.get().getChildPage()).c_str(), readAheadBytes); + // If any part of the page was already loaded then stop + if(next.get().value.present()) { + preLoadPage(pager.getPtr(), next.get().getChildPage()); + readAheadBytes -= page->size(); + } + } while(readAheadBytes > 0 && next.moveNext()); + } + return map(child, [=](Reference page) { return Reference(new PageCursor(id, page, Reference::addRef(this))); }); @@ -4324,7 +4369,7 @@ private: }); } - ACTOR Future seekLessThanOrEqual_impl(InternalCursor *self, RedwoodRecordRef query) { + ACTOR Future seekLessThanOrEqual_impl(InternalCursor *self, RedwoodRecordRef query, int prefetchBytes) { Future f = self->moveToRoot(); // f will almost always be ready @@ -4351,7 +4396,7 @@ private: return true; } - Reference child = 
wait(self->pageCursor->getChild(self->pager)); + Reference child = wait(self->pageCursor->getChild(self->pager, prefetchBytes)); self->pageCursor = child; } else { @@ -4362,8 +4407,8 @@ private: } } - Future seekLTE(RedwoodRecordRef query) { - return seekLessThanOrEqual_impl(this, query); + Future seekLTE(RedwoodRecordRef query, int prefetchBytes) { + return seekLessThanOrEqual_impl(this, query, prefetchBytes); } ACTOR Future move_impl(InternalCursor *self, bool forward) { @@ -4416,13 +4461,6 @@ private: return move_impl(this, forward); } - Future moveNext() { - return move_impl(this, true); - } - Future movePrev() { - return move_impl(this, false); - } - // Move to the first or last record of the database. ACTOR Future move_end(InternalCursor *self, bool begin) { Future f = self->moveToRoot(); @@ -4500,36 +4538,56 @@ private: Optional m_kv; public: - virtual Future findEqual(KeyRef key) { return find_impl(this, key, true, 0); } - virtual Future findFirstEqualOrGreater(KeyRef key, bool needValue, int prefetchNextBytes) { return find_impl(this, key, needValue, 1); } - virtual Future findLastLessOrEqual(KeyRef key, bool needValue, int prefetchPriorBytes) { return find_impl(this, key, needValue, -1); } + Future findEqual(KeyRef key) override { + return find_impl(this, key, 0); + } + Future findFirstEqualOrGreater(KeyRef key, int prefetchBytes) override { + return find_impl(this, key, 1, prefetchBytes); + } + Future findLastLessOrEqual(KeyRef key, int prefetchBytes) override { + return find_impl(this, key, -1, prefetchBytes); + } - virtual Future next(bool needValue) { return move(this, true, needValue); } - virtual Future prev(bool needValue) { return move(this, false, needValue); } + Future next() override { + return move(this, true); + } + Future prev() override { + return move(this, false); + } - virtual bool isValid() { + bool isValid() override { return m_kv.present(); } - virtual KeyRef getKey() { + KeyRef getKey() override { return m_kv.get().key; } - virtual 
ValueRef getValue() { + ValueRef getValue() override { return m_kv.get().value; } - std::string toString() const { + std::string toString(bool includePaths = false) const { std::string r; r += format("Cursor(%p) ver: %" PRId64 " ", this, m_version); if(m_kv.present()) { - r += format(" KV: '%s' -> '%s'\n", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); + r += format(" KV: '%s' -> '%s'", m_kv.get().key.printable().c_str(), m_kv.get().value.printable().c_str()); } else { - r += " KV: \n"; + r += " KV: "; + } + if(includePaths) { + r += format("\n Cur1: %s", m_cur1.toString().c_str()); + r += format("\n Cur2: %s", m_cur2.toString().c_str()); + } + else { + if(m_cur1.valid()) { + r += format("\n Cur1: %s", m_cur1.get().toString().c_str()); + } + if(m_cur2.valid()) { + r += format("\n Cur2: %s", m_cur2.get().toString().c_str()); + } } - r += format(" Cur1: %s\n", m_cur1.toString().c_str()); - r += format(" Cur2: %s\n", m_cur2.toString().c_str()); return r; } @@ -4539,12 +4597,12 @@ private: // for less than or equal use cmp < 0 // for greater than or equal use cmp > 0 // for equal use cmp == 0 - ACTOR static Future find_impl(Cursor *self, KeyRef key, bool needValue, int cmp) { + ACTOR static Future find_impl(Cursor *self, KeyRef key, int cmp, int prefetchBytes = 0) { // Search for the last key at or before (key, version, \xff) state RedwoodRecordRef query(key, self->m_version, {}, 0, std::numeric_limits::max()); self->m_kv.reset(); - wait(success(self->m_cur1.seekLTE(query))); + wait(success(self->m_cur1.seekLTE(query, prefetchBytes))); debug_printf("find%sE(%s): %s\n", cmp > 0 ? "GT" : (cmp == 0 ? "" : "LT"), query.toString().c_str(), self->toString().c_str()); // If we found the target key with a present value then return it as it is valid for any cmp type @@ -4587,7 +4645,7 @@ private: } // Get the next present key at the target version. Handles invalid cursor too. 
- wait(self->next(needValue)); + wait(self->next()); } else if(cmp < 0) { // Mode is <=, which is the same as the seekLTE(query) @@ -4597,15 +4655,14 @@ private: } // Move to previous present kv pair at the target version - wait(self->prev(needValue)); + wait(self->prev()); } return Void(); } - // TODO: use needValue - ACTOR static Future move(Cursor *self, bool fwd, bool needValue) { - debug_printf("Cursor::move(%d): Cursor = %s\n", fwd, self->toString().c_str()); + ACTOR static Future move(Cursor *self, bool fwd) { + debug_printf("Cursor::move(%d): Start %s\n", fwd, self->toString().c_str()); ASSERT(self->m_cur1.valid()); // If kv is present then the key/version at cur1 was already returned so move to a new key @@ -4614,6 +4671,7 @@ private: ASSERT(self->m_cur1.valid()); loop { self->m_cur2 = self->m_cur1; + debug_printf("Cursor::move(%d): Advancing cur1 %s\n", fwd, self->toString().c_str()); bool valid = wait(self->m_cur1.move(fwd)); if(!valid || self->m_cur1.get().key != self->m_cur2.get().key) { break; @@ -4632,6 +4690,7 @@ private: // TODO: This may already be the case, store state to track this condition and avoid the reset here if(self->m_cur1.valid()) { self->m_cur2 = self->m_cur1; + debug_printf("Cursor::move(%d): Advancing cur2 %s\n", fwd, self->toString().c_str()); wait(success(self->m_cur2.move(true))); } @@ -4648,13 +4707,13 @@ private: if(fwd) { // Moving forward, move cur2 forward and keep cur1 pointing to the prior (predecessor) record - debug_printf("Cursor::move(%d): Moving forward, Cursor = %s\n", fwd, self->toString().c_str()); + debug_printf("Cursor::move(%d): Moving forward %s\n", fwd, self->toString().c_str()); self->m_cur1 = self->m_cur2; wait(success(self->m_cur2.move(true))); } else { // Moving backward, move cur1 backward and keep cur2 pointing to the prior (successor) record - debug_printf("Cursor::move(%d): Moving backward, Cursor = %s\n", fwd, self->toString().c_str()); + debug_printf("Cursor::move(%d): Moving backward %s\n", fwd, 
self->toString().c_str()); self->m_cur2 = self->m_cur1; wait(success(self->m_cur1.move(false))); } @@ -4726,7 +4785,7 @@ public: m_init = catchError(init_impl(this)); } - virtual Future init() { + Future init() { return m_init; } @@ -4756,15 +4815,15 @@ public: delete self; } - virtual void close() { + void close() { shutdown(this, false); } - virtual void dispose() { + void dispose() { shutdown(this, true); } - virtual Future< Void > onClosed() { + Future< Void > onClosed() { return m_closed.getFuture(); } @@ -4775,15 +4834,15 @@ public: return catchError(c); } - virtual KeyValueStoreType getType() { + KeyValueStoreType getType() { return KeyValueStoreType::SSD_REDWOOD_V1; } - virtual StorageBytes getStorageBytes() { + StorageBytes getStorageBytes() { return m_tree->getStorageBytes(); } - virtual Future< Void > getError() { + Future< Void > getError() { return delayed(m_error.getFuture()); }; @@ -4792,12 +4851,12 @@ public: m_tree->clear(range); } - virtual void set( KeyValueRef keyValue, const Arena* arena = NULL ) { + void set( KeyValueRef keyValue, const Arena* arena = NULL ) { debug_printf("SET %s\n", keyValue.key.printable().c_str()); m_tree->set(keyValue); } - virtual Future< Standalone< VectorRef< KeyValueRef > > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) { + Future< Standalone< VectorRef< KeyValueRef > > > readRange(KeyRangeRef keys, int rowLimit = 1<<30, int byteLimit = 1<<30) { debug_printf("READRANGE %s\n", printable(keys).c_str()); return catchError(readRange_impl(this, keys, rowLimit, byteLimit)); } @@ -4809,9 +4868,11 @@ public: ASSERT( byteLimit > 0 ); state Reference cur = self->m_tree->readAtVersion(self->m_tree->getLastCommittedVersion()); + // Prefetch is currently only done in the forward direction + state int prefetchBytes = rowLimit > 1 ? 
byteLimit : 0; if(rowLimit >= 0) { - wait(cur->findFirstEqualOrGreater(keys.begin, true, 0)); + wait(cur->findFirstEqualOrGreater(keys.begin, prefetchBytes)); while(cur->isValid() && cur->getKey() < keys.end) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); accumulatedBytes += kv.expectedSize(); @@ -4819,12 +4880,12 @@ public: if(--rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } - wait(cur->next(true)); + wait(cur->next()); } } else { - wait(cur->findLastLessOrEqual(keys.end, true, 0)); + wait(cur->findLastLessOrEqual(keys.end)); if(cur->isValid() && cur->getKey() == keys.end) - wait(cur->prev(true)); + wait(cur->prev()); while(cur->isValid() && cur->getKey() >= keys.begin) { KeyValueRef kv(KeyRef(result.arena(), cur->getKey()), ValueRef(result.arena(), cur->getValue())); @@ -4833,7 +4894,7 @@ public: if(++rowLimit == 0 || accumulatedBytes >= byteLimit) { break; } - wait(cur->prev(true)); + wait(cur->prev()); } } return result; @@ -4850,7 +4911,7 @@ public: return Optional(); } - virtual Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional()) { + Future< Optional< Value > > readValue(KeyRef key, Optional< UID > debugID = Optional()) { return catchError(readValue_impl(this, key, debugID)); } @@ -4867,7 +4928,7 @@ public: return Optional(); } - virtual Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional()) { + Future< Optional< Value > > readValuePrefix(KeyRef key, int maxLength, Optional< UID > debugID = Optional()) { return catchError(readValuePrefix_impl(this, key, maxLength, debugID)); } @@ -4945,11 +5006,11 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version if(deterministicRandom()->coinflip()) { state Key randomKey = randomKV().key; debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Dummy seek to '%s'\n", v, start.toString().c_str(), end.toString().c_str(), randomKey.toString().c_str()); - 
wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey, true, 0) : cur->findLastLessOrEqual(randomKey, true, 0)); + wait(deterministicRandom()->coinflip() ? cur->findFirstEqualOrGreater(randomKey) : cur->findLastLessOrEqual(randomKey)); } debug_printf("VerifyRange(@%" PRId64 ", %s, %s): Actual seek\n", v, start.toString().c_str(), end.toString().c_str()); - wait(cur->findFirstEqualOrGreater(start, true, 0)); + wait(cur->findFirstEqualOrGreater(start)); state std::vector results; @@ -4997,7 +5058,7 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version ASSERT(errors == 0); results.push_back(KeyValue(KeyValueRef(cur->getKey(), cur->getValue()))); - wait(cur->next(true)); + wait(cur->next()); } // Make sure there are no further written kv pairs that would be present at this version. @@ -5031,9 +5092,9 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } // Now read the range from the tree in reverse order and compare to the saved results - wait(cur->findLastLessOrEqual(end, true, 0)); + wait(cur->findLastLessOrEqual(end)); if(cur->isValid() && cur->getKey() == end) - wait(cur->prev(true)); + wait(cur->prev()); state std::vector::const_reverse_iterator r = results.rbegin(); @@ -5059,7 +5120,7 @@ ACTOR Future verifyRange(VersionedBTree *btree, Key start, Key end, Version } ++r; - wait(cur->prev(true)); + wait(cur->prev()); } if(r != results.rend()) { @@ -5174,10 +5235,10 @@ ACTOR Future randomReader(VersionedBTree *btree) { } state KeyValue kv = randomKV(10, 0); - wait(cur->findFirstEqualOrGreater(kv.key, true, 0)); + wait(cur->findFirstEqualOrGreater(kv.key)); state int c = deterministicRandom()->randomInt(0, 100); while(cur->isValid() && c-- > 0) { - wait(success(cur->next(true))); + wait(success(cur->next())); wait(yield()); } } @@ -5972,9 +6033,8 @@ ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, printf("Executing %d random seeks\n", count); state Reference cur = 
btree->readAtVersion(readVer); while(c < count) { - wait(yield()); state Key k = randomString(20, firstChar, lastChar); - wait(success(cur->findFirstEqualOrGreater(k, false, 0))); + wait(success(cur->findFirstEqualOrGreater(k))); ++c; } double elapsed = timer() - readStart; @@ -5982,6 +6042,33 @@ ACTOR Future randomSeeks(VersionedBTree *btree, int count, char firstChar, return Void(); } +ACTOR Future randomScans(VersionedBTree *btree, int count, int width, int readAhead, char firstChar, char lastChar) { + state Version readVer = btree->getLatestVersion(); + state int c = 0; + state double readStart = timer(); + printf("Executing %d random scans\n", count); + state Reference cur = btree->readAtVersion(readVer); + state bool adaptive = readAhead < 0; + state int totalScanBytes = 0; + while(c++ < count) { + state Key k = randomString(20, firstChar, lastChar); + wait(success(cur->findFirstEqualOrGreater(k, readAhead))); + if(adaptive) { + readAhead = totalScanBytes / c; + } + state int w = width; + while(w > 0 && cur->isValid()) { + totalScanBytes += cur->getKey().size(); + totalScanBytes += cur->getValue().size(); + wait(cur->next()); + --w; + } + } + double elapsed = timer() - readStart; + printf("Completed %d scans: readAhead=%d width=%d bytesRead=%d scansRate=%d/s\n", count, readAhead, width, totalScanBytes, int(count / elapsed)); + return Void(); +} + TEST_CASE("!/redwood/correctness/pager/cow") { state std::string pagerFile = "unittest_pageFile.redwood"; printf("Deleting old test data\n"); @@ -6010,26 +6097,50 @@ TEST_CASE("!/redwood/correctness/pager/cow") { } TEST_CASE("!/redwood/performance/set") { - state std::string pagerFile = "unittest_pageFile.redwood"; - printf("Deleting old test data\n"); - deleteFile(pagerFile); + state SignalableActorCollection actors; + VersionedBTree::counts.clear(); - int pageSize = 4096; - IPager2 *pager = new DWALPager(pageSize, pagerFile, FLOW_KNOBS->PAGE_CACHE_4K / pageSize); + // If a test file is passed in by environment then 
don't write new data to it. + state bool reload = getenv("TESTFILE") == nullptr; + state std::string pagerFile = reload ? "unittest.redwood" : getenv("TESTFILE"); + + if(reload) { + printf("Deleting old test data\n"); + deleteFile(pagerFile); + } + + state int pageSize = 4096; + state int64_t pageCacheBytes = FLOW_KNOBS->PAGE_CACHE_4K; + DWALPager *pager = new DWALPager(pageSize, pagerFile, pageCacheBytes); state bool singleVersion = true; state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); state int nodeCount = 1e9; state int maxChangesPerVersion = 5000; - state int64_t kvBytesTarget = 4000e6; + state int64_t kvBytesTarget = 4e9; state int commitTarget = 20e6; - state int maxKeyPrefixSize = 25; + state int minKeyPrefixBytes = 0; + state int maxKeyPrefixBytes = 25; + state int minValueSize = 0; state int maxValueSize = 500; state int maxConsecutiveRun = 10; - state int minValueSize = 0; state char firstKeyChar = 'a'; state char lastKeyChar = 'b'; + + printf("pageSize: %d\n", pageSize); + printf("pageCacheBytes: %" PRId64 "\n", pageCacheBytes); + printf("trailingIntegerIndexRange: %d\n", nodeCount); + printf("maxChangesPerVersion: %d\n", maxChangesPerVersion); + printf("minKeyPrefixBytes: %d\n", minKeyPrefixBytes); + printf("maxKeyPrefixBytes: %d\n", maxKeyPrefixBytes); + printf("maxConsecutiveRun: %d\n", maxConsecutiveRun); + printf("minValueSize: %d\n", minValueSize); + printf("maxValueSize: %d\n", maxValueSize); + printf("commitTarget: %d\n", commitTarget); + printf("kvBytesTarget: %" PRId64 "\n", kvBytesTarget); + printf("KeyLexicon '%c' to '%c'\n", firstKeyChar, lastKeyChar); + state int64_t kvBytes = 0; state int64_t kvBytesTotal = 0; state int records = 0; @@ -6040,65 +6151,110 @@ TEST_CASE("!/redwood/performance/set") { state double intervalStart = timer(); state double start = intervalStart; - while(kvBytesTotal < kvBytesTarget) { - wait(yield()); + if(reload) { + while(kvBytesTotal < kvBytesTarget) { + 
wait(yield()); - Version lastVer = btree->getLatestVersion(); - state Version version = lastVer + 1; - btree->setWriteVersion(version); - int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); + Version lastVer = btree->getLatestVersion(); + state Version version = lastVer + 1; + btree->setWriteVersion(version); + int changes = deterministicRandom()->randomInt(0, maxChangesPerVersion); - while(changes > 0 && kvBytes < commitTarget) { - KeyValue kv; - kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(sizeof(uint32_t), maxKeyPrefixSize + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); - int32_t index = deterministicRandom()->randomInt(0, nodeCount); - int runLength = deterministicRandom()->randomInt(1, maxConsecutiveRun + 1); + while(changes > 0 && kvBytes < commitTarget) { + KeyValue kv; + kv.key = randomString(kv.arena(), deterministicRandom()->randomInt(minKeyPrefixBytes + sizeof(uint32_t), maxKeyPrefixBytes + sizeof(uint32_t) + 1), firstKeyChar, lastKeyChar); + int32_t index = deterministicRandom()->randomInt(0, nodeCount); + int runLength = deterministicRandom()->randomInt(1, maxConsecutiveRun + 1); - while(runLength > 0 && changes > 0) { - *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); - kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); + while(runLength > 0 && changes > 0) { + *(uint32_t *)(kv.key.end() - sizeof(uint32_t)) = bigEndian32(index++); + kv.value = StringRef((uint8_t *)value.data(), deterministicRandom()->randomInt(minValueSize, maxValueSize + 1)); - btree->set(kv); + btree->set(kv); - --runLength; - --changes; - kvBytes += kv.key.size() + kv.value.size(); - ++records; + --runLength; + --changes; + kvBytes += kv.key.size() + kv.value.size(); + ++records; + } + } + + if(kvBytes >= commitTarget) { + btree->setOldestVersion(btree->getLastCommittedVersion()); + wait(commit); + printf("Cumulative %.2f MB keyValue bytes written at 
%.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + + // Avoid capturing via this to freeze counter values + int recs = records; + int kvb = kvBytes; + + // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the actor state object + double *pIntervalStart = &intervalStart; + + commit = map(btree->commit(), [=](Void result) { + printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); + double elapsed = timer() - *pIntervalStart; + printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); + *pIntervalStart = timer(); + return Void(); + }); + + kvBytesTotal += kvBytes; + kvBytes = 0; + records = 0; } } - if(kvBytes >= commitTarget) { - btree->setOldestVersion(btree->getLastCommittedVersion()); - wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); - - // Avoid capturing via this to freeze counter values - int recs = records; - int kvb = kvBytes; - - // Capturing invervalStart via this->intervalStart makes IDE's unhappy as they do not know about the actor state object - double *pIntervalStart = &intervalStart; - - commit = map(btree->commit(), [=](Void result) { - printf("Committed: %s\n", VersionedBTree::counts.toString(true).c_str()); - double elapsed = timer() - *pIntervalStart; - printf("Committed %d kvBytes in %d records in %f seconds, %.2f MB/s\n", kvb, recs, elapsed, kvb / elapsed / 1e6); - *pIntervalStart = timer(); - return Void(); - }); - - kvBytesTotal += kvBytes; - kvBytes = 0; - records = 0; - } + wait(commit); + printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); } - wait(commit); - printf("Cumulative %.2f MB keyValue bytes written at %.2f MB/s\n", kvBytesTotal / 1e6, kvBytesTotal / (timer() - start) / 1e6); + int seeks = 1e6; + printf("Warming cache 
with seeks\n"); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, seeks/3, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); - printf("Starting random seeks\n"); - state int reads = 30000; - wait(randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar) && randomSeeks(btree, reads, firstKeyChar, lastKeyChar)); + state int ops = 10000; + + printf("Serial scans with adaptive readAhead...\n"); + actors.add(randomScans(btree, ops, 50, -1, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 3 pages...\n"); + actors.add(randomScans(btree, ops, 50, 12000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 2 pages...\n"); + actors.add(randomScans(btree, ops, 50, 8000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans with readAhead 1 page...\n"); + actors.add(randomScans(btree, ops, 50, 4000, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial scans...\n"); + actors.add(randomScans(btree, ops, 50, 0, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Serial seeks...\n"); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); + + printf("Parallel seeks...\n"); + 
actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + actors.add(randomSeeks(btree, ops, firstKeyChar, lastKeyChar)); + wait(actors.signalAndReset()); + printf("Stats: %s\n", VersionedBTree::counts.toString(true).c_str()); Future closedFuture = btree->onClosed(); btree->close(); diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 01c49e6c35..1144f4c8ec 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -46,7 +46,6 @@ - @@ -57,7 +56,6 @@ - @@ -179,7 +177,6 @@ - @@ -189,7 +186,6 @@ - false false diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 5e9360f8c0..c01c9e458a 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -274,8 +274,6 @@ workloads - - @@ -385,8 +383,6 @@ - - From eb67886b75f0479872c9ef303eb06fe4158a631e Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 11 Nov 2019 15:10:25 -0800 Subject: [PATCH 162/184] FastRestore:Move comment to func definition Resolve review comments. 
--- fdbclient/SystemData.cpp | 1 + fdbserver/RestoreApplier.actor.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 8db79b42ce..681ecec0e8 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -641,6 +641,7 @@ const KeyRangeRef restoreApplierKeys(LiteralStringRef("\xff\x02/restoreApplier/" const KeyRef restoreApplierTxnValue = LiteralStringRef("1"); // restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once +// Version integer must be BigEndian to maintain ordering in lexical order const Key restoreApplierKeyFor(UID const& applierID, Version version) { BinaryWriter wr(Unversioned()); wr.serializeBytes(restoreApplierKeys.begin); diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index a58ad8db73..6c015c1694 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -277,8 +277,8 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Key begin = restoreApplierKeyFor( - self->id(), bigEndian64(0)); // Integer must be BigEndian to maintain ordering in lexical order + // Version integer must be BigEndian to maintain ordering in lexical order + Key begin = restoreApplierKeyFor(self->id(), bigEndian64(0)); Key end = restoreApplierKeyFor(self->id(), bigEndian64(std::numeric_limits::max())); Standalone txnIds = wait(tr->getRange(KeyRangeRef(begin, end), CLIENT_KNOBS->TOO_MANY)); if (txnIds.size() > 0) { From 9227de5c20fa2a9e6622e99fb2f90f2e831f0496 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 11 Nov 2019 15:13:58 -0800 Subject: [PATCH 163/184] Redwood correctness unit test was using wallclock based time limit which breaks determinism. 
--- fdbserver/VersionedBTree.actor.cpp | 6 +++--- tests/CMakeLists.txt | 2 ++ tests/rare/RedwoodCorrectnessBTree.txt | 6 ++++++ 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 tests/rare/RedwoodCorrectnessBTree.txt diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 945cc7c726..68fa42707d 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -5766,7 +5766,7 @@ TEST_CASE("!/redwood/correctness/btree") { state double clearPostSetProbability = deterministicRandom()->random01() * .1; state double coldStartProbability = deterministicRandom()->random01(); state double advanceOldVersionProbability = deterministicRandom()->random01(); - state double maxWallClockDuration = 60; + state double maxDuration = 60; printf("\n"); printf("serialTest: %d\n", serialTest); @@ -5787,7 +5787,7 @@ TEST_CASE("!/redwood/correctness/btree") { deleteFile(pagerFile); printf("Initializing...\n"); - state double startTime = timer(); + state double startTime = now(); pager = new DWALPager(pageSize, pagerFile, 0); state VersionedBTree *btree = new VersionedBTree(pager, pagerFile, singleVersion); wait(btree->init()); @@ -5817,7 +5817,7 @@ TEST_CASE("!/redwood/correctness/btree") { state Future commit = Void(); - while(mutationBytes.get() < mutationBytesTarget && (timer() - startTime) < maxWallClockDuration) { + while(mutationBytes.get() < mutationBytesTarget && (now() - startTime) < maxDuration) { if(now() - startTime > 600) { mutationBytesTarget = mutationBytes.get(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9e4d17aa29..c16b36a1f1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -139,6 +139,8 @@ add_fdb_test(TEST_FILES rare/LargeApiCorrectnessStatus.txt) add_fdb_test(TEST_FILES rare/RYWDisable.txt) add_fdb_test(TEST_FILES rare/RandomReadWriteTest.txt) add_fdb_test(TEST_FILES rare/SwizzledLargeApiCorrectness.txt) +add_fdb_test(TEST_FILES 
rare/RedwoodCorrectnessBTree.txt) + add_fdb_test( TEST_FILES restarting/ConfigureTestRestart-1.txt restarting/ConfigureTestRestart-2.txt) diff --git a/tests/rare/RedwoodCorrectnessBTree.txt b/tests/rare/RedwoodCorrectnessBTree.txt new file mode 100644 index 0000000000..3bde204032 --- /dev/null +++ b/tests/rare/RedwoodCorrectnessBTree.txt @@ -0,0 +1,6 @@ +testTitle=UnitTests +testName=UnitTests +startDelay=0 +useDB=false +maxTestCases=0 +testsMatching=!/redwood/correctness/btree From f841d14141a11d8b7bd9f02dd2f030c30750a7f1 Mon Sep 17 00:00:00 2001 From: Stephen Atherton Date: Mon, 11 Nov 2019 16:28:21 -0800 Subject: [PATCH 164/184] Bumped format versions, also simplified version scheme to a pager version and a btree version, removing per-page versions for queue and btree pages. --- fdbserver/VersionedBTree.actor.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fdbserver/VersionedBTree.actor.cpp b/fdbserver/VersionedBTree.actor.cpp index 68fa42707d..b4facd88f2 100644 --- a/fdbserver/VersionedBTree.actor.cpp +++ b/fdbserver/VersionedBTree.actor.cpp @@ -269,8 +269,6 @@ public: #pragma pack(push, 1) struct RawPage { - static constexpr int FORMAT_VERSION = 1; - uint16_t formatVersion; LogicalPageID nextPageID; uint16_t nextOffset; uint16_t endOffset; @@ -305,7 +303,6 @@ public: debug_printf("FIFOQueue::Cursor(%s) loadPage\n", toString().c_str()); return map(queue->pager->readPage(pageID, true), [=](Reference p) { page = p; - ASSERT(raw()->formatVersion == RawPage::FORMAT_VERSION); debug_printf("FIFOQueue::Cursor(%s) loadPage done\n", toString().c_str()); return Void(); }); @@ -345,7 +342,6 @@ public: page = queue->pager->newPageBuffer(); setNext(0, 0); auto p = raw(); - p->formatVersion = RawPage::FORMAT_VERSION; ASSERT(newOffset == 0); p->endOffset = 0; } @@ -1002,8 +998,18 @@ public: } self->pHeader = (Header *)self->headerPage->begin(); - self->setPageSize(self->pHeader->pageSize); + if(self->pHeader->formatVersion 
!= Header::FORMAT_VERSION) { + Error e = internal_error(); // TODO: Something better? + TraceEvent(SevError, "DWALPagerRecoveryFailedWrongVersion") + .detail("Filename", self->filename) + .detail("Version", self->pHeader->formatVersion) + .detail("ExpectedVersion", Header::FORMAT_VERSION) + .error(e); + throw e; + } + + self->setPageSize(self->pHeader->pageSize); if(self->logicalPageSize != self->desiredPageSize) { TraceEvent(SevWarn, "DWALPagerPageSizeNotDesired") .detail("Filename", self->filename) @@ -1579,7 +1585,7 @@ private: #pragma pack(push, 1) // Header is the format of page 0 of the database struct Header { - static constexpr int FORMAT_VERSION = 1; + static constexpr int FORMAT_VERSION = 2; uint16_t formatVersion; uint32_t pageSize; int64_t pageCount; @@ -1598,7 +1604,6 @@ private: ASSERT(key.size() < (smallestPhysicalBlock - sizeof(Header))); metaKeySize = key.size(); memcpy(this + 1, key.begin(), key.size()); - ASSERT(formatVersion == FORMAT_VERSION); } int size() const { @@ -2467,10 +2472,8 @@ struct BTreePage { typedef DeltaTree BinaryTree; typedef DeltaTree ValueTree; - static constexpr int FORMAT_VERSION = 1; #pragma pack(push,1) struct { - uint16_t formatVersion; uint8_t height; uint16_t itemCount; uint32_t kvBytes; @@ -2545,7 +2548,6 @@ struct BTreePage { static void makeEmptyRoot(Reference page) { BTreePage *btpage = (BTreePage *)page->begin(); - btpage->formatVersion = BTreePage::FORMAT_VERSION; btpage->height = 1; btpage->kvBytes = 0; btpage->itemCount = 0; @@ -2649,7 +2651,8 @@ public: #pragma pack(push, 1) struct MetaKey { - static constexpr int FORMAT_VERSION = 1; + static constexpr int FORMAT_VERSION = 2; + // This serves as the format version for the entire tree, individual pages will not be versioned uint16_t formatVersion; uint8_t height; LazyDeleteQueueT::QueueState lazyDeleteQueue; @@ -3470,7 +3473,6 @@ private: btPage = (BTreePage *)new uint8_t[size]; } - btPage->formatVersion = BTreePage::FORMAT_VERSION; btPage->height = height; 
btPage->kvBytes = kvBytes; btPage->itemCount = i - start; @@ -3645,7 +3647,6 @@ private: debug_printf("readPage() op=readComplete %s @%" PRId64 " \n", toString(id).c_str(), snapshot->getVersion()); const BTreePage *pTreePage = (const BTreePage *)page->begin(); - ASSERT(pTreePage->formatVersion == BTreePage::FORMAT_VERSION); if(!forLazyDelete && page->userData == nullptr) { debug_printf("readPage() Creating Reader for %s @%" PRId64 " lower=%s upper=%s\n", toString(id).c_str(), snapshot->getVersion(), lowerBound->toString().c_str(), upperBound->toString().c_str()); From 630c29d160d85bf443c18cc6d721cdab61f936d6 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Mon, 11 Nov 2019 16:24:41 -0800 Subject: [PATCH 165/184] FastRestore:resolve review comments 1) wait on whenAtLeast; 2) Put BigEndian64 into the function call and the decoder to prevent future people from making the same mistake. --- fdbclient/SystemData.cpp | 6 +++--- fdbserver/RestoreApplier.actor.cpp | 18 ++++++++---------- fdbserver/RestoreLoader.actor.cpp | 2 +- fdbserver/RestoreRoleCommon.actor.cpp | 5 +++-- fdbserver/RestoreRoleCommon.actor.h | 2 +- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 681ecec0e8..3c3de2e5e5 100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -641,11 +641,11 @@ const KeyRangeRef restoreApplierKeys(LiteralStringRef("\xff\x02/restoreApplier/" const KeyRef restoreApplierTxnValue = LiteralStringRef("1"); // restoreApplierKeys: track atomic transaction progress to ensure applying atomicOp exactly once -// Version integer must be BigEndian to maintain ordering in lexical order +// Version is passed in as LittleEndian, it must be converted to BigEndian to maintain ordering in lexical order const Key restoreApplierKeyFor(UID const& applierID, Version version) { BinaryWriter wr(Unversioned()); wr.serializeBytes(restoreApplierKeys.begin); - wr << applierID << version; + wr << applierID << 
bigEndian64(version); return wr.toValue(); } @@ -654,7 +654,7 @@ std::pair decodeRestoreApplierKey(ValueRef const& key) { UID applierID; Version version; rd >> applierID >> version; - return std::make_pair(applierID, version); + return std::make_pair(applierID, bigEndian64(version)); } // Encode restore worker key for workerID diff --git a/fdbserver/RestoreApplier.actor.cpp b/fdbserver/RestoreApplier.actor.cpp index 6c015c1694..8f99e5349a 100644 --- a/fdbserver/RestoreApplier.actor.cpp +++ b/fdbserver/RestoreApplier.actor.cpp @@ -65,7 +65,7 @@ ACTOR Future restoreApplierCore(RestoreApplierInterface applierInterf, int } when(RestoreVersionBatchRequest req = waitNext(applierInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - handleInitVersionBatchRequest(req, self); + wait(handleInitVersionBatchRequest(req, self)); } when(RestoreVersionBatchRequest req = waitNext(applierInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; @@ -277,9 +277,8 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - // Version integer must be BigEndian to maintain ordering in lexical order - Key begin = restoreApplierKeyFor(self->id(), bigEndian64(0)); - Key end = restoreApplierKeyFor(self->id(), bigEndian64(std::numeric_limits::max())); + Key begin = restoreApplierKeyFor(self->id(), 0); + Key end = restoreApplierKeyFor(self->id(), std::numeric_limits::max()); Standalone txnIds = wait(tr->getRange(KeyRangeRef(begin, end), CLIENT_KNOBS->TOO_MANY)); if (txnIds.size() > 0) { TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean").detail("TxnIds", txnIds.size()); @@ -287,7 +286,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { std::pair applierInfo = decodeRestoreApplierKey(kv.key); TraceEvent(SevError, "FastRestore_ApplyTxnStateNotClean") .detail("Applier", applierInfo.first) - .detail("ResidueTxnID", 
bigEndian64(applierInfo.second)); + .detail("ResidueTxnID", applierInfo.second); } } break; @@ -303,8 +302,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); - Optional txnSucceeded = - wait(tr->get(restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId)))); + Optional txnSucceeded = wait(tr->get(restoreApplierKeyFor(self->id(), progress.curTxnId))); if (!txnSucceeded.present()) { progress.rollback(); continue; @@ -330,7 +328,7 @@ ACTOR Future applyToDB(Reference self, Database cx) { .detail("Version", progress.curItInCurTxn->first); // restoreApplierKeyFor(self->id(), curTxnId) to tell if txn succeeds at an unknown error - tr->set(restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId)), restoreApplierTxnValue); + tr->set(restoreApplierKeyFor(self->id(), progress.curTxnId), restoreApplierTxnValue); while (1) { // Loop: Accumulate mutations in a transaction MutationRef m = progress.getCurrentMutation(); @@ -409,8 +407,8 @@ ACTOR Future applyToDB(Reference self, Database cx) { tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); // Clear txnIds in [0, progress.curTxnId). We add 100 to curTxnId just to be safe. 
- tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), bigEndian64(0)), - restoreApplierKeyFor(self->id(), bigEndian64(progress.curTxnId + 100)))); + tr->clear(KeyRangeRef(restoreApplierKeyFor(self->id(), 0), + restoreApplierKeyFor(self->id(), progress.curTxnId + 100))); wait(tr->commit()); break; } catch (Error& e) { diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 4263cad3d4..c7312aeab0 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -86,7 +86,7 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; - handleInitVersionBatchRequest(req, self); + wait(handleInitVersionBatchRequest(req, self)); } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.finishRestore.getFuture())) { requestTypeStr = "finishRestore"; diff --git a/fdbserver/RestoreRoleCommon.actor.cpp b/fdbserver/RestoreRoleCommon.actor.cpp index 8f378f08d3..80b9db92a2 100644 --- a/fdbserver/RestoreRoleCommon.actor.cpp +++ b/fdbserver/RestoreRoleCommon.actor.cpp @@ -55,9 +55,9 @@ void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference req.reply.send(RestoreCommonReply(self->id())); } -void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Reference self) { +ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self) { // batchId is continuous. (req.batchID-1) is the id of the just finished batch. 
- self->versionBatchId.whenAtLeast(req.batchID - 1); + wait(self->versionBatchId.whenAtLeast(req.batchID - 1)); if (self->versionBatchId.get() == req.batchID - 1) { self->resetPerVersionBatch(); @@ -69,6 +69,7 @@ void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Refere } req.reply.send(RestoreCommonReply(self->id())); + return Void(); } //-------Helper functions diff --git a/fdbserver/RestoreRoleCommon.actor.h b/fdbserver/RestoreRoleCommon.actor.h index 765e1b46fd..f6a5c5b658 100644 --- a/fdbserver/RestoreRoleCommon.actor.h +++ b/fdbserver/RestoreRoleCommon.actor.h @@ -55,7 +55,7 @@ struct RestoreSimpleRequest; typedef std::map>> VersionedMutationsMap; ACTOR Future handleHeartbeat(RestoreSimpleRequest req, UID id); -void handleInitVersionBatchRequest(const RestoreVersionBatchRequest& req, Reference self); +ACTOR Future handleInitVersionBatchRequest(RestoreVersionBatchRequest req, Reference self); void handleFinishRestoreRequest(const RestoreVersionBatchRequest& req, Reference self); // Helper class for reading restore data from a buffer and throwing the right errors. From a4a0bf18f92b0dee13eb1425e69050f17474fabe Mon Sep 17 00:00:00 2001 From: negoyal Date: Tue, 12 Nov 2019 13:01:29 -0800 Subject: [PATCH 166/184] Merging with Master. 
--- .../source/mr-status-json-schemas.rst.inc | 4 +- fdbclient/FDBTypes.h | 5 + fdbclient/Schemas.cpp | 4 +- fdbclient/StorageServerInterface.h | 3 +- fdbclient/SystemData.cpp | 66 +- fdbclient/SystemData.h | 20 + fdbclient/VersionedMap.h | 73 +- fdbrpc/Locality.cpp | 23 +- fdbrpc/Locality.h | 6 +- fdbrpc/simulator.h | 1 + fdbserver/ApplyMetadataMutation.cpp | 105 +- fdbserver/ApplyMetadataMutation.h | 4 +- fdbserver/CMakeLists.txt | 1 + fdbserver/ClusterController.actor.cpp | 222 +++- fdbserver/ClusterRecruitmentInterface.h | 12 +- fdbserver/Knobs.cpp | 1 + fdbserver/Knobs.h | 1 + fdbserver/MasterProxyServer.actor.cpp | 28 +- fdbserver/OldTLogServer_6_0.actor.cpp | 168 +-- fdbserver/ServerDBInfo.h | 3 +- fdbserver/SimulatedCluster.actor.cpp | 14 + fdbserver/StorageCache.actor.cpp | 1007 +++++++++++++++++ fdbserver/TLogServer.actor.cpp | 170 +-- fdbserver/TagPartitionedLogSystem.actor.cpp | 6 +- fdbserver/WorkerInterface.actor.h | 2 + fdbserver/fdbserver.vcxproj | 1 + fdbserver/fdbserver.vcxproj.filters | 1 + fdbserver/masterserver.actor.cpp | 14 +- fdbserver/storageserver.actor.cpp | 89 +- fdbserver/worker.actor.cpp | 43 +- flow/network.h | 1 + tests/fast/CycleTest.txt | 2 +- 32 files changed, 1855 insertions(+), 245 deletions(-) create mode 100644 fdbserver/StorageCache.actor.cpp diff --git a/documentation/sphinx/source/mr-status-json-schemas.rst.inc b/documentation/sphinx/source/mr-status-json-schemas.rst.inc index 911e7d8baf..e8d4cb1b7f 100644 --- a/documentation/sphinx/source/mr-status-json-schemas.rst.inc +++ b/documentation/sphinx/source/mr-status-json-schemas.rst.inc @@ -29,7 +29,8 @@ "resolution", "proxy", "master", - "test" + "test", + "storage_cache" ] }, "degraded":true, @@ -66,6 +67,7 @@ "cluster_controller", "data_distributor", "ratekeeper", + "storage_cache", "router", "coordinator" ] diff --git a/fdbclient/FDBTypes.h b/fdbclient/FDBTypes.h index aa332666f9..6f06e19432 100644 --- a/fdbclient/FDBTypes.h +++ b/fdbclient/FDBTypes.h @@ -105,6 +105,7 @@ 
struct struct_like_traits : std::true_type { static const Tag invalidTag {tagLocalitySpecial, 0}; static const Tag txsTag {tagLocalitySpecial, 1}; +static const Tag cacheTag {tagLocalitySpecial, 2}; enum { txsTagOld = -1, invalidTagOld = -100 }; @@ -549,6 +550,10 @@ inline KeySelectorRef operator + (const KeySelectorRef& s, int off) { inline KeySelectorRef operator - (const KeySelectorRef& s, int off) { return KeySelectorRef(s.getKey(), s.orEqual, s.offset-off); } +inline bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { + // Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef + return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); +} template struct KeyRangeWith : KeyRange { diff --git a/fdbclient/Schemas.cpp b/fdbclient/Schemas.cpp index d0f93884da..53fd7641c6 100644 --- a/fdbclient/Schemas.cpp +++ b/fdbclient/Schemas.cpp @@ -49,7 +49,8 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "resolution", "proxy", "master", - "test" + "test", + "storage_cache" ] }, "degraded":true, @@ -86,6 +87,7 @@ const KeyRef JSONSchemas::statusSchema = LiteralStringRef(R"statusSchema( "cluster_controller", "data_distributor", "ratekeeper", + "storage_cache", "router", "coordinator" ] diff --git a/fdbclient/StorageServerInterface.h b/fdbclient/StorageServerInterface.h index 2505bf5a31..423b099018 100644 --- a/fdbclient/StorageServerInterface.h +++ b/fdbclient/StorageServerInterface.h @@ -189,8 +189,9 @@ struct GetKeyValuesReply : public LoadBalancedReply { VectorRef data; Version version; // useful when latestVersion was requested bool more; + bool cached; - GetKeyValuesReply() : version(invalidVersion), more(false) {} + GetKeyValuesReply() : version(invalidVersion), more(false), cached(false) {} template void serialize( Ar& ar ) { diff --git a/fdbclient/SystemData.cpp b/fdbclient/SystemData.cpp index 5f1b4b03d7..a41517f041 
100644 --- a/fdbclient/SystemData.cpp +++ b/fdbclient/SystemData.cpp @@ -58,6 +58,28 @@ void decodeKeyServersValue( const ValueRef& value, vector& src, vector } } +// "\xff/storageCache/[[begin]]" := "[[vector]]" +const KeyRangeRef storageCacheKeys( LiteralStringRef("\xff/storageCache/"), LiteralStringRef("\xff/storageCache0") ); +const KeyRef storageCachePrefix = storageCacheKeys.begin; + +const Key storageCacheKey( const KeyRef& k ) { + return k.withPrefix( storageCachePrefix ); +} + +const Value storageCacheValue( const vector& serverIndices ) { + BinaryWriter wr((IncludeVersion())); + wr << serverIndices; + return wr.toValue(); +} + +void decodeStorageCacheValue( const ValueRef& value, vector& serverIndices ) { + serverIndices.clear(); + if (value.size()) { + BinaryReader rd(value, IncludeVersion()); + rd >> serverIndices; + } +} + const Value logsValue( const vector>& logs, const vector>& oldLogs ) { BinaryWriter wr(IncludeVersion()); wr << logs; @@ -73,7 +95,6 @@ std::pair>,vector> idx; + return idx; +} +KeyRef cacheKeysDecodeKey( const KeyRef& key ) { + return key.substr( cacheKeysPrefix.size() + sizeof(uint16_t) + 1); +} + +const KeyRef cacheChangeKey = LiteralStringRef("\xff\x02/cacheChangeKey"); +const KeyRangeRef cacheChangeKeys( LiteralStringRef("\xff\x02/cacheChangeKeys/"), LiteralStringRef("\xff\x02/cacheChangeKeys0") ); +const KeyRef cacheChangePrefix = cacheChangeKeys.begin; +const Key cacheChangeKeyFor( uint16_t idx ) { + BinaryWriter wr(Unversioned()); + wr.serializeBytes( cacheChangePrefix ); + wr << idx; + return wr.toValue(); +} +uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key ) { + uint16_t idx; + BinaryReader rd( key.removePrefix(cacheChangePrefix), Unversioned() ); + rd >> idx; + return idx; +} + const KeyRangeRef serverTagKeys( LiteralStringRef("\xff/serverTag/"), LiteralStringRef("\xff/serverTag0") ); diff --git a/fdbclient/SystemData.h b/fdbclient/SystemData.h index dd40289902..236634ac5f 100644 --- a/fdbclient/SystemData.h +++ 
b/fdbclient/SystemData.h @@ -49,6 +49,13 @@ const Value keyServersValue( void decodeKeyServersValue( const ValueRef& value, vector& src, vector& dest ); +// "\xff/storageCache/[[begin]]" := "[[vector]]" +extern const KeyRangeRef storageCacheKeys; +extern const KeyRef storageCachePrefix; +const Key storageCacheKey( const KeyRef& k ); +const Value storageCacheValue( const vector& serverIndices ); +void decodeStorageCacheValue( const ValueRef& value, vector& serverIndices ); + // "\xff/serverKeys/[[serverID]]/[[begin]]" := "" | "1" | "2" extern const KeyRef serverKeysPrefix; extern const ValueRef serverKeysTrue, serverKeysFalse; @@ -57,6 +64,19 @@ const Key serverKeysPrefixFor( UID serverID ); UID serverKeysDecodeServer( const KeyRef& key ); bool serverHasKey( ValueRef storedValue ); +extern const KeyRef cacheKeysPrefix; + +const Key cacheKeysKey( uint16_t idx, const KeyRef& key ); +const Key cacheKeysPrefixFor( uint16_t idx ); +uint16_t cacheKeysDecodeIndex( const KeyRef& key ); +KeyRef cacheKeysDecodeKey( const KeyRef& key ); + +extern const KeyRef cacheChangeKey; +extern const KeyRangeRef cacheChangeKeys; +extern const KeyRef cacheChangePrefix; +const Key cacheChangeKeyFor( uint16_t idx ); +uint16_t cacheChangeKeyDecodeIndex( const KeyRef& key ); + extern const KeyRangeRef serverTagKeys; extern const KeyRef serverTagPrefix; extern const KeyRangeRef serverTagMaxKeys; diff --git a/fdbclient/VersionedMap.h b/fdbclient/VersionedMap.h index c82ce673c8..8f49f4e25e 100644 --- a/fdbclient/VersionedMap.h +++ b/fdbclient/VersionedMap.h @@ -414,16 +414,19 @@ namespace PTreeImpl { if (p->left(at)) printTree(p->left(at), at, depth+1); for (int i=0;idata).c_str()); + //printf(":%s\n", describe(p->data.value.first).c_str()); + printf(":%s\n", describe(p->data.key).c_str()); if (p->right(at)) printTree(p->right(at), at, depth+1); } template void printTreeDetails(const Reference>& p, int depth = 0) { - printf("Node %p (depth %d): %s\n", p.getPtr(), depth, 
describe(p->data).c_str()); + //printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.value.first).c_str()); + printf("Node %p (depth %d): %s\n", p.getPtr(), depth, describe(p->data.key).c_str()); printf(" Left: %p\n", p->pointer[0].getPtr()); printf(" Right: %p\n", p->pointer[1].getPtr()); - if (p->pointer[2]) + //if (p->pointer[2]) + if (p->updated) printf(" Version %lld %s: %p\n", p->lastUpdateVersion, p->replacedPointer ? "Right" : "Left", p->pointer[2].getPtr()); for(int i=0; i<3; i++) if (p->pointer[i]) printTreeDetails(p->pointer[i], depth+1); @@ -462,8 +465,47 @@ namespace PTreeImpl { } } + //Remove pointers to any child nodes that have been updated at or before the given version +This essentially gets rid of node versions that will never be read (beyond 5s worth of versions) +TODO look into making this per-version compaction. (We could keep track of updated nodes at each version for example) + template + void compact(Reference>& p, Version newOldestVersion){ + if (!p) { + return; + } + if (p->updated && p->lastUpdateVersion <= newOldestVersion) { + /* If the node has been updated, figure out which pointer was replaced. 
And delete that pointer */ + auto which = p->replacedPointer; + p->pointer[which] = p->pointer[2]; + p->updated = false; + p->pointer[2] = Reference>(); + //p->pointer[which] = Reference>(); + } + Reference> left = p->left(newOldestVersion); + Reference> right = p->right(newOldestVersion); + compact(left, newOldestVersion); + compact(right, newOldestVersion); + } + } +class ValueOrClearToRef { +public: + static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); } + static ValueOrClearToRef clearTo(KeyRef const& k) { return ValueOrClearToRef(k, true); } + + bool isValue() const { return !isClear; }; + bool isClearTo() const { return isClear; } + + ValueRef const& getValue() const { ASSERT( isValue() ); return item; }; + KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; }; + +private: + ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {} + StringRef item; + bool isClear; +}; + // VersionedMap provides an interface to a partially persistent tree, allowing you to read the values at a particular version, // create new versions, modify the current version of the tree, and forget versions prior to a specific version. 
template @@ -597,6 +639,26 @@ public: erase(key); } + void printDetail() { + PTreeImpl::printTreeDetails(roots.back().second, 0); + } + + void printTree(Version at) { + PTreeImpl::printTree(roots.back().second, at, 0); + } + + void compact(Version newOldestVersion) { + ASSERT( newOldestVersion <= latestVersion ); + //auto newBegin = roots.lower_bound(newOldestVersion); + auto newBegin = lower_bound(roots.begin(), roots.end(), newOldestVersion, rootsComparator()); + for(auto root = roots.begin(); root != newBegin; ++root) { + if(root->second) + PTreeImpl::compact(root->second, newOldestVersion); + } + //printf("\nPrinting the tree at latest version after compaction.\n"); + //PTreeImpl::printTreeDetails(roots.back().second(), 0); + } + // for(auto i = vm.at(version).lower_bound(range.begin); i < range.end; ++i) struct iterator{ explicit iterator(Tree const& root, Version at) : root(root), at(at) {} @@ -686,6 +748,11 @@ public: ViewAtVersion at( Version v ) const { return ViewAtVersion(getRoot(v), v); } ViewAtVersion atLatest() const { return ViewAtVersion(roots.back().second, latestVersion); } + bool isClearContaining( ViewAtVersion const& view, KeyRef key ) { + auto i = view.lastLessOrEqual(key); + return i && i->isClearTo() && i->getEndKey() > key; + } + // TODO: getHistory? 
}; diff --git a/fdbrpc/Locality.cpp b/fdbrpc/Locality.cpp index 424cf81733..27fa654372 100644 --- a/fdbrpc/Locality.cpp +++ b/fdbrpc/Locality.cpp @@ -40,8 +40,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::LogClass: return ProcessClass::WorstFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::NeverAssign; @@ -57,8 +57,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::StorageClass: return ProcessClass::WorstFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::NeverAssign; @@ -76,8 +76,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::TransactionClass: return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -93,8 +93,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::ResolutionClass: return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -110,8 +110,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::TransactionClass: return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: 
return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -129,8 +129,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::TransactionClass: return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -154,8 +154,8 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons case ProcessClass::LogRouterClass: return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: - return ProcessClass::NeverAssign; case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -172,6 +172,7 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; @@ -188,10 +189,18 @@ ProcessClass::Fitness ProcessClass::machineClassFitness( ClusterRole role ) cons return ProcessClass::OkayFit; case ProcessClass::CoordinatorClass: case ProcessClass::TesterClass: + case ProcessClass::StorageCacheClass: return ProcessClass::NeverAssign; default: return ProcessClass::WorstFit; } + case ProcessClass::StorageCache: + switch( _class ) { + case ProcessClass::StorageCacheClass: + return ProcessClass::BestFit; + default: + return ProcessClass::NeverAssign; + } default: return ProcessClass::NeverAssign; } diff --git a/fdbrpc/Locality.h b/fdbrpc/Locality.h index c8f2b096ae..78cb49b638 100644 --- a/fdbrpc/Locality.h +++ b/fdbrpc/Locality.h @@ -43,11 +43,12 @@ struct ProcessClass { DataDistributorClass, CoordinatorClass, RatekeeperClass, + StorageCacheClass, InvalidClass = -1 }; enum Fitness { 
BestFit, GoodFit, UnsetFit, OkayFit, WorstFit, ExcludeFit, NeverAssign }; //cannot be larger than 7 because of leader election mask - enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, NoRole }; + enum ClusterRole { Storage, TLog, Proxy, Master, Resolver, LogRouter, ClusterController, DataDistributor, Ratekeeper, StorageCache, NoRole }; enum ClassSource { CommandLineSource, AutoSource, DBSource, InvalidSource = -1 }; int16_t _class; int16_t _source; @@ -72,6 +73,7 @@ public: else if (s=="data_distributor") _class = DataDistributorClass; else if (s=="coordinator") _class = CoordinatorClass; else if (s=="ratekeeper") _class = RatekeeperClass; + else if (s=="storage_cache") _class = StorageCacheClass; else _class = InvalidClass; } @@ -91,6 +93,7 @@ public: else if (classStr=="data_distributor") _class = DataDistributorClass; else if (classStr=="coordinator") _class = CoordinatorClass; else if (classStr=="ratekeeper") _class = RatekeeperClass; + else if (classStr=="storage_cache") _class = StorageCacheClass; else _class = InvalidClass; if (sourceStr=="command_line") _source = CommandLineSource; @@ -125,6 +128,7 @@ public: case DataDistributorClass: return "data_distributor"; case CoordinatorClass: return "coordinator"; case RatekeeperClass: return "ratekeeper"; + case StorageCacheClass: return "storage_cache"; default: return "invalid"; } } diff --git a/fdbrpc/simulator.h b/fdbrpc/simulator.h index c78fd82edb..bff5cf93b9 100644 --- a/fdbrpc/simulator.h +++ b/fdbrpc/simulator.h @@ -98,6 +98,7 @@ public: case ProcessClass::ClusterControllerClass: return false; case ProcessClass::DataDistributorClass: return false; case ProcessClass::RatekeeperClass: return false; + case ProcessClass::StorageCacheClass: return false; default: return false; } } diff --git a/fdbserver/ApplyMetadataMutation.cpp b/fdbserver/ApplyMetadataMutation.cpp index 7dea8d1723..ffe84e8e4d 100644 --- a/fdbserver/ApplyMetadataMutation.cpp 
+++ b/fdbserver/ApplyMetadataMutation.cpp @@ -46,8 +46,10 @@ Reference getStorageInfo(UID id, std::map const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference logSystem, Version popVersion, - KeyRangeMap >* vecBackupKeys, KeyRangeMap* keyInfo, std::map* uid_applyMutationsData, RequestStream commit, - Database cx, NotifiedVersion* commitVersion, std::map>* storageCache, std::map* tag_popped, bool initialCommit ) { + KeyRangeMap >* vecBackupKeys, KeyRangeMap* keyInfo, KeyRangeMap* cacheInfo, std::map* uid_applyMutationsData, RequestStream commit, + Database cx, NotifiedVersion* commitVersion, std::map>* storageCache, std::map* tag_popped, bool initialCommit ) { + //std::map> cacheRangeInfo; + std::map cachedRangeInfo; for (auto const& m : mutations) { //TraceEvent("MetadataMutation", dbgid).detail("M", m.toString()); @@ -129,6 +131,37 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefrangeContaining(k).end(); + vector serverIndices; + decodeStorageCacheValue(m.param2, serverIndices); + cacheInfo->insert(KeyRangeRef(k,end),serverIndices.size() > 0); + } + } + if(!initialCommit) txnStateStore->set(KeyValueRef(m.param1, m.param2)); + } else if (m.param1.startsWith(cacheKeysPrefix)) { + // Create a private mutation for cache servers + // This is done to make the cache servers aware of the cached key-ranges + if(toCommit) { + MutationRef privatized = m; + privatized.param1 = m.param1.withPrefix(systemKeys.begin, arena); + TraceEvent(SevDebug, "SendingPrivateMutation", dbgid).detail("Original", m.toString()).detail("Privatized", privatized.toString()); + toCommit->addTag( cacheTag ); + toCommit->addTypedMessage(privatized); + } } else if (m.param1.startsWith(configKeysPrefix) || m.param1 == coordinatorsKey) { if(Optional(m.param2) != txnStateStore->readValue(m.param1).get().castTo()) { // FIXME: Make this check more specific, here or by reading configuration whenever there is a change @@ -138,7 +171,7 @@ 
void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefset(KeyValueRef(m.param1, m.param2)); - if(uid_applyMutationsData != NULL) { + if(uid_applyMutationsData != nullptr) { Key uid = m.param1.removePrefix(applyMutationsEndRange.begin); auto &p = (*uid_applyMutationsData)[uid]; p.endVersion = BinaryReader::fromStringRef(m.param2, Unversioned()); @@ -190,7 +223,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefset(KeyValueRef(m.param1, m.param2)); - if(uid_applyMutationsData != NULL) { + if(uid_applyMutationsData != nullptr) { if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) { Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID)); Key k = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)); @@ -205,7 +238,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefset(KeyValueRef(m.param1, m.param2)); if (vecBackupKeys) { Key logDestination; - KeyRef logRangeBegin = logRangesDecodeKey(m.param1, NULL); + KeyRef logRangeBegin = logRangesDecodeKey(m.param1, nullptr); Key logRangeEnd = logRangesDecodeValue(m.param2, &logDestination); // Insert the logDestination into each range of vecBackupKeys overlapping the decoded range @@ -345,7 +378,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefclear(commonEndRange); - if(uid_applyMutationsData != NULL) { + if(uid_applyMutationsData != nullptr) { uid_applyMutationsData->erase(uid_applyMutationsData->lower_bound(m.param1.substr(applyMutationsEndRange.begin.size())), m.param2 == applyMutationsEndRange.end ? 
uid_applyMutationsData->end() : uid_applyMutationsData->lower_bound(m.param2.substr(applyMutationsEndRange.begin.size()))); } @@ -353,7 +386,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRefclear(commonApplyRange); - if(uid_applyMutationsData != NULL) { + if(uid_applyMutationsData != nullptr) { if(m.param1.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID) && m.param2.size() >= applyMutationsKeyVersionMapRange.begin.size() + sizeof(UID)) { Key uid = m.param1.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID)); Key uid2 = m.param2.substr(applyMutationsKeyVersionMapRange.begin.size(), sizeof(UID)); @@ -389,7 +422,7 @@ void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef::iterator itr; + KeyRef keyBegin, keyEnd; + vector serverIndices; + MutationRef mutationBegin, mutationEnd; + + for (itr = cachedRangeInfo.begin(); itr != cachedRangeInfo.end(); ++itr) { + // first figure out the begin and end keys for the cached-range, + // the begin and end mutations can be in any order + decodeStorageCacheValue(itr->second.param2, serverIndices); + // serverIndices count should be greater than zero for beginKey mutations + if (serverIndices.size() > 0) { + keyBegin = itr->first; + mutationBegin = itr->second; + ++itr; + keyEnd = itr->first; + mutationEnd = itr->second; + } else { + keyEnd = itr->first; + mutationEnd = itr->second; + ++itr; + keyBegin = itr->first; + mutationBegin = itr->second; + } + + // Now get all the storage server tags for the cached key-ranges + std::set allTags; + auto ranges = keyInfo->intersectingRanges(KeyRangeRef(keyBegin, keyEnd)); + for(auto it : ranges) { + auto& r = it.value(); + for(auto info : r.src_info) { + allTags.insert(info->tag); + } + for(auto info : r.dest_info) { + allTags.insert(info->tag); + } + } + + // Add the tags to both begin and end mutations + toCommit->addTags(allTags); + toCommit->addTypedMessage(mutationBegin); + toCommit->addTags(allTags); + 
toCommit->addTypedMessage(mutationEnd); + } + } } diff --git a/fdbserver/ApplyMetadataMutation.h b/fdbserver/ApplyMetadataMutation.h index 78bd1cc5d2..54cd140f3c 100644 --- a/fdbserver/ApplyMetadataMutation.h +++ b/fdbserver/ApplyMetadataMutation.h @@ -45,7 +45,7 @@ struct applyMutationsData { Reference getStorageInfo(UID id, std::map>* storageCache, IKeyValueStore* txnStateStore); void applyMetadataMutations(UID const& dbgid, Arena &arena, VectorRef const& mutations, IKeyValueStore* txnStateStore, LogPushData* toCommit, bool *confChange, Reference logSystem = Reference(), Version popVersion = 0, - KeyRangeMap >* vecBackupKeys = NULL, KeyRangeMap* keyInfo = NULL, std::map* uid_applyMutationsData = NULL, RequestStream commit = RequestStream(), - Database cx = Database(), NotifiedVersion* commitVersion = NULL, std::map>* storageCache = NULL, std::map* tag_popped = NULL, bool initialCommit = false ); + KeyRangeMap >* vecBackupKeys = nullptr, KeyRangeMap* keyInfo = nullptr, KeyRangeMap* cacheInfo = nullptr, std::map* uid_applyMutationsData = nullptr, RequestStream commit = RequestStream(), + Database cx = Database(), NotifiedVersion* commitVersion = nullptr, std::map>* storageCache = nullptr, std::map* tag_popped = nullptr, bool initialCommit = false ); #endif diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt index 11f3d1f203..1dd87fee55 100644 --- a/fdbserver/CMakeLists.txt +++ b/fdbserver/CMakeLists.txt @@ -84,6 +84,7 @@ set(FDBSERVER_SRCS SkipList.cpp Status.actor.cpp Status.h + StorageCache.actor.cpp StorageMetrics.actor.h StorageMetrics.h storageserver.actor.cpp diff --git a/fdbserver/ClusterController.actor.cpp b/fdbserver/ClusterController.actor.cpp index be97ec41a3..ffcacce08c 100644 --- a/fdbserver/ClusterController.actor.cpp +++ b/fdbserver/ClusterController.actor.cpp @@ -57,13 +57,15 @@ struct WorkerInfo : NonCopyable { WorkerDetails details; Future haltRatekeeper; Future haltDistributor; + Optional storageCacheInfo; WorkerInfo() : gen(-1), 
reboots(0), lastAvailableTime(now()), priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} WorkerInfo( Future watcher, ReplyPromise reply, Generation gen, WorkerInterface interf, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, bool degraded ) : watcher(watcher), reply(reply), gen(gen), reboots(0), lastAvailableTime(now()), initialClass(initialClass), priorityInfo(priorityInfo), details(interf, processClass, degraded) {} WorkerInfo( WorkerInfo&& r ) BOOST_NOEXCEPT : watcher(std::move(r.watcher)), reply(std::move(r.reply)), gen(r.gen), - reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)) {} + reboots(r.reboots), lastAvailableTime(r.lastAvailableTime), initialClass(r.initialClass), priorityInfo(r.priorityInfo), details(std::move(r.details)), + haltRatekeeper(r.haltRatekeeper), haltDistributor(r.haltDistributor), storageCacheInfo(r.storageCacheInfo) {} void operator=( WorkerInfo&& r ) BOOST_NOEXCEPT { watcher = std::move(r.watcher); reply = std::move(r.reply); @@ -73,6 +75,9 @@ struct WorkerInfo : NonCopyable { initialClass = r.initialClass; priorityInfo = r.priorityInfo; details = std::move(r.details); + haltRatekeeper = r.haltRatekeeper; + haltDistributor = r.haltDistributor; + storageCacheInfo = r.storageCacheInfo; } }; @@ -101,9 +106,11 @@ public: Database db; int unfinishedRecoveries; int logGenerations; + std::map, Optional>> cacheInterfaces; + bool cachePopulated; std::map> clientStatus; - DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), + DBInfo() : masterRegistrationCount(0), recoveryStalled(false), forceRecovery(false), unfinishedRecoveries(0), logGenerations(0), cachePopulated(false), clientInfo( new AsyncVar( ClientDBInfo() ) ), serverInfo( new AsyncVar>( CachedSerialization() ) ), db( 
DatabaseContext::create( clientInfo, Future(), LocalityData(), true, TaskPriority::DefaultEndpoint, true ) ) // SOMEDAY: Locality! @@ -126,6 +133,27 @@ public: serverInfo->set( newInfoCache ); } + void setStorageCache(uint16_t id, const StorageServerInterface& interf) { + CachedSerialization newInfoCache = serverInfo->get(); + auto& newInfo = newInfoCache.mutate(); + bool found = false; + for(auto& it : newInfo.storageCaches) { + if(it.first == id) { + if(it.second != interf) { + newInfo.id = deterministicRandom()->randomUniqueID(); + it.second = interf; + } + found = true; + break; + } + } + if(!found) { + newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.storageCaches.push_back(std::make_pair(id, interf)); + } + serverInfo->set( newInfoCache ); + } + void clearInterf(ProcessClass::ClassType t) { CachedSerialization newInfoCache = serverInfo->get(); auto& newInfo = newInfoCache.mutate(); @@ -137,6 +165,19 @@ public: } serverInfo->set( newInfoCache ); } + + void clearStorageCache(uint16_t id) { + CachedSerialization newInfoCache = serverInfo->get(); + auto& newInfo = newInfoCache.mutate(); + for(auto it = newInfo.storageCaches.begin(); it != newInfo.storageCaches.end(); ++it) { + if(it->first == id) { + newInfo.id = deterministicRandom()->randomUniqueID(); + newInfo.storageCaches.erase(it); + break; + } + } + serverInfo->set( newInfoCache ); + } }; struct UpdateWorkerList { @@ -201,6 +242,11 @@ public: return ( now() - startTime < 2 * FLOW_KNOBS->SERVER_REQUEST_INTERVAL ) || ( IFailureMonitor::failureMonitor().getState(worker.details.interf.storage.getEndpoint()).isAvailable() && ( !checkStable || worker.reboots < 2 ) ); } + bool isLongLivedStateless( Optional const& processId ) { + return (db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == processId) || + (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == 
processId); + } + WorkerDetails getStorageWorker( RecruitStorageRequest const& req ) { std::set>> excludedMachines( req.excludeMachines.begin(), req.excludeMachines.end() ); std::set>> includeDCs( req.includeDCs.begin(), req.includeDCs.end() ); @@ -453,8 +499,7 @@ public: fitness = std::max(fitness, ProcessClass::ExcludeFit); } if( workerAvailable(it.second, checkStable) && fitness < unacceptableFitness && it.second.details.interf.locality.dcId()==dcId ) { - if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) || - (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) { + if (isLongLivedStateless(it.first)) { fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details); } else { fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].first.push_back(it.second.details); @@ -486,8 +531,7 @@ public: auto fitness = it.second.details.processClass.machineClassFitness( role ); if( workerAvailable(it.second, checkStable) && !conf.isExcludedServer(it.second.details.interf.address()) && it.second.details.interf.locality.dcId() == dcId && ( !minWorker.present() || ( it.second.details.interf.id() != minWorker.get().worker.interf.id() && ( fitness < minWorker.get().fitness || (fitness == minWorker.get().fitness && id_used[it.first] <= minWorker.get().used ) ) ) ) ) { - if ((db.serverInfo->get().read().distributor.present() && db.serverInfo->get().read().distributor.get().locality.processId() == it.first) || - (db.serverInfo->get().read().ratekeeper.present() && db.serverInfo->get().read().ratekeeper.get().locality.processId() == it.first)) { + if (isLongLivedStateless(it.first)) { fitness_workers[ std::make_pair(fitness, id_used[it.first]) ].second.push_back(it.second.details); } else { fitness_workers[ std::make_pair(fitness, id_used[it.first]) 
].first.push_back(it.second.details); @@ -1328,6 +1372,7 @@ ACTOR Future clusterWatchDatabase( ClusterControllerData* cluster, Cluster dbInfo.clusterInterface = db->serverInfo->get().read().clusterInterface; dbInfo.distributor = db->serverInfo->get().read().distributor; dbInfo.ratekeeper = db->serverInfo->get().read().ratekeeper; + dbInfo.storageCaches = db->serverInfo->get().read().storageCaches; TraceEvent("CCWDB", cluster->id).detail("Lifetime", dbInfo.masterLifetime.toString()).detail("ChangeID", dbInfo.id); db->serverInfo->set( cachedInfo ); @@ -1580,8 +1625,27 @@ ACTOR Future workerAvailabilityWatch( WorkerInterface worker, ProcessClass } when( wait( failed ) ) { // remove workers that have failed WorkerInfo& failedWorkerInfo = cluster->id_worker[ worker.locality.processId() ]; + if(failedWorkerInfo.storageCacheInfo.present()) { + bool found = false; + for(auto& it : cluster->id_worker) { + if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) { + found = true; + it.second.storageCacheInfo = failedWorkerInfo.storageCacheInfo; + cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional(), it.first); + if(!it.second.reply.isSet()) { + it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, failedWorkerInfo.storageCacheInfo) ); + } + break; + } + } + if(!found) { + cluster->db.cacheInterfaces[failedWorkerInfo.storageCacheInfo.get()] = std::make_pair(Optional(), Optional()); + } + cluster->db.clearStorageCache(failedWorkerInfo.storageCacheInfo.get()); + } + if (!failedWorkerInfo.reply.isSet()) { - failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo) ); + failedWorkerInfo.reply.send( RegisterWorkerReply(failedWorkerInfo.details.processClass, failedWorkerInfo.priorityInfo, Optional()) ); } if (worker.locality.processId() == cluster->masterProcessId) { 
cluster->masterProcessId = Optional(); @@ -1855,7 +1919,7 @@ void clusterRegisterMaster( ClusterControllerData* self, RegisterMasterRequest c if ( it.second.priorityInfo.isExcluded != isExcludedFromConfig ) { it.second.priorityInfo.isExcluded = isExcludedFromConfig; if( !it.second.reply.isSet() ) { - it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) ); } } } @@ -1957,11 +2021,6 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { if ( self->gotFullyRecoveredConfig ) { newPriorityInfo.isExcluded = self->db.fullyRecoveredConfig.isExcludedServer(w.address()); } - - // Notify the worker to register again with new process class/exclusive property - if ( !req.reply.isSet() && newPriorityInfo != req.priorityInfo ) { - req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo) ); - } } if( info == self->id_worker.end() ) { @@ -2021,6 +2080,57 @@ void registerWorker( RegisterWorkerRequest req, ClusterControllerData *self ) { } } } + Optional newStorageCache = req.storageCacheInterf.present() ? 
req.storageCacheInterf.get().first : Optional(); + auto& cacheInfo = self->id_worker[w.locality.processId()].storageCacheInfo; + if (req.storageCacheInterf.present()) { + auto it = self->db.cacheInterfaces.find(req.storageCacheInterf.get().first); + if(it == self->db.cacheInterfaces.end()) { + if(self->db.cachePopulated) { + if(cacheInfo.present()) { + self->db.clearStorageCache(cacheInfo.get()); + } + newStorageCache = Optional(); + cacheInfo = Optional(); + } else { + self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second); + self->db.cacheInterfaces[req.storageCacheInterf.get().first] = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId()); + cacheInfo = req.storageCacheInterf.get().first; + } + } else { + if(!it->second.second.present() || (cacheInfo.present() && cacheInfo.get() == it->first) ) { + self->db.setStorageCache(req.storageCacheInterf.get().first, req.storageCacheInterf.get().second); + it->second = std::make_pair(req.storageCacheInterf.get().second, w.locality.processId()); + cacheInfo = req.storageCacheInterf.get().first; + } + else { + if(cacheInfo.present()) { + self->db.clearStorageCache(cacheInfo.get()); + } + newStorageCache = Optional(); + cacheInfo = Optional(); + } + } + } else { + newStorageCache = cacheInfo; + } + + if(self->gotProcessClasses && newProcessClass == ProcessClass::StorageCacheClass && !newStorageCache.present()) { + for(auto& it : self->db.cacheInterfaces) { + if(!it.second.second.present()) { + it.second.second = w.locality.processId(); + self->id_worker[w.locality.processId()].storageCacheInfo = it.first; + newStorageCache = it.first; + break; + } + } + } + + // Notify the worker to register again with new process class/exclusive property + if ( !req.reply.isSet() && ( newPriorityInfo != req.priorityInfo || + newStorageCache.present() != req.storageCacheInterf.present() || + (newStorageCache.present() && newStorageCache.get() != 
req.storageCacheInterf.get().first) ) ) { + req.reply.send( RegisterWorkerReply(newProcessClass, newPriorityInfo, newStorageCache) ); + } } #define TIME_KEEPER_VERSION LiteralStringRef("1") @@ -2240,7 +2350,7 @@ ACTOR Future monitorProcessClasses(ClusterControllerData *self) { w.second.details.processClass = newProcessClass; w.second.priorityInfo.processClassFitness = newProcessClass.machineClassFitness(ProcessClass::ClusterController); if (!w.second.reply.isSet()) { - w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo) ); + w.second.reply.send( RegisterWorkerReply(w.second.details.processClass, w.second.priorityInfo, w.second.storageCacheInfo) ); } } } @@ -2300,6 +2410,80 @@ ACTOR Future monitorServerInfoConfig(ClusterControllerData::DBInfo* db) { } } +ACTOR Future monitorStorageCache(ClusterControllerData* self) { + loop { + state ReadYourWritesTransaction tr(self->db.db); + loop { + try { + tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); + tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); + tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); + + Optional changeVal = wait(tr.get(cacheChangeKey)); + Standalone changeKeys = wait(tr.getRange(cacheChangeKeys, CLIENT_KNOBS->TOO_MANY)); + ASSERT( !changeKeys.more && changeKeys.size() < CLIENT_KNOBS->TOO_MANY ); + + std::set changeIDs; + for(auto& it : changeKeys) { + changeIDs.insert(cacheChangeKeyDecodeIndex(it.key)); + } + + for(auto& it : changeIDs) { + if(!self->db.cacheInterfaces.count(it)) { + self->db.cacheInterfaces[it] = std::make_pair(Optional(), Optional()); + } + } + + std::vector removeIDs; + for(auto& it : self->db.cacheInterfaces) { + if(!changeIDs.count(it.first)) { + removeIDs.push_back(it.first); + if(it.second.second.present()) { + self->id_worker[it.second.second.get()].storageCacheInfo = Optional(); + } + self->db.clearStorageCache(it.first); + } + } + + for(auto& it : removeIDs) { + self->db.cacheInterfaces.erase(it); + } + + 
for(auto& c : self->db.cacheInterfaces) { + if(!c.second.second.present()) { + bool found = false; + for(auto& it : self->id_worker) { + if(!it.second.storageCacheInfo.present() && it.second.details.processClass == ProcessClass::StorageCacheClass) { + found = true; + it.second.storageCacheInfo = c.first; + c.second.second = it.first; + if(!it.second.reply.isSet()) { + it.second.reply.send( RegisterWorkerReply(it.second.details.processClass, it.second.priorityInfo, c.first) ); + } + break; + } + } + if(!found) { + break; + } + } + } + + state Future configChangeFuture = tr.watch(cacheChangeKey); + + self->db.cachePopulated = true; + wait(tr.commit()); + wait(configChangeFuture); + + break; + } + catch (Error &e) { + wait(tr.onError(e)); + } + } + } +} + ACTOR Future monitorClientTxnInfoConfigs(ClusterControllerData::DBInfo* db) { loop { state ReadYourWritesTransaction tr(db->db); @@ -2350,7 +2534,7 @@ ACTOR Future updatedChangingDatacenters(ClusterControllerData *self) { if ( worker.priorityInfo.dcFitness > newFitness ) { worker.priorityInfo.dcFitness = newFitness; if(!worker.reply.isSet()) { - worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) ); + worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) ); } } else { state int currentFit = ProcessClass::BestFit; @@ -2363,7 +2547,7 @@ ACTOR Future updatedChangingDatacenters(ClusterControllerData *self) { updated = true; it.second.priorityInfo.dcFitness = fitness; if(!it.second.reply.isSet()) { - it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) ); } } } @@ -2402,7 +2586,7 @@ ACTOR Future updatedChangedDatacenters(ClusterControllerData *self) { if( worker.priorityInfo.dcFitness != newFitness ) { worker.priorityInfo.dcFitness = newFitness; 
if(!worker.reply.isSet()) { - worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo ) ); + worker.reply.send( RegisterWorkerReply( worker.details.processClass, worker.priorityInfo, worker.storageCacheInfo ) ); } } } else { @@ -2416,7 +2600,7 @@ ACTOR Future updatedChangedDatacenters(ClusterControllerData *self) { updated = true; it.second.priorityInfo.dcFitness = fitness; if(!it.second.reply.isSet()) { - it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo ) ); + it.second.reply.send( RegisterWorkerReply( it.second.details.processClass, it.second.priorityInfo, it.second.storageCacheInfo ) ); } } } @@ -2703,8 +2887,8 @@ ACTOR Future clusterControllerCore( ClusterControllerFullInterface interf, self.addActor.send( handleForcedRecoveries(&self, interf) ); self.addActor.send( monitorDataDistributor(&self) ); self.addActor.send( monitorRatekeeper(&self) ); + self.addActor.send( monitorStorageCache(&self) ); self.addActor.send( traceCounters("ClusterControllerMetrics", self.id, SERVER_KNOBS->STORAGE_LOGGING_DELAY, &self.clusterControllerMetrics, self.id.toString() + "/ClusterControllerMetrics") ); - //printf("%s: I am the cluster controller\n", g_network->getLocalAddress().toString().c_str()); loop choose { diff --git a/fdbserver/ClusterRecruitmentInterface.h b/fdbserver/ClusterRecruitmentInterface.h index d8432c7d1e..cf238f1b79 100644 --- a/fdbserver/ClusterRecruitmentInterface.h +++ b/fdbserver/ClusterRecruitmentInterface.h @@ -175,13 +175,14 @@ struct RegisterWorkerReply { constexpr static FileIdentifier file_identifier = 16475696; ProcessClass processClass; ClusterControllerPriorityInfo priorityInfo; + Optional storageCache; RegisterWorkerReply() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown) {} - RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo) : processClass(processClass), priorityInfo(priorityInfo) 
{} + RegisterWorkerReply(ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Optional storageCache) : processClass(processClass), priorityInfo(priorityInfo), storageCache(storageCache) {} template void serialize( Ar& ar ) { - serializer(ar, processClass, priorityInfo); + serializer(ar, processClass, priorityInfo, storageCache); } }; @@ -194,16 +195,17 @@ struct RegisterWorkerRequest { Generation generation; Optional distributorInterf; Optional ratekeeperInterf; + Optional> storageCacheInterf; ReplyPromise reply; bool degraded; RegisterWorkerRequest() : priorityInfo(ProcessClass::UnsetFit, false, ClusterControllerPriorityInfo::FitnessUnknown), degraded(false) {} - RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf, bool degraded) : - wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), degraded(degraded) {} + RegisterWorkerRequest(WorkerInterface wi, ProcessClass initialClass, ProcessClass processClass, ClusterControllerPriorityInfo priorityInfo, Generation generation, Optional ddInterf, Optional rkInterf, Optional> storageCacheInterf, bool degraded) : + wi(wi), initialClass(initialClass), processClass(processClass), priorityInfo(priorityInfo), generation(generation), distributorInterf(ddInterf), ratekeeperInterf(rkInterf), storageCacheInterf(storageCacheInterf), degraded(degraded) {} template void serialize( Ar& ar ) { - serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, reply, degraded); + serializer(ar, wi, initialClass, processClass, priorityInfo, generation, distributorInterf, ratekeeperInterf, storageCacheInterf, reply, degraded); } }; diff --git a/fdbserver/Knobs.cpp b/fdbserver/Knobs.cpp index cda58c32d3..9ca58cb830 
100644 --- a/fdbserver/Knobs.cpp +++ b/fdbserver/Knobs.cpp @@ -81,6 +81,7 @@ ServerKnobs::ServerKnobs(bool randomize, ClientKnobs* clientKnobs) { init( DISK_QUEUE_MAX_TRUNCATE_BYTES, 2<<30 ); if ( randomize && BUGGIFY ) DISK_QUEUE_MAX_TRUNCATE_BYTES = 0; init( TLOG_DEGRADED_DELAY_COUNT, 5 ); init( TLOG_DEGRADED_DURATION, 5.0 ); + init( MAX_CACHE_VERSIONS, 10e6 ); init( TLOG_IGNORE_POP_AUTO_ENABLE_DELAY, 300.0 ); init( TXS_POPPED_MAX_DELAY, 1.0 ); if ( randomize && BUGGIFY ) TXS_POPPED_MAX_DELAY = deterministicRandom()->random01(); diff --git a/fdbserver/Knobs.h b/fdbserver/Knobs.h index a58a9e9fb7..3d12be885a 100644 --- a/fdbserver/Knobs.h +++ b/fdbserver/Knobs.h @@ -84,6 +84,7 @@ public: int DISK_QUEUE_MAX_TRUNCATE_BYTES; // A truncate larger than this will cause the file to be replaced instead. int TLOG_DEGRADED_DELAY_COUNT; double TLOG_DEGRADED_DURATION; + int64_t MAX_CACHE_VERSIONS; double TXS_POPPED_MAX_DELAY; // Data distribution queue diff --git a/fdbserver/MasterProxyServer.actor.cpp b/fdbserver/MasterProxyServer.actor.cpp index b7740e9896..c573f33187 100644 --- a/fdbserver/MasterProxyServer.actor.cpp +++ b/fdbserver/MasterProxyServer.actor.cpp @@ -207,6 +207,7 @@ struct ProxyCommitData { uint64_t mostRecentProcessedRequestNumber; KeyRangeMap>> keyResolvers; KeyRangeMap keyInfo; + KeyRangeMap cacheInfo; std::map uid_applyMutationsData; bool firstProxy; double lastCoalesceTime; @@ -258,6 +259,16 @@ struct ProxyCommitData { return tags; } + const bool needsCacheTag(KeyRangeRef range) { + auto ranges = cacheInfo.intersectingRanges(range); + for(auto r : ranges) { + if(r.value()) { + return true; + } + } + return false; + } + ProxyCommitData(UID dbgid, MasterInterface master, RequestStream getConsistentReadVersion, Version recoveryTransactionVersion, RequestStream commit, Reference> db, bool firstProxy) : dbgid(dbgid), stats(dbgid, &version, &committedVersion, &commitBatchesMemBytesCount), master(master), logAdapter(NULL), txnStateStore(NULL), 
popRemoteTxs(false), @@ -658,7 +669,7 @@ ACTOR Future commitBatch( for (int resolver = 0; resolver < resolution.size(); resolver++) committed = committed && resolution[resolver].stateMutations[versionIndex][transactionIndex].committed; if (committed) - applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, NULL, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); + applyMetadataMutations( self->dbgid, arena, resolution[0].stateMutations[versionIndex][transactionIndex].mutations, self->txnStateStore, nullptr, &forceRecovery, self->logSystem, 0, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? &self->uid_applyMutationsData : nullptr, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); if( resolution[0].stateMutations[versionIndex][transactionIndex].mutations.size() && firstStateMutations ) { ASSERT(committed); @@ -738,7 +749,7 @@ ACTOR Future commitBatch( { if (committed[t] == ConflictBatch::TransactionCommitted && (!locked || trs[t].isLockAware())) { commitCount++; - applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, self->firstProxy ? &self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); + applyMetadataMutations(self->dbgid, arena, trs[t].transaction.mutations, self->txnStateStore, &toCommit, &forceRecovery, self->logSystem, commitVersion+1, &self->vecBackupKeys, &self->keyInfo, &self->cacheInfo, self->firstProxy ? 
&self->uid_applyMutationsData : NULL, self->commit, self->cx, &self->committedVersion, &self->storageCache, &self->tag_popped); } if(firstStateMutations) { ASSERT(committed[t] == ConflictBatch::TransactionCommitted); @@ -809,11 +820,16 @@ ACTOR Future commitBatch( if (debugMutation("ProxyCommit", commitVersion, m)) TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(tags)).detail("Mutation", m.toString()).detail("Version", commitVersion); + toCommit.addTags(tags); + if(self->cacheInfo[m.param1]) { + toCommit.addTag(cacheTag); + } toCommit.addTypedMessage(m); } else if (m.type == MutationRef::ClearRange) { - auto ranges = self->keyInfo.intersectingRanges(KeyRangeRef(m.param1, m.param2)); + KeyRangeRef clearRange(KeyRangeRef(m.param1, m.param2)); + auto ranges = self->keyInfo.intersectingRanges(clearRange); auto firstRange = ranges.begin(); ++firstRange; if (firstRange == ranges.end()) { @@ -833,8 +849,12 @@ ACTOR Future commitBatch( } if (debugMutation("ProxyCommit", commitVersion, m)) TraceEvent("ProxyCommitTo", self->dbgid).detail("To", describe(allSources)).detail("Mutation", m.toString()).detail("Version", commitVersion); + toCommit.addTags(allSources); } + if(self->needsCacheTag(clearRange)) { + toCommit.addTag(cacheTag); + } toCommit.addTypedMessage(m); } else UNREACHABLE(); @@ -1780,7 +1800,7 @@ ACTOR Future masterProxyServerCore( Arena arena; bool confChanges; - applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, NULL, &confChanges, Reference(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, commitData.firstProxy ? &commitData.uid_applyMutationsData : NULL, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true ); + applyMetadataMutations(commitData.dbgid, arena, mutations, commitData.txnStateStore, nullptr, &confChanges, Reference(), 0, &commitData.vecBackupKeys, &commitData.keyInfo, &commitData.cacheInfo, commitData.firstProxy ? 
&commitData.uid_applyMutationsData : nullptr, commitData.commit, commitData.cx, &commitData.committedVersion, &commitData.storageCache, &commitData.tag_popped, true ); } auto lockedKey = commitData.txnStateStore->readValue(databaseLockedKey).get(); diff --git a/fdbserver/OldTLogServer_6_0.actor.cpp b/fdbserver/OldTLogServer_6_0.actor.cpp index bdfc14306e..eb1b5b9dd3 100644 --- a/fdbserver/OldTLogServer_6_0.actor.cpp +++ b/fdbserver/OldTLogServer_6_0.actor.cpp @@ -678,6 +678,80 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { + if (self->ignorePopRequest) { + TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); + + if (self->toBePopped.find(inputTag) == self->toBePopped.end() + || to > self->toBePopped[inputTag]) { + self->toBePopped[inputTag] = to; + } + // add the pop to the toBePopped map + TraceEvent(SevDebug, "IgnoringPopRequest") + .detail("IgnorePopDeadline", self->ignorePopDeadline) + .detail("Tag", inputTag.toString()) + .detail("Version", to); + return Void(); + } + state Version upTo = to; + int8_t tagLocality = inputTag.locality; + if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { + upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); + tagLocality = tagLocalityLogRouter; + } + state Tag tag(tagLocality, inputTag.id); + auto tagData = logData->getTagData(tag); + if (!tagData) { + tagData = logData->createTagData(tag, upTo, true, true, false); + } else if (upTo > tagData->popped) { + tagData->popped = upTo; + tagData->poppedRecently = true; + + if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { + tagData->unpoppedRecovered = false; + logData->unpoppedRecoveredTags--; + TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", 
tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); + if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { + logData->recoveryComplete.send(Void()); + } + } + + if (upTo > logData->persistentDataDurableVersion) + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); + //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); + } + return Void(); +} + +ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ) { + // timeout check for ignorePopRequest + if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { + + TraceEvent("EnableTLogPlayAllIgnoredPops"); + // use toBePopped and issue all the pops + state std::map::iterator it; + state vector> ignoredPops; + self->ignorePopRequest = false; + self->ignorePopUid = ""; + self->ignorePopDeadline = 0.0; + for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { + TraceEvent("PlayIgnoredPop") + .detail("Tag", it->first.toString()) + .detail("Version", it->second); + ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); + } + self->toBePopped.clear(); + wait(waitForAll(ignoredPops)); + TraceEvent("ResetIgnorePopRequest") + .detail("Now", g_network->now()) + .detail("IgnorePopRequest", self->ignorePopRequest) + .detail("IgnorePopDeadline", self->ignorePopDeadline); + } + wait(tLogPopCore(self, req.tag, req.to, logData)); + req.reply.send(Void()); + return Void(); +} + // This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources. // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important // work (e.g. commits). 
@@ -697,6 +771,26 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; + //FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries. + // It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen. + Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { @@ -916,80 +1010,6 @@ std::deque> & getVersionMessages( Re return tagData->versionMessages; }; -ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest) { - TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); - - if (self->toBePopped.find(inputTag) == self->toBePopped.end() - || to > self->toBePopped[inputTag]) { - self->toBePopped[inputTag] = to; - } - // add the pop to the toBePopped map - TraceEvent(SevDebug, "IgnoringPopRequest") - .detail("IgnorePopDeadline", self->ignorePopDeadline) - .detail("Tag", inputTag.toString()) - .detail("Version", to); - return Void(); - } - state Version upTo = to; - int8_t tagLocality = inputTag.locality; - if (logData->logSystem->get().isValid() && 
logData->logSystem->get()->isPseudoLocality(tagLocality)) { - upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); - tagLocality = tagLocalityLogRouter; - } - state Tag tag(tagLocality, inputTag.id); - auto tagData = logData->getTagData(tag); - if (!tagData) { - tagData = logData->createTagData(tag, upTo, true, true, false); - } else if (upTo > tagData->popped) { - tagData->popped = upTo; - tagData->poppedRecently = true; - - if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { - tagData->unpoppedRecovered = false; - logData->unpoppedRecoveredTags--; - TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); - if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { - logData->recoveryComplete.send(Void()); - } - } - - if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); - //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); - } - return Void(); -} - -ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ) { - // timeout check for ignorePopRequest - if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { - - TraceEvent("EnableTLogPlayAllIgnoredPops"); - // use toBePopped and issue all the pops - state std::map::iterator it; - state vector> ignoredPops; - self->ignorePopRequest = false; - self->ignorePopUid = ""; - self->ignorePopDeadline = 0.0; - for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { - TraceEvent("PlayIgnoredPop") - .detail("Tag", it->first.toString()) - .detail("Version", it->second); - ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); - } - 
self->toBePopped.clear(); - wait(waitForAll(ignoredPops)); - TraceEvent("ResetIgnorePopRequest") - .detail("Now", g_network->now()) - .detail("IgnorePopRequest", self->ignorePopRequest) - .detail("IgnorePopDeadline", self->ignorePopDeadline); - } - wait(tLogPopCore(self, req.tag, req.to, logData)); - req.reply.send(Void()); - return Void(); -} - void peekMessagesFromMemory( Reference self, TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) { ASSERT( !messages.getLength() ); diff --git a/fdbserver/ServerDBInfo.h b/fdbserver/ServerDBInfo.h index 67407e1fa9..cf3a6178dc 100644 --- a/fdbserver/ServerDBInfo.h +++ b/fdbserver/ServerDBInfo.h @@ -50,6 +50,7 @@ struct ServerDBInfo { LogSystemConfig logSystemConfig; std::vector priorCommittedLogServers; // If !fullyRecovered and logSystemConfig refers to a new log system which may not have been committed to the coordinated state yet, then priorCommittedLogServers are the previous, fully committed generation which need to stay alive in case this recovery fails Optional latencyBandConfig; + std::vector> storageCaches; explicit ServerDBInfo() : recoveryCount(0), recoveryState(RecoveryState::UNINITIALIZED) {} @@ -58,7 +59,7 @@ struct ServerDBInfo { template void serialize( Ar& ar ) { - serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig); + serializer(ar, id, clusterInterface, client, distributor, master, ratekeeper, resolvers, recoveryCount, recoveryState, masterLifetime, logSystemConfig, priorCommittedLogServers, latencyBandConfig, storageCaches); } }; diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 4c56421b1f..e9fdfda7fc 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -1254,6 +1254,13 @@ void setupSimulatedSystem(vector>* systemActors, std::string baseFo int 
dcCoordinators = coordinatorCount / dataCenters + (dc < coordinatorCount%dataCenters); printf("Datacenter %d: %d/%d machines, %d/%d coordinators\n", dc, machines, machineCount, dcCoordinators, coordinatorCount); ASSERT( dcCoordinators <= machines ); + + //FIXME: temporarily code to test storage cache + //TODO: caching disabled for this merge + //if(dc==0) { + // machines++; + //} + int useSeedForMachine = deterministicRandom()->randomInt(0, machines); Standalone zoneId; Standalone newZoneId; @@ -1277,6 +1284,13 @@ void setupSimulatedSystem(vector>* systemActors, std::string baseFo nonVersatileMachines++; } + //FIXME: temporarily code to test storage cache + //TODO: caching disabled for this merge + //if(machine==machines-1 && dc==0) { + // processClass = ProcessClass(ProcessClass::StorageCacheClass, ProcessClass::CommandLineSource); + // nonVersatileMachines++; + //} + std::vector ips; for (int i = 0; i < processesPerMachine; i++) { ips.push_back(makeIPAddressForSim(useIPv6, { 2, dc, deterministicRandom()->randomInt(1, i + 2), machine })); diff --git a/fdbserver/StorageCache.actor.cpp b/fdbserver/StorageCache.actor.cpp new file mode 100644 index 0000000000..2887d2017c --- /dev/null +++ b/fdbserver/StorageCache.actor.cpp @@ -0,0 +1,1007 @@ +/* + * StorageCache.actor.cpp + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2013-2019 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "fdbserver/Knobs.h" +#include "fdbserver/ServerDBInfo.h" +#include "fdbclient/StorageServerInterface.h" +#include "fdbclient/VersionedMap.h" +#include "fdbclient/KeyRangeMap.h" +#include "fdbclient/Atomic.h" +#include "fdbclient/Notified.h" +#include "fdbserver/LogSystem.h" +#include "fdbserver/WaitFailure.h" +#include "fdbserver/WorkerInterface.actor.h" +#include "flow/actorcompiler.h" // This must be the last #include. + + +//TODO storageCache server shares quite a bit of storageServer functionality, although simplified +// Need to look into refactoring common code out for better code readability and to avoid duplication + +//TODO rename wrong_shard_server error to wrong_cache_server +inline bool canReplyWith(Error e) { + switch(e.code()) { + case error_code_transaction_too_old: + case error_code_future_version: + case error_code_wrong_shard_server: + case error_code_process_behind: + //case error_code_all_alternatives_failed: + return true; + default: + return false; + }; +} + +const int VERSION_OVERHEAD = 64 + sizeof(Version) + sizeof(Standalone) + //mutationLog, 64b overhead for map + 2 * (64 + sizeof(Version) + sizeof(Reference::PTreeT>)); //versioned map [ x2 for createNewVersion(version+1) ], 64b overhead for map +static int mvccStorageBytes( MutationRef const& m ) { return VersionedMap::overheadPerItem * 2 + (MutationRef::OVERHEAD_BYTES + m.param1.size() + m.param2.size()) * 2; } + +struct StorageCacheData { + typedef VersionedMap VersionedData; +private: + // in-memory versioned struct (PTree as of now. 
Subject to change) + VersionedData versionedData; + // in-memory mutationLog that the versionedData contains references to + // TODO change it to a deque, already contains mutations in version order + std::map> mutationLog; // versions (durableVersion, version] + +public: + UID thisServerID; // unique id + uint16_t index; // server index + Reference>> logSystem; + Key ck; //cacheKey + KeyRangeMap cachedRangeMap; // map of cached key-ranges + + // The following are in rough order from newest to oldest + // TODO double check which ones we need for storageCache servers + Version lastTLogVersion, lastVersionWithData; + NotifiedVersion version; // current version i.e. the max version that can be read from the cache + NotifiedVersion desiredOldestVersion; // oldestVersion can be increased to this after compaction + NotifiedVersion oldestVersion; // Min version that might be read from the cache + + // TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use + Future compactionInProgress; + + // TODO do we need otherError here? 
+ Promise otherError; + + int64_t versionLag; // An estimate for how many versions it takes for the data to move from the logs to this cache server + bool behind; + + // TODO double check which ones we need for storageCache servers + struct Counters { + CounterCollection cc; + Counter allQueries, getKeyQueries, getValueQueries, getRangeQueries, finishedQueries, rowsQueried, bytesQueried, watchQueries; + Counter bytesInput, mutationBytes; // Like bytesInput but without MVCC accounting + Counter mutations, setMutations, clearRangeMutations, atomicMutations; + Counter updateBatches, updateVersions; + Counter loops; + Counter readsRejected; + + //LatencyBands readLatencyBands; + + Counters(StorageCacheData* self) + : cc("StorageCacheServer", self->thisServerID.toString()), + getKeyQueries("GetKeyQueries", cc), + getValueQueries("GetValueQueries",cc), + getRangeQueries("GetRangeQueries", cc), + allQueries("QueryQueue", cc), + finishedQueries("FinishedQueries", cc), + rowsQueried("RowsQueried", cc), + bytesQueried("BytesQueried", cc), + watchQueries("WatchQueries", cc), + bytesInput("BytesInput", cc), + mutationBytes("MutationBytes", cc), + mutations("Mutations", cc), + setMutations("SetMutations", cc), + clearRangeMutations("ClearRangeMutations", cc), + atomicMutations("AtomicMutations", cc), + updateBatches("UpdateBatches", cc), + updateVersions("UpdateVersions", cc), + loops("Loops", cc), + readsRejected("ReadsRejected", cc) + { + specialCounter(cc, "LastTLogVersion", [self](){ return self->lastTLogVersion; }); + specialCounter(cc, "Version", [self](){ return self->version.get(); }); + specialCounter(cc, "VersionLag", [self](){ return self->versionLag; }); + } + } counters; + + explicit StorageCacheData(UID thisServerID, uint16_t index) + : thisServerID(thisServerID), index(index), + logSystem(new AsyncVar>()), + lastTLogVersion(0), lastVersionWithData(0), + compactionInProgress(Void()), + versionLag(0), behind(false), counters(this) + { + 
version.initMetric(LiteralStringRef("StorageCacheData.Version"), counters.cc.id); + desiredOldestVersion.initMetric(LiteralStringRef("StorageCacheData.DesriedOldestVersion"), counters.cc.id); + oldestVersion.initMetric(LiteralStringRef("StorageCacheData.OldestVersion"), counters.cc.id); + } + + void addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation); + + bool isReadable( KeyRangeRef const& keys ) { + auto cr = cachedRangeMap.intersectingRanges(keys); + for(auto i = cr.begin(); i != cr.end(); ++i) + if (!i->value()) + return false; + return true; + } + + Arena lastArena; + std::map> const & getMutationLog() { return mutationLog; } + std::map>& getMutableMutationLog() { return mutationLog; } + VersionedData const& data() const { return versionedData; } + VersionedData& mutableData() { return versionedData; } + + Standalone& addVersionToMutationLog(Version v) { + // return existing version... + auto m = mutationLog.find(v); + if (m != mutationLog.end()) + return m->second; + + // ...or create a new one + auto& u = mutationLog[v]; + u.version = v; + if (lastArena.getSize() >= 65536) lastArena = Arena(4096); + u.arena() = lastArena; + counters.bytesInput += VERSION_OVERHEAD; + return u; + } + + MutationRef addMutationToMutationLog(Standalone &mLV, MutationRef const& m){ + //TODO find out more + //byteSampleApplyMutation(m, mLV.version); + counters.bytesInput += mvccStorageBytes(m); + return mLV.mutations.push_back_deep( mLV.arena(), m ); + } + +}; + +///////////////////////////////////// Queries ///////////////////////////////// +#pragma region Queries +ACTOR Future waitForVersion( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version < data->oldestVersion.get() || version <= 0) throw transaction_too_old(); + else if (version <= data->version.get()) + return 
version; + + if(data->behind && version > data->version.get()) { + throw process_behind(); + } + + if(deterministicRandom()->random01() < 0.001) + TraceEvent("WaitForVersion1000x"); + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + //FIXME: A bunch of these can block with or without the following delay 0. + //wait( delay(0) ); // don't do a whole bunch of these at once + if (version < data->oldestVersion.get()) throw transaction_too_old(); + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future waitForVersionNoTooOld( StorageCacheData* data, Version version ) { + // This could become an Actor transparently, but for now it just does the lookup + if (version == latestVersion) + version = std::max(Version(1), data->version.get()); + if (version <= data->version.get()) + return version; + choose { + when ( wait( data->version.whenAtLeast(version) ) ) { + return version; + } + when ( wait( delay( SERVER_KNOBS->FUTURE_VERSION_DELAY ) ) ) { + if(deterministicRandom()->random01() < 0.001) + TraceEvent(SevWarn, "CacheServerFutureVersion1000x", data->thisServerID) + .detail("Version", version) + .detail("MyVersion", data->version.get()) + .detail("ServerID", data->thisServerID); + throw future_version(); + } + } +} + +ACTOR Future getValueQ( StorageCacheData* data, GetValueRequest req ) { + state int64_t resultSize = 0; + + try { + ++data->counters.getValueQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //TODO later + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority 
(to obtain accurate queue lengths) + // so we need to downgrade here + + //TODO what's this? + wait( delay(0, TaskPriority::DefaultEndpoint) ); + + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.DoRead"); //.detail("TaskID", g_network->getCurrentTask()); + + state Optional v; + state Version version = wait( waitForVersion( data, req.version ) ); + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterVersion"); //.detail("TaskID", g_network->getCurrentTask()); + + if (!data->cachedRangeMap[req.key]) { + //TraceEvent("WrongCacheServer", data->thisServerID).detail("Key", req.key).detail("Version", version).detail("In", "getValueQ"); + throw wrong_shard_server(); + } + + state int path = 0; + auto i = data->data().at(version).lastLessOrEqual(req.key); + if (i && i->isValue() && i.key() == req.key) { + v = (Value)i->getValue(); + path = 1; + } + + //debugMutation("CacheGetValue", version, MutationRef(MutationRef::DebugKey, req.key, v.present()?v.get():LiteralStringRef(""))); + //debugMutation("CacheGetPath", version, MutationRef(MutationRef::DebugKey, req.key, path==0?LiteralStringRef("0"):path==1?LiteralStringRef("1"):LiteralStringRef("2"))); + + if (v.present()) { + ++data->counters.rowsQueried; + resultSize = v.get().size(); + data->counters.bytesQueried += resultSize; + } + + if( req.debugID.present() ) + g_traceBatch.addEvent("GetValueDebug", req.debugID.get().first(), "getValueQ.AfterRead"); //.detail("TaskID", g_network->getCurrentTask()); + + GetValueReply reply(v); + req.reply.send(reply); + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + //--data->readQueueSizeMetric; + //if(data->latencyBandConfig.present()) { + // int maxReadBytes = data->latencyBandConfig.get().readConfig.maxReadBytes.orDefault(std::numeric_limits::max()); + // 
data->counters.readLatencyBands.addMeasurement(timer() - req.requestTime(), resultSize > maxReadBytes); + //} + + return Void(); +}; + +//TODO Implement the reverse readRange +GetKeyValuesReply readRange(StorageCacheData* data, Version version, KeyRangeRef range, int limit, int* pLimitBytes) { + GetKeyValuesReply result; + StorageCacheData::VersionedData::ViewAtVersion view = data->data().at(version); + StorageCacheData::VersionedData::iterator vCurrent = view.end(); + KeyRef readBegin; + KeyRef rangeBegin = range.begin; + KeyRef rangeEnd = range.end; + + //We might care about a clear beginning before start that runs into range + vCurrent = view.lastLessOrEqual(rangeBegin); + if (vCurrent && vCurrent->isClearTo() && vCurrent->getEndKey() > rangeBegin) + readBegin = vCurrent->getEndKey(); + else + readBegin = rangeBegin; + + vCurrent = view.lower_bound(readBegin); + ASSERT(!vCurrent || vCurrent.key() >= readBegin); + if (vCurrent) { + auto b = vCurrent; + --b; + ASSERT(!b || b.key() < readBegin); + } + int accumulatedBytes = 0; + while (vCurrent && vCurrent.key() < rangeEnd && limit > 0 && accumulatedBytes < *pLimitBytes) { + if (!vCurrent->isClearTo()) { + result.data.push_back_deep(result.arena, KeyValueRef(vCurrent.key(), vCurrent->getValue())); + accumulatedBytes += sizeof(KeyValueRef) + result.data.end()[-1].expectedSize(); + --limit; + } + ++vCurrent; + } + + *pLimitBytes -= accumulatedBytes; + ASSERT(result.data.size() == 0 || *pLimitBytes + result.data.end()[-1].expectedSize() + sizeof(KeyValueRef) > 0); + result.more = limit == 0 || *pLimitBytes <= 0; // FIXME: Does this have to be exact? + result.version = version; + return result; +} + +Key findKey( StorageCacheData* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) +// Attempts to find the key indicated by sel in the data at version, within range. +// Precondition: selectorInRange(sel, range) +// If it is found, offset is set to 0 and a key is returned which falls inside range. 
+// If the search would depend on any key outside range OR if the key selector offset is too large (range read returns too many bytes), it returns either +// a negative offset and a key in [range.begin, sel.getKey()], indicating the key is (the first key <= returned key) + offset, or +// a positive offset and a key in (sel.getKey(), range.end], indicating the key is (the first key >= returned key) + offset-1 +// The range passed in to this function should specify a shard. If range.begin is repeatedly not the beginning of a shard, then it is possible to get stuck looping here +{ + ASSERT( version != latestVersion ); + ASSERT( selectorInRange(sel, range) && version >= data->oldestVersion.get()); + + // Count forward or backward distance items, skipping the first one if it == key and skipEqualKey + bool forward = sel.offset > 0; // If forward, result >= sel.getKey(); else result <= sel.getKey() + int sign = forward ? +1 : -1; + bool skipEqualKey = sel.orEqual == forward; + int distance = forward ? sel.offset : 1-sel.offset; + + //Don't limit the number of bytes if this is a trivial key selector (there will be at most two items returned from the read range in this case) + int maxBytes; + if (sel.offset <= 1 && sel.offset >= 0) + maxBytes = std::numeric_limits::max(); + else + maxBytes = BUGGIFY ? SERVER_KNOBS->BUGGIFY_LIMIT_BYTES : SERVER_KNOBS->STORAGE_LIMIT_BYTES; + + GetKeyValuesReply rep = readRange( data, version, + forward ? 
KeyRangeRef(sel.getKey(), range.end) : KeyRangeRef(range.begin, keyAfter(sel.getKey())), + (distance + skipEqualKey)*sign, &maxBytes ); + bool more = rep.more && rep.data.size() != distance + skipEqualKey; + + //If we get only one result in the reverse direction as a result of the data being too large, we could get stuck in a loop + if(more && !forward && rep.data.size() == 1) { + TEST(true); //Reverse key selector returned only one result in range read + maxBytes = std::numeric_limits::max(); + GetKeyValuesReply rep2 = readRange( data, version, KeyRangeRef(range.begin, keyAfter(sel.getKey())), -2, &maxBytes ); + rep = rep2; + more = rep.more && rep.data.size() != distance + skipEqualKey; + ASSERT(rep.data.size() == 2 || !more); + } + + int index = distance-1; + if (skipEqualKey && rep.data.size() && rep.data[0].key == sel.getKey() ) + ++index; + + if (index < rep.data.size()) { + *pOffset = 0; + return rep.data[ index ].key; + } else { + // FIXME: If range.begin=="" && !forward, return success? + *pOffset = index - rep.data.size() + 1; + if (!forward) *pOffset = -*pOffset; + + if (more) { + TEST(true); // Key selector read range had more results + + ASSERT(rep.data.size()); + Key returnKey = forward ? keyAfter(rep.data.back().key) : rep.data.back().key; + + //This is possible if key/value pairs are very large and only one result is returned on a last less than query + //SOMEDAY: graceful handling of exceptionally sized values + ASSERT(returnKey != sel.getKey()); + + return returnKey; + } else + return forward ? range.end : range.begin; + } +} + +KeyRange getCachedKeyRange( StorageCacheData* data, const KeySelectorRef& sel ) +// Returns largest range that is cached on this server and selectorInRange(sel, range) or wrong_shard_server if no such range exists +{ + auto i = sel.isBackward() ? 
data->cachedRangeMap.rangeContainingKeyBefore( sel.getKey() ) : + data->cachedRangeMap.rangeContaining( sel.getKey() ); + if (!i->value()) throw wrong_shard_server(); + ASSERT( selectorInRange(sel, i->range()) ); + return i->range(); +} + +ACTOR Future getKeyValues( StorageCacheData* data, GetKeyValuesRequest req ) +// Throws a wrong_shard_server if the keys in the request or result depend on data outside this server OR if a large selector offset prevents +// all data from being read in one range read +{ + state int64_t resultSize = 0; + + ++data->counters.getRangeQueries; + ++data->counters.allQueries; + //++data->readQueueSizeMetric; + //data->maxQueryQueue = std::max( data->maxQueryQueue, data->counters.allQueries.getValue() - data->counters.finishedQueries.getValue()); + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + TaskPriority taskType = TaskPriority::DefaultEndpoint; + if (SERVER_KNOBS->FETCH_KEYS_LOWER_PRIORITY && req.isFetchKeys) { + taskType = TaskPriority::FetchKeys; + // } else if (false) { + // // Placeholder for up-prioritizing fetches for important requests + // taskType = TaskPriority::DefaultDelay; + } + wait( delay(0, taskType) ); + + try { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Before"); + state Version version = wait( waitForVersion( data, req.version ) ); + + try { + state KeyRange cachedKeyRange = getCachedKeyRange( data, req.begin ); + + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterVersion"); + //.detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end); + } catch (Error& e) { TraceEvent("WrongShardServer", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("Shard", "None").detail("In", 
"getKeyValues>getShardKeyRange"); throw e; } + + if ( !selectorInRange(req.end, cachedKeyRange) && !(req.end.isFirstGreaterOrEqual() && req.end.getKey() == cachedKeyRange.end) ) { +// TraceEvent("WrongShardServer1", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkShardExtents"); + throw wrong_shard_server(); + } + + state int offset1; + state int offset2; + state Key begin = req.begin.isFirstGreaterOrEqual() ? req.begin.getKey() : findKey( data, req.begin, version, cachedKeyRange, &offset1 ); + state Key end = req.end.isFirstGreaterOrEqual() ? req.end.getKey() : findKey( data, req.end, version, cachedKeyRange, &offset2 ); + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterKeys"); + //.detail("Off1",offset1).detail("Off2",offset2).detail("ReqBegin",req.begin.getKey()).detail("ReqEnd",req.end.getKey()); + + // Offsets of zero indicate begin/end keys in this cachedKeyRange, which obviously means we can answer the query + // An end offset of 1 is also OK because the end key is exclusive, so if the first key of the next cachedKeyRange is the end the last actual key returned must be from this cachedKeyRange. + // A begin offset of 1 is also OK because then either begin is past end or equal to end (so the result is definitely empty) + if ((offset1 && offset1!=1) || (offset2 && offset2!=1)) { + TEST(true); // wrong_cache_server due to offset + // We could detect when offset1 takes us off the beginning of the database or offset2 takes us off the end, and return a clipped range rather + // than an error (since that is what the NativeAPI.getRange will do anyway via its "slow path"), but we would have to add some flags to the response + // to encode whether we went off the beginning and the end, since it needs that information. 
+ //TraceEvent("WrongShardServer2", data->thisServerID).detail("Begin", req.begin.toString()).detail("End", req.end.toString()).detail("Version", version).detail("ShardBegin", shard.begin).detail("ShardEnd", shard.end).detail("In", "getKeyValues>checkOffsets").detail("BeginKey", begin).detail("EndKey", end).detail("BeginOffset", offset1).detail("EndOffset", offset2); + throw wrong_shard_server(); + } + + if (begin >= end) { + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.Send"); + //.detail("Begin",begin).detail("End",end); + + GetKeyValuesReply none; + none.version = version; + none.more = false; + req.reply.send( none ); + } else { + state int remainingLimitBytes = req.limitBytes; + + GetKeyValuesReply _r = readRange(data, version, KeyRangeRef(begin, end), req.limit, &remainingLimitBytes); + GetKeyValuesReply r = _r; + + if( req.debugID.present() ) + g_traceBatch.addEvent("TransactionDebug", req.debugID.get().first(), "storagecache.getKeyValues.AfterReadRange"); + //.detail("Begin",begin).detail("End",end).detail("SizeOf",r.data.size()); + if (EXPENSIVE_VALIDATION) { + for (int i = 0; i < r.data.size(); i++) + ASSERT(r.data[i].key >= begin && r.data[i].key < end); + ASSERT(r.data.size() <= std::abs(req.limit)); + } + + req.reply.send( r ); + + resultSize = req.limitBytes - remainingLimitBytes; + data->counters.bytesQueried += resultSize; + data->counters.rowsQueried += r.data.size(); + } + } catch (Error& e) { + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +ACTOR Future getKey( StorageCacheData* data, GetKeyRequest req ) { + state int64_t resultSize = 0; + + ++data->counters.getKeyQueries; + ++data->counters.allQueries; + + // Active load balancing runs at a very high priority (to obtain accurate queue lengths) + // so we need to downgrade here + wait( delay(0, TaskPriority::DefaultEndpoint) ); + + try { + 
state Version version = wait( waitForVersion( data, req.version ) ); + state KeyRange cachedKeyRange = getCachedKeyRange( data, req.sel ); + + state int offset; + Key k = findKey( data, req.sel, version, cachedKeyRange, &offset ); + + KeySelector updated; + if (offset < 0) + updated = firstGreaterOrEqual(k)+offset; // first thing on this shard OR (large offset case) smallest key retrieved in range read + else if (offset > 0) + updated = firstGreaterOrEqual(k)+offset-1; // first thing on next shard OR (large offset case) keyAfter largest key retrieved in range read + else + updated = KeySelectorRef(k,true,0); //found + + resultSize = k.size(); + data->counters.bytesQueried += resultSize; + ++data->counters.rowsQueried; + + GetKeyReply reply(updated); + req.reply.send(reply); + } + catch (Error& e) { + if (e.code() == error_code_wrong_shard_server) TraceEvent("WrongShardServer").detail("In","getKey"); + if(!canReplyWith(e)) + throw; + req.reply.sendError(e); + } + + ++data->counters.finishedQueries; + + return Void(); +} + +#pragma endregion + +bool expandMutation( MutationRef& m, StorageCacheData::VersionedData const& data, KeyRef eagerTrustedEnd, Arena& ar ) { + // After this function call, m should be copied into an arena immediately (before modifying data, shards, or eager) + if (m.type == MutationRef::ClearRange) { + // Expand the clear + const auto& d = data.atLatest(); + + // If another clear overlaps the beginning of this one, engulf it + auto i = d.lastLess(m.param1); + if (i && i->isClearTo() && i->getEndKey() >= m.param1) + m.param1 = i.key(); + + // If another clear overlaps the end of this one, engulf it; otherwise expand + i = d.lastLessOrEqual(m.param2); + if (i && i->isClearTo() && i->getEndKey() >= m.param2) { + m.param2 = i->getEndKey(); + } else { + // Expand to the next set or clear (from storage or latestVersion), and if it + // is a clear, engulf it as well + i = d.lower_bound(m.param2); + //KeyRef endKeyAtStorageVersion = m.param2 == 
eagerTrustedEnd ? eagerTrustedEnd : std::min( eager->getKeyEnd( m.param2 ), eagerTrustedEnd ); + // TODO check if the following is correct + KeyRef endKeyAtStorageVersion = eagerTrustedEnd; + if (!i || endKeyAtStorageVersion < i.key()) + m.param2 = endKeyAtStorageVersion; + else if (i->isClearTo()) + m.param2 = i->getEndKey(); + else + m.param2 = i.key(); + } + } + else if (m.type != MutationRef::SetValue && (m.type)) { + + Optional oldVal; + auto it = data.atLatest().lastLessOrEqual(m.param1); + if (it != data.atLatest().end() && it->isValue() && it.key() == m.param1) + oldVal = it->getValue(); + else if (it != data.atLatest().end() && it->isClearTo() && it->getEndKey() > m.param1) { + TEST(true); // Atomic op right after a clear. + } + + switch(m.type) { + case MutationRef::AddValue: + m.param2 = doLittleEndianAdd(oldVal, m.param2, ar); + break; + case MutationRef::And: + m.param2 = doAnd(oldVal, m.param2, ar); + break; + case MutationRef::Or: + m.param2 = doOr(oldVal, m.param2, ar); + break; + case MutationRef::Xor: + m.param2 = doXor(oldVal, m.param2, ar); + break; + case MutationRef::AppendIfFits: + m.param2 = doAppendIfFits(oldVal, m.param2, ar); + break; + case MutationRef::Max: + m.param2 = doMax(oldVal, m.param2, ar); + break; + case MutationRef::Min: + m.param2 = doMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMin: + m.param2 = doByteMin(oldVal, m.param2, ar); + break; + case MutationRef::ByteMax: + m.param2 = doByteMax(oldVal, m.param2, ar); + break; + case MutationRef::MinV2: + m.param2 = doMinV2(oldVal, m.param2, ar); + break; + case MutationRef::AndV2: + m.param2 = doAndV2(oldVal, m.param2, ar); + break; + case MutationRef::CompareAndClear: + if (oldVal.present() && m.param2 == oldVal.get()) { + m.type = MutationRef::ClearRange; + m.param2 = keyAfter(m.param1, ar); + return expandMutation(m, data, eagerTrustedEnd, ar); + } + return false; + } + m.type = MutationRef::SetValue; + } + + return true; +} + +// Applies a write mutation 
(SetValue or ClearRange) to the in-memory versioned data structure +void applyMutation( StorageCacheData *self, MutationRef const& m, Arena& arena, StorageCacheData::VersionedData &data ) { + // m is expected to be in arena already + // Clear split keys are added to arena + + if (m.type == MutationRef::SetValue) { + auto prev = data.atLatest().lastLessOrEqual(m.param1); + if (prev && prev->isClearTo() && prev->getEndKey() > m.param1) { + ASSERT( prev.key() <= m.param1 ); + KeyRef end = prev->getEndKey(); + // TODO double check if the insert version of the previous clear needs to be preserved for the "left half", + // insert() invalidates prev, so prev.key() is not safe to pass to it by reference + data.insert( KeyRef(prev.key()), ValueOrClearToRef::clearTo( m.param1 ), prev.insertVersion() ); // overwritten by below insert if empty + KeyRef nextKey = keyAfter(m.param1, arena); + if ( end != nextKey ) { + ASSERT( end > nextKey ); + // TODO double check if it's okay to let go of the the insert version of the "right half" + // FIXME: This copy is technically an asymptotic problem, definitely a waste of memory (copy of keyAfter is a waste, but not asymptotic) + data.insert( nextKey, ValueOrClearToRef::clearTo( KeyRef(arena, end) ) ); + } + } + data.insert( m.param1, ValueOrClearToRef::value(m.param2) ); + } else if (m.type == MutationRef::ClearRange) { + data.erase( m.param1, m.param2 ); + ASSERT( m.param2 > m.param1 ); + ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) ); + data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) ); + } +} + +template +void splitMutation(StorageCacheData* data, KeyRangeMap& map, MutationRef const& m, Version ver) { + if(isSingleKeyMutation((MutationRef::Type) m.type)) { + auto i = map.rangeContaining(m.param1); + if (i->value()) // If this key lies in the cached key-range on this server + data->addMutation( i->range(), ver, m ); + } + else if (m.type == MutationRef::ClearRange) { + KeyRangeRef mKeys( m.param1, m.param2 
); + auto r = map.intersectingRanges( mKeys ); + for(auto i = r.begin(); i != r.end(); ++i) { + if (i->value()) { // if this sub-range exists on this cache server + KeyRangeRef k = mKeys & i->range(); + data->addMutation( i->range(), ver, MutationRef((MutationRef::Type)m.type, k.begin, k.end) ); + } + } + } else + ASSERT(false); // Unknown mutation type in splitMutations +} + +void StorageCacheData::addMutation(KeyRangeRef const& cachedKeyRange, Version version, MutationRef const& mutation) { + MutationRef expanded = mutation; + auto& mLog = addVersionToMutationLog(version); + + if ( !expandMutation( expanded, data(), cachedKeyRange.end, mLog.arena()) ) { + return; + } + expanded = addMutationToMutationLog(mLog, expanded); + if (debugMutation("expandedMutation", version, expanded)) { + const char* type = + mutation.type == MutationRef::SetValue ? "SetValue" : + mutation.type == MutationRef::ClearRange ? "ClearRange" : + mutation.type == MutationRef::DebugKeyRange ? "DebugKeyRange" : + mutation.type == MutationRef::DebugKey ? "DebugKey" : + "UnknownMutation"; + printf("DEBUGMUTATION:\t%.6f\t%s\t%s\t%s\t%s\t%s\n", + now(), g_network->getLocalAddress().toString().c_str(), "originalMutation", + type, printable(mutation.param1).c_str(), printable(mutation.param2).c_str()); + printf(" Cached Key-range: %s - %s\n", printable(cachedKeyRange.begin).c_str(), printable(cachedKeyRange.end).c_str()); + } + applyMutation( this, expanded, mLog.arena(), mutableData() ); + printf("\nSCUpdate: Printing versioned tree after applying mutation\n"); + mutableData().printTree(version); + +} + +// Helper class for updating the storage cache (i.e. 
applying mutations) +class StorageCacheUpdater { +public: + StorageCacheUpdater() : currentVersion(invalidVersion), processedCacheStartKey(false) {} + StorageCacheUpdater(Version currentVersion) : currentVersion(currentVersion), processedCacheStartKey(false) {} + + void applyMutation(StorageCacheData* data, MutationRef const& m , Version ver) { + //TraceEvent("SCNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver); + + if(currentVersion != ver) { + currentVersion = ver; + data->mutableData().createNewVersion(ver); + } + + if (m.param1.startsWith( systemKeys.end )) { + //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); + applyPrivateCacheData( data, m ); + } else { + // FIXME: enable when debugMutation is active + //for(auto m = changes[c].mutations.begin(); m; ++m) { + // debugMutation("SCUpdateMutation", changes[c].version, *m); + //} + + splitMutation(data, data->cachedRangeMap, m, ver); + } + + //TODO + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + } + + Version currentVersion; +private: + KeyRef cacheStartKey; + bool nowAssigned; + bool processedCacheStartKey; + + // Applies private mutations, as the name suggests. It basically establishes the key-ranges + //that this cache server is responsible for + // TODO Revisit during failure handling. Might we lose some private mutations? + void applyPrivateCacheData( StorageCacheData* data, MutationRef const& m ) { + TraceEvent(SevDebug, "SCPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString()); + + if (processedCacheStartKey) { + // we expect changes in pairs, [begin,end). 
This mutation is for end key of the range + ASSERT (m.type == MutationRef::SetValue && m.param1.startsWith(data->ck)); + KeyRangeRef keys( cacheStartKey.removePrefix(data->ck), m.param1.removePrefix(data->ck)); + data->cachedRangeMap.insert(keys, true); + fprintf(stderr, "SCPrivateCacheMutation: begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str()); + + processedCacheStartKey = false; + } else if (m.type == MutationRef::SetValue && m.param1.startsWith( data->ck )) { + // We expect changes in pairs, [begin,end), This mutation is for start key of the range + cacheStartKey = m.param1; + processedCacheStartKey = true; + } else { + fprintf(stderr, "SCPrivateCacheMutation: Unknown private mutation\n"); + ASSERT(false); // Unknown private mutation + } + } +}; + +// Compacts the in-memory VersionedMap, i.e. removes versions below the desiredOldestVersion +// TODO revisit if we change the data structure +ACTOR Future compactCache(StorageCacheData* data) { + loop { + //TODO understand this, should we add delay here? + //if (g_network->isSimulated()) { + // double endTime = g_simulator.checkDisabled(format("%s/compactCache", data->thisServerID.toString().c_str())); + // if(endTime > now()) { + // wait(delay(endTime - now(), TaskPriority::CompactCache)); + // } + //} + + // Wait until the desiredOldestVersion is greater than the current oldestVersion + wait( data->desiredOldestVersion.whenAtLeast( data->oldestVersion.get()+1 ) ); + wait( delay(0, TaskPriority::CompactCache) ); + + //TODO not really in use as of now. may need in some failure cases. Revisit and remove if no plausible use + state Promise compactionInProgress; + data->compactionInProgress = compactionInProgress.getFuture(); + state Version oldestVersion = data->oldestVersion.get(); + state Version desiredVersion = data->desiredOldestVersion.get(); + // Call the compaction routine that does the actual work, + // TODO It's a synchronous function call as of now. Should it asynch? 
+ data->mutableData().compact(desiredVersion); + Future finishedForgetting = data->mutableData().forgetVersionsBeforeAsync( desiredVersion, + TaskPriority::CompactCache ); + data->oldestVersion.set( desiredVersion ); + wait( finishedForgetting ); + // TODO how do we yield here? This may not be enough, because compact() does the heavy lifting + // of compacting the VersionedMap. We should probably look into per version compaction and then + // we can yield after compacting one version + wait( yield(TaskPriority::CompactCache) ); + + // TODO what flowlock to acquire during compaction? + compactionInProgress.send(Void()); + wait( delay(0, TaskPriority::CompactCache) ); //Setting compactionInProgress could cause the cache server to shut down, so delay to check for cancellation + } +} + +ACTOR Future pullAsyncData( StorageCacheData *data ) { + state Future dbInfoChange = Void(); + state Reference r; + state Version tagAt = 0; + + state StorageCacheUpdater updater(data->lastVersionWithData); + state Version ver = invalidVersion; + //data->lastTLogVersion = r->getMaxKnownVersion(); + //data->versionLag = std::max(0, data->lastTLogVersion - data->version.get()); + ++data->counters.updateBatches; + + loop { + loop { + choose { + when(wait( r ? 
r->getMore(TaskPriority::TLogCommit) : Never() ) ) { + break; + } + when( wait( dbInfoChange ) ) { + if( data->logSystem->get() ) + r = data->logSystem->get()->peek( data->thisServerID, tagAt, Optional(), cacheTag, true ); + else + r = Reference(); + dbInfoChange = data->logSystem->onChange(); + } + } + } + //FIXME: if the popped version is greater than our last version, we need to clear the cache + + //FIXME: ensure this can only read data from the current version + r->setProtocolVersion(currentProtocolVersion); + + // Now process the mutations + for (; r->hasMessage(); r->nextMessage()) { + ArenaReader& reader = *r->reader(); + + MutationRef msg; + reader >> msg; + fprintf(stderr, "%lld : %s\n", r->version().version, msg.toString().c_str()); + + if (r->version().version > ver && r->version().version > data->version.get()) { + ++data->counters.updateVersions; + ver = r->version().version; + } + if (ver != invalidVersion) // This change belongs to a version < minVersion + { + updater.applyMutation(data, msg, ver); + // TODO + //mutationBytes += msg.totalSize(); + data->counters.mutationBytes += msg.totalSize(); + ++data->counters.mutations; + switch(msg.type) { + case MutationRef::SetValue: + ++data->counters.setMutations; + break; + case MutationRef::ClearRange: + ++data->counters.clearRangeMutations; + break; + case MutationRef::AddValue: + case MutationRef::And: + case MutationRef::AndV2: + case MutationRef::AppendIfFits: + case MutationRef::ByteMax: + case MutationRef::ByteMin: + case MutationRef::Max: + case MutationRef::Min: + case MutationRef::MinV2: + case MutationRef::Or: + case MutationRef::Xor: + case MutationRef::CompareAndClear: + ++data->counters.atomicMutations; + break; + } + } + else + TraceEvent(SevError, "DiscardingPeekedData", data->thisServerID).detail("Mutation", msg.toString()).detail("Version", r->version().toString()); + + tagAt = r->version().version + 1; + } + + if(ver != invalidVersion) { + data->lastVersionWithData = ver; + } else { + 
// TODO double check + ver = r->version().version - 1; + } + + if(ver != invalidVersion && ver > data->version.get()) { + debugKeyRange("SCUpdate", ver, allKeys); + + data->mutableData().createNewVersion(ver); + + // TODO what about otherError + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + + // TODO may enable these later + //data->noRecentUpdates.set(false); + //data->lastUpdate = now(); + data->version.set( ver ); // Triggers replies to waiting gets for new version(s) + // TODO double check + //setDataVersion(data->thisServerID, data->version.get()); + + // TODO what about otherError + if (data->otherError.getFuture().isReady()) data->otherError.getFuture().get(); + + // we can get rid of versions beyond maxVerionsInMemory at any point. Update the + //desiredOldestVersion and that may invoke the compaction actor + Version maxVersionsInMemory = SERVER_KNOBS->MAX_READ_TRANSACTION_LIFE_VERSIONS; + Version proposedOldestVersion = data->version.get() - maxVersionsInMemory; + proposedOldestVersion = std::max(proposedOldestVersion, data->oldestVersion.get()); + data->desiredOldestVersion.set(proposedOldestVersion); + } + + // TODO implement a validate function for the cache + //validate(data); + + if(r->version().version >= data->lastTLogVersion) { + if(data->behind) { + TraceEvent("StorageCacheNoLongerBehind", data->thisServerID).detail("CursorVersion", r->version().version).detail("TLogVersion", data->lastTLogVersion); + } + data->behind = false; + } + + tagAt = std::max( tagAt, r->version().version); + } +} + +ACTOR Future storageCache(StorageServerInterface ssi, uint16_t id, Reference> db) { + state StorageCacheData self(ssi.id(), id); + state ActorCollection actors(false); + state Future dbInfoChange = Void(); + + // This helps identify the private mutations meant for this cache server + self.ck = cacheKeysPrefixFor( id ).withPrefix(systemKeys.begin); // FFFF/02cacheKeys/[this server]/ + + 
actors.add(waitFailureServer(ssi.waitFailure.getFuture())); + + // compactCache actor will periodically compact the cache when certain version condityion is met + actors.add(compactCache(&self)); + + // pullAsyncData actor pulls mutations from the TLog and also applies them. + actors.add(pullAsyncData(&self)); + + loop { + ++self.counters.loops; + choose { + when( wait( dbInfoChange ) ) { + dbInfoChange = db->onChange(); + self.logSystem->set(ILogSystem::fromServerDBInfo( ssi.id(), db->get(), true )); + } + when( GetValueRequest req = waitNext(ssi.getValue.getFuture()) ) { + // TODO do we need to add throttling for cache servers? Probably not + //actors.add(self->readGuard(req , getValueQ)); + actors.add(getValueQ(&self, req)); + } + when( WatchValueRequest req = waitNext(ssi.watchValue.getFuture()) ) { + ASSERT(false); + } + when (GetKeyRequest req = waitNext(ssi.getKey.getFuture())) { + actors.add(getKey(&self, req)); + } + when (GetKeyValuesRequest req = waitNext(ssi.getKeyValues.getFuture()) ) { + actors.add(getKeyValues(&self, req)); + } + when (GetShardStateRequest req = waitNext(ssi.getShardState.getFuture()) ) { + ASSERT(false); + } + when (StorageQueuingMetricsRequest req = waitNext(ssi.getQueuingMetrics.getFuture())) { + ASSERT(false); + } + //when( ReplyPromise reply = waitNext(ssi.getVersion.getFuture()) ) { + // ASSERT(false); + //} + when( ReplyPromise reply = waitNext(ssi.getKeyValueStoreType.getFuture()) ) { + ASSERT(false); + } + when(wait(actors.getResult())) {} + } + } +} diff --git a/fdbserver/TLogServer.actor.cpp b/fdbserver/TLogServer.actor.cpp index 39d39f7e5a..d794633905 100644 --- a/fdbserver/TLogServer.actor.cpp +++ b/fdbserver/TLogServer.actor.cpp @@ -959,6 +959,81 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { + if (self->ignorePopRequest) { + TraceEvent(SevDebug, 
"IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); + + if (self->toBePopped.find(inputTag) == self->toBePopped.end() + || to > self->toBePopped[inputTag]) { + self->toBePopped[inputTag] = to; + } + // add the pop to the toBePopped map + TraceEvent(SevDebug, "IgnoringPopRequest") + .detail("IgnorePopDeadline", self->ignorePopDeadline) + .detail("Tag", inputTag.toString()) + .detail("Version", to); + return Void(); + } + state Version upTo = to; + int8_t tagLocality = inputTag.locality; + if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { + upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); + tagLocality = tagLocalityLogRouter; + } + state Tag tag(tagLocality, inputTag.id); + auto tagData = logData->getTagData(tag); + if (!tagData) { + tagData = logData->createTagData(tag, upTo, true, true, false); + } else if (upTo > tagData->popped) { + tagData->popped = upTo; + tagData->poppedRecently = true; + tagData->requiresPoppedLocationUpdate = true; + + if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { + tagData->unpoppedRecovered = false; + logData->unpoppedRecoveredTags--; + TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); + if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { + logData->recoveryComplete.send(Void()); + } + } + + if (upTo > logData->persistentDataDurableVersion) + wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); + //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); + } + return Void(); +} + +ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ) { + // timeout check for ignorePopRequest + 
if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { + + TraceEvent("EnableTLogPlayAllIgnoredPops"); + // use toBePopped and issue all the pops + std::map::iterator it; + vector> ignoredPops; + self->ignorePopRequest = false; + self->ignorePopUid = ""; + self->ignorePopDeadline = 0.0; + for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { + TraceEvent("PlayIgnoredPop") + .detail("Tag", it->first.toString()) + .detail("Version", it->second); + ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); + } + self->toBePopped.clear(); + wait(waitForAll(ignoredPops)); + TraceEvent("ResetIgnorePopRequest") + .detail("Now", g_network->now()) + .detail("IgnorePopRequest", self->ignorePopRequest) + .detail("IgnorePopDeadline", self->ignorePopDeadline); + } + wait(tLogPopCore(self, req.tag, req.to, logData)); + req.reply.send(Void()); + return Void(); +} + // This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources. // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important // work (e.g. commits). @@ -978,6 +1053,26 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; + //FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries. + // It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen. 
+ Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { @@ -1208,81 +1303,6 @@ std::deque> & getVersionMessages( Re return tagData->versionMessages; }; -ACTOR Future tLogPopCore( TLogData* self, Tag inputTag, Version to, Reference logData ) { - if (self->ignorePopRequest) { - TraceEvent(SevDebug, "IgnoringPopRequest").detail("IgnorePopDeadline", self->ignorePopDeadline); - - if (self->toBePopped.find(inputTag) == self->toBePopped.end() - || to > self->toBePopped[inputTag]) { - self->toBePopped[inputTag] = to; - } - // add the pop to the toBePopped map - TraceEvent(SevDebug, "IgnoringPopRequest") - .detail("IgnorePopDeadline", self->ignorePopDeadline) - .detail("Tag", inputTag.toString()) - .detail("Version", to); - return Void(); - } - state Version upTo = to; - int8_t tagLocality = inputTag.locality; - if (logData->logSystem->get().isValid() && logData->logSystem->get()->isPseudoLocality(tagLocality)) { - upTo = logData->logSystem->get()->popPseudoLocalityTag(tagLocality, to); - tagLocality = tagLocalityLogRouter; - } - state Tag tag(tagLocality, inputTag.id); - auto tagData = logData->getTagData(tag); - if (!tagData) { - tagData = logData->createTagData(tag, upTo, true, true, false); - } else if (upTo > tagData->popped) { - tagData->popped = upTo; - 
tagData->poppedRecently = true; - tagData->requiresPoppedLocationUpdate = true; - - if(tagData->unpoppedRecovered && upTo > logData->recoveredAt) { - tagData->unpoppedRecovered = false; - logData->unpoppedRecoveredTags--; - TraceEvent("TLogPoppedTag", logData->logId).detail("Tags", logData->unpoppedRecoveredTags).detail("Tag", tag.toString()).detail("DurableKCVer", logData->durableKnownCommittedVersion).detail("RecoveredAt", logData->recoveredAt); - if(logData->unpoppedRecoveredTags == 0 && logData->durableKnownCommittedVersion >= logData->recoveredAt && logData->recoveryComplete.canBeSet()) { - logData->recoveryComplete.send(Void()); - } - } - - if (upTo > logData->persistentDataDurableVersion) - wait(tagData->eraseMessagesBefore(upTo, self, logData, TaskPriority::TLogPop)); - //TraceEvent("TLogPop", self->dbgid).detail("Tag", tag.toString()).detail("To", upTo); - } - return Void(); -} - -ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ) { - // timeout check for ignorePopRequest - if (self->ignorePopRequest && (g_network->now() > self->ignorePopDeadline)) { - - TraceEvent("EnableTLogPlayAllIgnoredPops"); - // use toBePopped and issue all the pops - std::map::iterator it; - vector> ignoredPops; - self->ignorePopRequest = false; - self->ignorePopUid = ""; - self->ignorePopDeadline = 0.0; - for (it = self->toBePopped.begin(); it != self->toBePopped.end(); it++) { - TraceEvent("PlayIgnoredPop") - .detail("Tag", it->first.toString()) - .detail("Version", it->second); - ignoredPops.push_back(tLogPopCore(self, it->first, it->second, logData)); - } - self->toBePopped.clear(); - wait(waitForAll(ignoredPops)); - TraceEvent("ResetIgnorePopRequest") - .detail("Now", g_network->now()) - .detail("IgnorePopRequest", self->ignorePopRequest) - .detail("IgnorePopDeadline", self->ignorePopDeadline); - } - wait(tLogPopCore(self, req.tag, req.to, logData)); - req.reply.send(Void()); - return Void(); -} - void peekMessagesFromMemory( Reference self, 
TLogPeekRequest const& req, BinaryWriter& messages, Version& endVersion ) { ASSERT( !messages.getLength() ); diff --git a/fdbserver/TagPartitionedLogSystem.actor.cpp b/fdbserver/TagPartitionedLogSystem.actor.cpp index 89c0e9a71d..68b3e21b24 100644 --- a/fdbserver/TagPartitionedLogSystem.actor.cpp +++ b/fdbserver/TagPartitionedLogSystem.actor.cpp @@ -459,7 +459,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { lastBegin = std::max(lastBegin, log->startVersion); localSets.push_back(log); if(log->locality != tagLocalitySatellite) { @@ -486,7 +486,7 @@ struct TagPartitionedLogSystem : ILogSystem, ReferenceCountedisLocal && log->logServers.size() && (log->locality == tagLocalitySpecial || log->locality == tagLocalityUpgraded || log->locality == tag.locality || - tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || (tag.locality == tagLocalityUpgraded && log->locality != tagLocalitySatellite))) { + tag == txsTag || tag.locality == tagLocalityTxs || tag.locality == tagLocalityLogRouter || ((tag.locality == tagLocalityUpgraded || tag == cacheTag) && log->locality != tagLocalitySatellite))) { thisBegin = std::max(thisBegin, log->startVersion); localOldSets.push_back(log); if(log->locality != tagLocalitySatellite) { diff --git a/fdbserver/WorkerInterface.actor.h b/fdbserver/WorkerInterface.actor.h index c50ffde07f..b7b5a07fca 100644 --- a/fdbserver/WorkerInterface.actor.h +++ 
b/fdbserver/WorkerInterface.actor.h @@ -386,6 +386,7 @@ struct Role { static const Role LOG_ROUTER; static const Role DATA_DISTRIBUTOR; static const Role RATEKEEPER; + static const Role STORAGE_CACHE; static const Role COORDINATOR; std::string roleName; @@ -455,6 +456,7 @@ ACTOR Future logRouter(TLogInterface interf, InitializeLogRouterRequest re Reference> db); ACTOR Future dataDistributor(DataDistributorInterface ddi, Reference> db); ACTOR Future ratekeeper(RatekeeperInterface rki, Reference> db); +ACTOR Future storageCache(StorageServerInterface interf, uint16_t id, Reference> db); void registerThreadForProfiling(); void updateCpuProfiler(ProfilerRequest req); diff --git a/fdbserver/fdbserver.vcxproj b/fdbserver/fdbserver.vcxproj index 783bcb160c..70eb919936 100644 --- a/fdbserver/fdbserver.vcxproj +++ b/fdbserver/fdbserver.vcxproj @@ -54,6 +54,7 @@ + diff --git a/fdbserver/fdbserver.vcxproj.filters b/fdbserver/fdbserver.vcxproj.filters index 348278eea7..92b8df76e3 100644 --- a/fdbserver/fdbserver.vcxproj.filters +++ b/fdbserver/fdbserver.vcxproj.filters @@ -197,6 +197,7 @@ workloads + diff --git a/fdbserver/masterserver.actor.cpp b/fdbserver/masterserver.actor.cpp index 56e6bf1cbc..bfc8eb6a15 100644 --- a/fdbserver/masterserver.actor.cpp +++ b/fdbserver/masterserver.actor.cpp @@ -684,6 +684,9 @@ ACTOR Future readTransactionSystemState( Reference self, Refer Standalone> rawTags = wait( self->txnStateStore->readRange( serverTagKeys ) ); self->allTags.clear(); + if(self->lastEpochEnd > 0) { + self->allTags.push_back(cacheTag); + } if(self->forceRecovery) { self->safeLocality = oldLogSystem->getLogSystemConfig().tLogs[0].locality; @@ -1345,6 +1348,15 @@ ACTOR Future masterCore( Reference self ) { tr.set(recoveryCommitRequest.arena, coordinatorsKey, self->coordinators.ccf->getConnectionString().toString()); tr.set(recoveryCommitRequest.arena, logsKey, self->logSystem->getLogsValue()); tr.set(recoveryCommitRequest.arena, primaryDatacenterKey, 
self->myInterface.locality.dcId().present() ? self->myInterface.locality.dcId().get() : StringRef()); + + //FIXME: remove this code, caching the entire normal keyspace as a test of functionality + //TODO: caching disabled for this merge + //tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.begin), storageCacheValue({0})); + //tr.set(recoveryCommitRequest.arena, storageCacheKey(normalKeys.end), storageCacheValue({})); + //tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.begin), serverKeysTrue); + //tr.set(recoveryCommitRequest.arena, cacheKeysKey(0, normalKeys.end), serverKeysFalse); + //tr.set(recoveryCommitRequest.arena, cacheChangeKeyFor(0), BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned())); + //tr.set(recoveryCommitRequest.arena, cacheChangeKey, BinaryWriter::toValue(deterministicRandom()->randomUniqueID(),Unversioned())); tr.clear(recoveryCommitRequest.arena, tLogDatacentersKeys); for(auto& dc : self->primaryDcId) { @@ -1356,7 +1368,7 @@ ACTOR Future masterCore( Reference self ) { } } - applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, NULL, NULL); + applyMetadataMutations(self->dbgid, recoveryCommitRequest.arena, tr.mutations.slice(mmApplied, tr.mutations.size()), self->txnStateStore, nullptr, nullptr); mmApplied = tr.mutations.size(); tr.read_snapshot = self->recoveryTransactionVersion; // lastEpochEnd would make more sense, but isn't in the initial window of the resolver(s) diff --git a/fdbserver/storageserver.actor.cpp b/fdbserver/storageserver.actor.cpp index 353ecce677..c72b3829fc 100644 --- a/fdbserver/storageserver.actor.cpp +++ b/fdbserver/storageserver.actor.cpp @@ -75,25 +75,6 @@ inline bool canReplyWith(Error e) { }; } -struct StorageServer; -class ValueOrClearToRef { -public: - static ValueOrClearToRef value(ValueRef const& v) { return ValueOrClearToRef(v, false); } - static ValueOrClearToRef clearTo(KeyRef 
const& k) { return ValueOrClearToRef(k, true); } - - bool isValue() const { return !isClear; }; - bool isClearTo() const { return isClear; } - - ValueRef const& getValue() const { ASSERT( isValue() ); return item; }; - KeyRef const& getEndKey() const { ASSERT(isClearTo()); return item; }; - -private: - ValueOrClearToRef( StringRef item, bool isClear ) : item(item), isClear(isClear) {} - - StringRef item; - bool isClear; -}; - struct AddingShard : NonCopyable { KeyRange keys; Future fetchClient; // holds FetchKeys() actor @@ -390,6 +371,8 @@ public: KeyRangeMap< Reference > shards; uint64_t shardChangeCounter; // max( shards->changecounter ) + KeyRangeMap cachedRangeMap; // indicates if a key-range is being cached + // newestAvailableVersion[k] // == invalidVersion -> k is unavailable at all versions // <= storageVersion -> k is unavailable at all versions (but might be read anyway from storage if we are in the process of committing makeShardDurable) @@ -1085,7 +1068,6 @@ void merge( Arena& arena, VectorRef& output ASSERT( output.size() <= originalLimit ); } -// readRange reads up to |limit| rows from the given range and version, combining data->storage and data->versionedData. // If limit>=0, it returns the first rows in the range (sorted ascending), otherwise the last rows (sorted descending). 
// readRange has O(|result|) + O(log |data|) cost ACTOR Future readRange( StorageServer* data, Version version, KeyRange range, int limit, int* pLimitBytes ) { @@ -1103,6 +1085,12 @@ ACTOR Future readRange( StorageServer* data, Version version, //state int originalLimitBytes = *pLimitBytes; //state bool track = rrid.first() == 0x1bc134c2f752187cLL; + // Check if the desired key-range intersects the cached key-ranges + // TODO Find a more efficient way to do it + // TODO Also need this check in single key/value lookup + auto cached = data->cachedRangeMap.intersectingRanges(range); + result.cached = (cached.begin() != cached.end()); + // FIXME: Review pLimitBytes behavior // if (limit >= 0) we are reading forward, else backward @@ -1279,10 +1267,10 @@ ACTOR Future readRange( StorageServer* data, Version version, return result; } -bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { +//bool selectorInRange( KeySelectorRef const& sel, KeyRangeRef const& range ) { // Returns true if the given range suffices to at least begin to resolve the given KeySelectorRef - return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); -} +// return sel.getKey() >= range.begin && (sel.isBackward() ? sel.getKey() <= range.end : sel.getKey() < range.end); +//} ACTOR Future findKey( StorageServer* data, KeySelectorRef sel, Version version, KeyRange range, int* pOffset) // Attempts to find the key indicated by sel in the data at version, within range. 
@@ -1774,11 +1762,6 @@ bool expandMutation( MutationRef& m, StorageServer::VersionedData const& data, U return true; } -bool isClearContaining( StorageServer::VersionedData::ViewAtVersion const& view, KeyRef key ) { - auto i = view.lastLessOrEqual(key); - return i && i->isClearTo() && i->getEndKey() > key; -} - void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, StorageServer::VersionedData &data ) { // m is expected to be in arena already // Clear split keys are added to arena @@ -1808,7 +1791,7 @@ void applyMutation( StorageServer *self, MutationRef const& m, Arena& arena, Sto } else if (m.type == MutationRef::ClearRange) { data.erase( m.param1, m.param2 ); ASSERT( m.param2 > m.param1 ); - ASSERT( !isClearContaining( data.atLatest(), m.param1 ) ); + ASSERT( !data.isClearContaining( data.atLatest(), m.param1 ) ); data.insert( m.param1, ValueOrClearToRef::clearTo(m.param2) ); self->watches.triggerRange( m.param1, m.param2 ); } @@ -2463,6 +2446,8 @@ void StorageServer::addMutation(Version version, MutationRef const& mutation, Ke printf(" eager: %s\n", printable( eagerReads->getKeyEnd( mutation.param2 ) ).c_str() ); } applyMutation( this, expanded, mLog.arena(), mutableData() ); + //printf("\nSSUpdate: Printing versioned tree after applying mutation\n"); + //mutableData().printTree(version); } struct OrderByVersion { @@ -2492,8 +2477,8 @@ static const KeyRef persistPrimaryLocality = LiteralStringRef( PERSIST_PREFIX "P class StorageUpdater { public: - StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false) {} - StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false) {} + StorageUpdater() : fromVersion(invalidVersion), currentVersion(invalidVersion), restoredVersion(invalidVersion), processedStartKey(false), processedCacheStartKey(false) {} + 
StorageUpdater(Version fromVersion, Version restoredVersion) : fromVersion(fromVersion), currentVersion(fromVersion), restoredVersion(restoredVersion), processedStartKey(false), processedCacheStartKey(false) {} void applyMutation(StorageServer* data, MutationRef const& m, Version ver) { //TraceEvent("SSNewVersion", data->thisServerID).detail("VerWas", data->mutableData().latestVersion).detail("ChVer", ver); @@ -2505,8 +2490,12 @@ public: } if (m.param1.startsWith( systemKeys.end )) { - //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); - applyPrivateData( data, m ); + if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) + applyPrivateCacheData( data, m); + else { + //TraceEvent("PrivateData", data->thisServerID).detail("Mutation", m.toString()).detail("Version", ver); + applyPrivateData( data, m ); + } } else { // FIXME: enable when debugMutation is active //for(auto m = changes[c].mutations.begin(); m; ++m) { @@ -2528,6 +2517,9 @@ private: bool nowAssigned; bool processedStartKey; + KeyRef cacheStartKey; + bool processedCacheStartKey; + void applyPrivateData( StorageServer* data, MutationRef const& m ) { TraceEvent(SevDebug, "SSPrivateMutation", data->thisServerID).detail("Mutation", m.toString()); @@ -2588,6 +2580,37 @@ private: ASSERT(false); // Unknown private mutation } } + + void applyPrivateCacheData( StorageServer* data, MutationRef const& m ) { + TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Mutation", m.toString()); + + if (processedCacheStartKey) { + // Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end) + ASSERT((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)); + KeyRangeRef keys( cacheStartKey.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix ), + m.param1.removePrefix(systemKeys.begin).removePrefix( storageCachePrefix )); + 
data->cachedRangeMap.insert(keys, true); + //TraceEvent(SevDebug, "SSPrivateCacheMutation", data->thisServerID).detail("Begin", keys.begin).detail("End", keys.end); + //fprintf(stderr, "applyPrivateCacheData : begin: %s, end: %s\n", printable(keys.begin).c_str(), printable(keys.end).c_str()); + + //Figure out the affected shard ranges and maintain the cached key-range information in the in-memory map + // TODO revisit- we are not splitting the cached ranges based on shards as of now. + if (0) { + auto cachedRanges = data->shards.intersectingRanges(keys); + for(auto shard = cachedRanges.begin(); shard != cachedRanges.end(); ++shard) { + KeyRangeRef intersectingRange = shard.range() & keys; + data->cachedRangeMap.insert(KeyRangeRef(intersectingRange.begin, intersectingRange.end), true); + } + } + processedStartKey = false; + } else if ((m.type == MutationRef::SetValue) && m.param1.substr(1).startsWith(storageCachePrefix)) { + // Because of the implementation of the krm* functions, we expect changes in pairs, [begin,end) + cacheStartKey = m.param1; + processedCacheStartKey = true; + } else { + ASSERT(false); // Unknown private mutation + } + } }; ACTOR Future update( StorageServer* data, bool* pReceivedUpdate ) diff --git a/fdbserver/worker.actor.cpp b/fdbserver/worker.actor.cpp index fcc05bed66..98148540d3 100644 --- a/fdbserver/worker.actor.cpp +++ b/fdbserver/worker.actor.cpp @@ -397,23 +397,59 @@ ACTOR Future registrationClient( ProcessClass initialClass, Reference>> ddInterf, Reference>> rkInterf, - Reference> degraded) { + Reference> degraded, + PromiseStream< ErrorInfo > errors, + LocalityData locality, + Reference> dbInfo) { // Keeps the cluster controller (as it may be re-elected) informed that this worker exists // The cluster controller uses waitFailureClient to find out if we die, and returns from registrationReply (requiring us to re-register) // The registration request piggybacks optional distributor interface if it exists. 
state Generation requestGeneration = 0; state ProcessClass processClass = initialClass; + state Reference>>> scInterf( new AsyncVar>>() ); + state Future cacheProcessFuture; + state Future cacheErrorsFuture; loop { - RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), degraded->get()); + RegisterWorkerRequest request(interf, initialClass, processClass, asyncPriorityInfo->get(), requestGeneration++, ddInterf->get(), rkInterf->get(), scInterf->get(), degraded->get()); Future registrationReply = ccInterface->get().present() ? brokenPromiseToNever( ccInterface->get().get().registerWorker.getReply(request) ) : Never(); choose { when ( RegisterWorkerReply reply = wait( registrationReply )) { processClass = reply.processClass; asyncPriorityInfo->set( reply.priorityInfo ); + + if(!reply.storageCache.present()) { + cacheProcessFuture.cancel(); + scInterf->set(Optional>()); + } else if (!scInterf->get().present() || scInterf->get().get().first != reply.storageCache.get()) { + StorageServerInterface recruited; + recruited.locality = locality; + recruited.initEndpoints(); + + std::map details; + startRole( Role::STORAGE_CACHE, recruited.id(), interf.id(), details ); + + //DUMPTOKEN(recruited.getVersion); + DUMPTOKEN(recruited.getValue); + DUMPTOKEN(recruited.getKey); + DUMPTOKEN(recruited.getKeyValues); + DUMPTOKEN(recruited.getShardState); + DUMPTOKEN(recruited.waitMetrics); + DUMPTOKEN(recruited.splitMetrics); + DUMPTOKEN(recruited.getStorageMetrics); + DUMPTOKEN(recruited.waitFailure); + DUMPTOKEN(recruited.getQueuingMetrics); + DUMPTOKEN(recruited.getKeyValueStoreType); + DUMPTOKEN(recruited.watchValue); + + cacheProcessFuture = storageCache( recruited, reply.storageCache.get(), dbInfo ); + cacheErrorsFuture = forwardError(errors, Role::STORAGE_CACHE, recruited.id(), setWhenDoneOrError(cacheProcessFuture, scInterf, Optional>())); + 
scInterf->set(std::make_pair(reply.storageCache.get(), recruited)); + } } when ( wait( ccInterface->onChange() )) {} when ( wait( ddInterf->onChange() ) ) {} when ( wait( rkInterf->onChange() ) ) {} + when ( wait( scInterf->onChange() ) ) {} when ( wait( degraded->onChange() ) ) {} } } @@ -956,7 +992,7 @@ ACTOR Future workerServer( wait(waitForAll(recoveries)); recoveredDiskFiles.send(Void()); - errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded ) ); + errorForwarders.add( registrationClient( ccInterface, interf, asyncPriorityInfo, initialClass, ddInterf, rkInterf, degraded, errors, locality, dbInfo ) ); TraceEvent("RecoveriesComplete", interf.id()); @@ -1498,4 +1534,5 @@ const Role Role::TESTER("Tester", "TS"); const Role Role::LOG_ROUTER("LogRouter", "LR"); const Role Role::DATA_DISTRIBUTOR("DataDistributor", "DD"); const Role Role::RATEKEEPER("Ratekeeper", "RK"); +const Role Role::STORAGE_CACHE("StorageCache", "SC"); const Role Role::COORDINATOR("Coordinator", "CD"); diff --git a/flow/network.h b/flow/network.h index 9b5edc57f3..e479f2a597 100644 --- a/flow/network.h +++ b/flow/network.h @@ -75,6 +75,7 @@ enum class TaskPriority { DataDistribution = 3500, DiskWrite = 3010, UpdateStorage = 3000, + CompactCache = 2900, TLogSpilledPeekReply = 2800, FetchKeys = 2500, Low = 2000, diff --git a/tests/fast/CycleTest.txt b/tests/fast/CycleTest.txt index b9ce0f6a45..f01d8b1119 100644 --- a/tests/fast/CycleTest.txt +++ b/tests/fast/CycleTest.txt @@ -27,4 +27,4 @@ testTitle=Unclogged testName=Cycle transactionsPerSecond=250.0 testDuration=10.0 - expectedRate=0.80 \ No newline at end of file + expectedRate=0.80 From e1d380e8d0474453a3320e4784fd47a3d1c6514a Mon Sep 17 00:00:00 2001 From: Markus Pilman Date: Tue, 12 Nov 2019 13:12:08 -0800 Subject: [PATCH 167/184] Update documentation/tutorial/tutorial.actor.cpp Co-Authored-By: Jingyu Zhou --- documentation/tutorial/tutorial.actor.cpp | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp index bf1d5c58b0..df30da4d07 100644 --- a/documentation/tutorial/tutorial.actor.cpp +++ b/documentation/tutorial/tutorial.actor.cpp @@ -1,5 +1,6 @@ /* - * fdbcli.actor.cpp + * tutorial.actor.cpp + * * This source file is part of the FoundationDB open source project * From 7e4c4ea98ea5754965d4645893b661ca4c37b785 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 12 Nov 2019 16:28:09 -0800 Subject: [PATCH 168/184] FastRestore:Load mutations before assign ranges to appliers --- fdbclient/RestoreWorkerInterface.actor.h | 32 +++++++++++++++++-- fdbserver/RestoreLoader.actor.cpp | 40 +++++++++++++++++++----- fdbserver/RestoreLoader.actor.h | 7 +++++ fdbserver/RestoreMaster.actor.cpp | 20 ++++++++++-- fdbserver/RestoreMaster.actor.h | 2 +- 5 files changed, 88 insertions(+), 13 deletions(-) diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index cbc9500e1c..d3e4790c9f 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -47,6 +47,7 @@ struct RestoreRecruitRoleRequest; struct RestoreSysInfoRequest; struct RestoreLoadFileRequest; struct RestoreVersionBatchRequest; +struct RestoreSendMutationsToAppliersRequest; struct RestoreSendMutationVectorVersionedRequest; struct RestoreSetApplierKeyRangeVectorRequest; struct RestoreSysInfo; @@ -125,10 +126,12 @@ struct RestoreLoaderInterface : RestoreRoleInterface { RequestStream heartbeat; RequestStream updateRestoreSysInfo; + // TODO: delete setApplierKeyRangeVectorRequest because sendMutations does the job RequestStream setApplierKeyRangeVectorRequest; RequestStream loadFile; + RequestStream sendMutations; RequestStream initVersionBatch; - RequestStream collectRestoreRoleInterfaces; // TODO: Change to collectRestoreRoleInterfaces + RequestStream collectRestoreRoleInterfaces; RequestStream 
finishRestore; bool operator==(RestoreWorkerInterface const& r) const { return id() == r.id(); } @@ -146,6 +149,7 @@ struct RestoreLoaderInterface : RestoreRoleInterface { updateRestoreSysInfo.getEndpoint(TaskPriority::LoadBalancedEndpoint); setApplierKeyRangeVectorRequest.getEndpoint(TaskPriority::LoadBalancedEndpoint); loadFile.getEndpoint(TaskPriority::LoadBalancedEndpoint); + sendMutations.getEndpoint(TaskPriority::LoadBalancedEndpoint); initVersionBatch.getEndpoint(TaskPriority::LoadBalancedEndpoint); collectRestoreRoleInterfaces.getEndpoint(TaskPriority::LoadBalancedEndpoint); finishRestore.getEndpoint(TaskPriority::LoadBalancedEndpoint); @@ -154,7 +158,7 @@ struct RestoreLoaderInterface : RestoreRoleInterface { template void serialize(Ar& ar) { serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, setApplierKeyRangeVectorRequest, - loadFile, initVersionBatch, collectRestoreRoleInterfaces, finishRestore); + loadFile, sendMutations, initVersionBatch, collectRestoreRoleInterfaces, finishRestore); } }; @@ -342,6 +346,29 @@ struct RestoreLoadFileRequest : TimedRequest { } }; +struct RestoreSendMutationsToAppliersRequest : TimedRequest { + constexpr static FileIdentifier file_identifier = 68827305; + + std::map rangeToApplier; + + ReplyPromise reply; + + RestoreSendMutationsToAppliersRequest() = default; + explicit RestoreSendMutationsToAppliersRequest(std::map rangeToApplier) + : rangeToApplier(rangeToApplier) {} + + template + void serialize(Ar& ar) { + serializer(ar, rangeToApplier, reply); + } + + std::string toString() { + std::stringstream ss; + ss << "RestoreSendMutationsToAppliersRequest keyToAppliers.size:" << rangeToApplier.size(); + return ss.str(); + } +}; + struct RestoreSendMutationVectorVersionedRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 69764565; @@ -393,6 +420,7 @@ struct RestoreVersionBatchRequest : TimedRequest { } }; +// TODO: To delete this request struct 
RestoreSetApplierKeyRangeVectorRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 92038306; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 2f49caaeb5..7627deaa39 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -44,6 +44,8 @@ void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVector Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, bool isSampling = false); +ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, + Reference self); ACTOR Future sendMutationsToApplier(Reference self, VersionedMutationsMap* kvOps, bool isRangeFile, Version startVersion, Version endVersion, int fileIndex); ACTOR static Future _parseLogFileToMutationsOnLoader( @@ -84,6 +86,10 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no self->initBackupContainer(req.param.url); actors.add(handleLoadFileRequest(req, self, false)); } + when(RestoreSendMutationsToAppliersRequest req = waitNext(loaderInterf.sendMutations.getFuture())) { + requestTypeStr = "sendMutations"; + actors.add(handleSendMutationsRequest(req, self)); + } when(RestoreVersionBatchRequest req = waitNext(loaderInterf.initVersionBatch.getFuture())) { requestTypeStr = "initVersionBatch"; wait(handleInitVersionBatchRequest(req, self)); @@ -144,10 +150,10 @@ ACTOR Future _processLoadingParam(LoadingParam param, Referenceid()).detail("StartProcessLoadParam", param.toString()); ASSERT(param.blockSize > 0); ASSERT(param.offset % param.blockSize == 0); // Parse file must be at block bondary. 
+ ASSERT(self->kvOpsPerLP.find(param) == self->kvOpsPerLP.end()); - // Temporary data structure for parsing range and log files into (version, ) + // Temporary data structure for parsing log files into (version, ) // Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted - state VersionedMutationsMap kvOps; // mutationMap: Key is the unique identifier for a batch of mutation logs at the same version state SerializedMutationListMap mutationMap; state std::map, uint32_t> mutationPartMap; // Sanity check the data parsing is correct @@ -161,8 +167,9 @@ ACTOR Future _processLoadingParam(LoadingParam param, Reference(param.blockSize, param.length - j); if (param.isRangeFile) { - fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader( - &kvOps, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange)); + fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(&self->kvOpsPerLP[param], self->bc, + param.version, param.filename, readOffset, + readLen, param.restoreRange)); } else { fileParserFutures.push_back(_parseLogFileToMutationsOnLoader( &processedFileOffset, &mutationMap, &mutationPartMap, self->bc, param.version, param.filename, @@ -172,12 +179,9 @@ ACTOR Future _processLoadingParam(LoadingParam param, ReferencekvOpsPerLP[param], &mutationMap); } - // Send the parsed mutation to applier who will apply the mutation to DB - wait(sendMutationsToApplier(self, &kvOps, param.isRangeFile, param.prevVersion, param.endVersion, param.fileIndex)); - TraceEvent("FastRestore").detail("Loader", self->id()).detail("FinishLoadingFile", param.filename); return Void(); @@ -196,6 +200,26 @@ ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, ReferenceprocessedFileParams.find(req.param) != self->processedFileParams.end()); wait(self->processedFileParams[req.param]); // wait on the processing of the req.param. 
+ // TODO: Send sampled mutations back to master + req.reply.send(RestoreCommonReply(self->id())); + return Void(); +} + +ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, + Reference self) { + state int i = 0; + for (; i <= 1; i++) { + state bool useRangeFile = (i == 1); + // Send mutations from log files first to ensure log mutation at the same version is before the range kv + state std::map::iterator item = self->kvOpsPerLP.begin(); + for (; item != self->kvOpsPerLP.end(); item++) { + if (item->first.isRangeFile == useRangeFile) { + // Send the parsed mutation to applier who will apply the mutation to DB + wait(sendMutationsToApplier(self, &item->second, item->first.isRangeFile, item->first.prevVersion, + item->first.endVersion, item->first.fileIndex)); + } + } + } req.reply.send(RestoreCommonReply(self->id())); return Void(); } diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index 0c1f6023b2..d2cfdc9ccb 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -42,8 +42,14 @@ #include "flow/actorcompiler.h" // has to be last include +// Buffer for mutations parsed from a backup file +// struct ParsedMutationBuffer { +// VersionedMutationsMap kvOps; +// } + struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted { std::map> processedFileParams; + std::map kvOpsPerLP; // Buffered kvOps for each loading param // rangeToApplier is in master and loader. 
Loader uses this to determine which applier a mutation should be sent // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for @@ -79,6 +85,7 @@ struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted loadFilesOnLoaders(Reference self, return Void(); } +// Ask loaders to send its buffered mutations to appliers +ACTOR static Future sendMutationsFromLoaders(Reference self) { + TraceEvent("FastRestore").detail("SendMutationsFromLoaders", self->batchIndex); + + std::vector> requests; + for (auto& loader : self->loadersInterf) { + requests.push_back(std::make_pair(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier))); + } + wait(sendBatchRequests(&RestoreLoaderInterface::sendMutations, self->loadersInterf, requests)); + + return Void(); +} + ACTOR static Future distributeWorkloadPerVersionBatch(Reference self, Database cx, RestoreRequest request, VersionBatch versionBatch) { ASSERT(!versionBatch.isEmpty()); @@ -315,13 +328,16 @@ ACTOR static Future distributeWorkloadPerVersionBatch(ReferenceloadersInterf.size() > 0); ASSERT(self->appliersInterf.size() > 0); - dummySampleWorkload(self); - wait(notifyLoaderAppliersKeyRange(self)); + dummySampleWorkload(self); // TODO: Delete + wait(notifyLoaderAppliersKeyRange(self)); // TODO: Delete // Parse log files and send mutations to appliers before we parse range files + // TODO: Allow loading both range and log files in parallel wait(loadFilesOnLoaders(self, cx, request, versionBatch, false)); wait(loadFilesOnLoaders(self, cx, request, versionBatch, true)); + wait(sendMutationsFromLoaders(self)); + wait(notifyApplierToApplyMutations(self)); return Void(); diff --git a/fdbserver/RestoreMaster.actor.h b/fdbserver/RestoreMaster.actor.h index 7f8822e829..3cfb0956b4 100644 --- a/fdbserver/RestoreMaster.actor.h +++ b/fdbserver/RestoreMaster.actor.h @@ -54,7 +54,7 @@ struct VersionBatch { struct RestoreMasterData : RestoreRoleData, public ReferenceCounted { // 
rangeToApplier is in master and loader node. Loader uses this to determine which applier a mutation should be sent. // KeyRef is the inclusive lower bound of the key range the applier (UID) is responsible for - std::map, UID> rangeToApplier; + std::map rangeToApplier; std::map versionBatches; // key is the beginVersion of the version batch int batchIndex; From 592f4c0fc466d1dd5232b3c4e2a0d880b3186fe1 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Tue, 12 Nov 2019 17:17:00 -0800 Subject: [PATCH 169/184] FastRestore:Remove RestoreSetApplierKeyRangeVectorRequest --- fdbclient/RestoreWorkerInterface.actor.h | 32 ++---------------------- fdbserver/RestoreLoader.actor.cpp | 26 ++++--------------- fdbserver/RestoreMaster.actor.cpp | 13 ---------- fdbserver/RestoreUtil.h | 4 +-- fdbserver/RestoreWorker.actor.cpp | 5 ++-- 5 files changed, 12 insertions(+), 68 deletions(-) diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index d3e4790c9f..58ba4f3de9 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -49,7 +49,6 @@ struct RestoreLoadFileRequest; struct RestoreVersionBatchRequest; struct RestoreSendMutationsToAppliersRequest; struct RestoreSendMutationVectorVersionedRequest; -struct RestoreSetApplierKeyRangeVectorRequest; struct RestoreSysInfo; struct RestoreApplierInterface; @@ -126,8 +125,6 @@ struct RestoreLoaderInterface : RestoreRoleInterface { RequestStream heartbeat; RequestStream updateRestoreSysInfo; - // TODO: delete setApplierKeyRangeVectorRequest because sendMutations does the job - RequestStream setApplierKeyRangeVectorRequest; RequestStream loadFile; RequestStream sendMutations; RequestStream initVersionBatch; @@ -147,7 +144,6 @@ struct RestoreLoaderInterface : RestoreRoleInterface { void initEndpoints() { heartbeat.getEndpoint(TaskPriority::LoadBalancedEndpoint); updateRestoreSysInfo.getEndpoint(TaskPriority::LoadBalancedEndpoint); - 
setApplierKeyRangeVectorRequest.getEndpoint(TaskPriority::LoadBalancedEndpoint); loadFile.getEndpoint(TaskPriority::LoadBalancedEndpoint); sendMutations.getEndpoint(TaskPriority::LoadBalancedEndpoint); initVersionBatch.getEndpoint(TaskPriority::LoadBalancedEndpoint); @@ -157,8 +153,8 @@ struct RestoreLoaderInterface : RestoreRoleInterface { template void serialize(Ar& ar) { - serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, setApplierKeyRangeVectorRequest, - loadFile, sendMutations, initVersionBatch, collectRestoreRoleInterfaces, finishRestore); + serializer(ar, *(RestoreRoleInterface*)this, heartbeat, updateRestoreSysInfo, loadFile, sendMutations, + initVersionBatch, collectRestoreRoleInterfaces, finishRestore); } }; @@ -420,30 +416,6 @@ struct RestoreVersionBatchRequest : TimedRequest { } }; -// TODO: To delete this request -struct RestoreSetApplierKeyRangeVectorRequest : TimedRequest { - constexpr static FileIdentifier file_identifier = 92038306; - - std::map, UID> rangeToApplier; - - ReplyPromise reply; - - RestoreSetApplierKeyRangeVectorRequest() = default; - explicit RestoreSetApplierKeyRangeVectorRequest(std::map, UID> rangeToApplier) - : rangeToApplier(rangeToApplier) {} - - template - void serialize(Ar& ar) { - serializer(ar, rangeToApplier, reply); - } - - std::string toString() { - std::stringstream ss; - ss << "RestoreVersionBatchRequest rangeToApplierSize:" << rangeToApplier.size(); - return ss.str(); - } -}; - struct RestoreRequest { constexpr static FileIdentifier file_identifier = 49589770; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 7627deaa39..d589ecb632 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -40,8 +40,6 @@ void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationLi bool isSampling = false); void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self); -void 
handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVectorRequest& req, - Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, bool isSampling = false); ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, @@ -76,11 +74,6 @@ ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int no requestTypeStr = "updateRestoreSysInfo"; handleRestoreSysInfoRequest(req, self); } - when(RestoreSetApplierKeyRangeVectorRequest req = - waitNext(loaderInterf.setApplierKeyRangeVectorRequest.getFuture())) { - requestTypeStr = "setApplierKeyRangeVectorRequest"; - handleSetApplierKeyRangeVectorRequest(req, self); - } when(RestoreLoadFileRequest req = waitNext(loaderInterf.loadFile.getFuture())) { requestTypeStr = "loadFile"; self->initBackupContainer(req.param.url); @@ -131,20 +124,6 @@ void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Referenceid())); } -void handleSetApplierKeyRangeVectorRequest(const RestoreSetApplierKeyRangeVectorRequest& req, - Reference self) { - TraceEvent("FastRestore") - .detail("Loader", self->id()) - .detail("SetApplierKeyRangeVector", req.rangeToApplier.size()); - // Idempodent operation. 
OK to re-execute the duplicate cmd - if (self->rangeToApplier.empty()) { - self->rangeToApplier = req.rangeToApplier; - } else { - ASSERT(self->rangeToApplier == req.rangeToApplier); - } - req.reply.send(RestoreCommonReply(self->id())); -} - ACTOR Future _processLoadingParam(LoadingParam param, Reference self) { // Q: How to record the param's fields inside LoadingParam Refer to storageMetrics TraceEvent("FastRestore").detail("Loader", self->id()).detail("StartProcessLoadParam", param.toString()); @@ -207,6 +186,11 @@ ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference handleSendMutationsRequest(RestoreSendMutationsToAppliersRequest req, Reference self) { + if (self->rangeToApplier.empty()) { + self->rangeToApplier = req.rangeToApplier; + } else { + ASSERT(self->rangeToApplier == req.rangeToApplier); + } state int i = 0; for (; i <= 1; i++) { state bool useRangeFile = (i == 1); diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index f135d712ac..490f926815 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -51,7 +51,6 @@ ACTOR static Future distributeRestoreSysInfo(Reference ACTOR static Future>> collectRestoreRequests(Database cx); ACTOR static Future initializeVersionBatch(Reference self); -ACTOR static Future notifyLoaderAppliersKeyRange(Reference self); ACTOR static Future notifyApplierToApplyMutations(Reference self); ACTOR static Future notifyRestoreCompleted(Reference self, Database cx); @@ -329,7 +328,6 @@ ACTOR static Future distributeWorkloadPerVersionBatch(ReferenceappliersInterf.size() > 0); dummySampleWorkload(self); // TODO: Delete - wait(notifyLoaderAppliersKeyRange(self)); // TODO: Delete // Parse log files and send mutations to appliers before we parse range files // TODO: Allow loading both range and log files in parallel @@ -482,17 +480,6 @@ ACTOR static Future notifyApplierToApplyMutations(Reference notifyLoaderAppliersKeyRange(Reference self) { - 
std::vector> requests; - for (auto& loader : self->loadersInterf) { - requests.push_back(std::make_pair(loader.first, RestoreSetApplierKeyRangeVectorRequest(self->rangeToApplier))); - } - wait(sendBatchRequests(&RestoreLoaderInterface::setApplierKeyRangeVectorRequest, self->loadersInterf, requests)); - - return Void(); -} - // Ask all loaders and appliers to perform housecleaning at the end of restore and // Register the restoreRequestDoneKey to signal the end of restore ACTOR static Future notifyRestoreCompleted(Reference self, Database cx) { diff --git a/fdbserver/RestoreUtil.h b/fdbserver/RestoreUtil.h index a645d3a391..698cc33af2 100644 --- a/fdbserver/RestoreUtil.h +++ b/fdbserver/RestoreUtil.h @@ -34,8 +34,8 @@ #include #include -#define SevFRMutationInfo SevVerbose -//#define SevFRMutationInfo SevInfo +//#define SevFRMutationInfo SevVerbose +#define SevFRMutationInfo SevInfo enum class RestoreRole { Invalid = 0, Master = 1, Loader, Applier }; BINARY_SERIALIZABLE(RestoreRole); diff --git a/fdbserver/RestoreWorker.actor.cpp b/fdbserver/RestoreWorker.actor.cpp index a1253a3757..becbc75ddb 100644 --- a/fdbserver/RestoreWorker.actor.cpp +++ b/fdbserver/RestoreWorker.actor.cpp @@ -1,5 +1,5 @@ /* - * Restore.actor.cpp + * RestoreWorker.actor.cpp * * This source file is part of the FoundationDB open source project * @@ -98,8 +98,9 @@ ACTOR Future handleRecruitRoleRequest(RestoreRecruitRoleRequest req, Refer self->loaderInterf = RestoreLoaderInterface(); self->loaderInterf.get().initEndpoints(); RestoreLoaderInterface& recruited = self->loaderInterf.get(); - DUMPTOKEN(recruited.setApplierKeyRangeVectorRequest); DUMPTOKEN(recruited.initVersionBatch); + DUMPTOKEN(recruited.loadFile); + DUMPTOKEN(recruited.sendMutations); DUMPTOKEN(recruited.collectRestoreRoleInterfaces); DUMPTOKEN(recruited.finishRestore); actors->add(restoreLoaderCore(self->loaderInterf.get(), req.nodeIndex, cx)); From 9e36b897e6fd1636828559a99994a8bac5f4b8b6 Mon Sep 17 00:00:00 2001 From: Meng Xu 
Date: Tue, 12 Nov 2019 18:23:14 -0800 Subject: [PATCH 170/184] FastRestore:Loaders must send to appliers log files data before range files --- fdbclient/RestoreWorkerInterface.actor.h | 10 ++++++---- fdbserver/RestoreLoader.actor.cpp | 20 +++++++++----------- fdbserver/RestoreLoader.actor.h | 5 ----- fdbserver/RestoreMaster.actor.cpp | 16 ++++++++++++---- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fdbclient/RestoreWorkerInterface.actor.h b/fdbclient/RestoreWorkerInterface.actor.h index 58ba4f3de9..e0664a4b8a 100644 --- a/fdbclient/RestoreWorkerInterface.actor.h +++ b/fdbclient/RestoreWorkerInterface.actor.h @@ -346,21 +346,23 @@ struct RestoreSendMutationsToAppliersRequest : TimedRequest { constexpr static FileIdentifier file_identifier = 68827305; std::map rangeToApplier; + bool useRangeFile; // Send mutations parsed from range file? ReplyPromise reply; RestoreSendMutationsToAppliersRequest() = default; - explicit RestoreSendMutationsToAppliersRequest(std::map rangeToApplier) - : rangeToApplier(rangeToApplier) {} + explicit RestoreSendMutationsToAppliersRequest(std::map rangeToApplier, bool useRangeFile) + : rangeToApplier(rangeToApplier), useRangeFile(useRangeFile) {} template void serialize(Ar& ar) { - serializer(ar, rangeToApplier, reply); + serializer(ar, rangeToApplier, useRangeFile, reply); } std::string toString() { std::stringstream ss; - ss << "RestoreSendMutationsToAppliersRequest keyToAppliers.size:" << rangeToApplier.size(); + ss << "RestoreSendMutationsToAppliersRequest keyToAppliers.size:" << rangeToApplier.size() + << " useRangeFile:" << useRangeFile; return ss.str(); } }; diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index d589ecb632..72276f61e5 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -191,19 +191,17 @@ ACTOR Future handleSendMutationsRequest(RestoreSendMutationsToAppliersRequ } else { ASSERT(self->rangeToApplier == req.rangeToApplier); } - 
state int i = 0; - for (; i <= 1; i++) { - state bool useRangeFile = (i == 1); - // Send mutations from log files first to ensure log mutation at the same version is before the range kv - state std::map::iterator item = self->kvOpsPerLP.begin(); - for (; item != self->kvOpsPerLP.end(); item++) { - if (item->first.isRangeFile == useRangeFile) { - // Send the parsed mutation to applier who will apply the mutation to DB - wait(sendMutationsToApplier(self, &item->second, item->first.isRangeFile, item->first.prevVersion, - item->first.endVersion, item->first.fileIndex)); - } + + // Send mutations from log files first to ensure log mutation at the same version is before the range kv + state std::map::iterator item = self->kvOpsPerLP.begin(); + for (; item != self->kvOpsPerLP.end(); item++) { + if (item->first.isRangeFile == req.useRangeFile) { + // Send the parsed mutation to applier who will apply the mutation to DB + wait(sendMutationsToApplier(self, &item->second, item->first.isRangeFile, item->first.prevVersion, + item->first.endVersion, item->first.fileIndex)); } } + req.reply.send(RestoreCommonReply(self->id())); return Void(); } diff --git a/fdbserver/RestoreLoader.actor.h b/fdbserver/RestoreLoader.actor.h index d2cfdc9ccb..83331fb26e 100644 --- a/fdbserver/RestoreLoader.actor.h +++ b/fdbserver/RestoreLoader.actor.h @@ -42,11 +42,6 @@ #include "flow/actorcompiler.h" // has to be last include -// Buffer for mutations parsed from a backup file -// struct ParsedMutationBuffer { -// VersionedMutationsMap kvOps; -// } - struct RestoreLoaderData : RestoreRoleData, public ReferenceCounted { std::map> processedFileParams; std::map kvOpsPerLP; // Buffered kvOps for each loading param diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index 490f926815..eac12844c0 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -308,12 +308,15 @@ ACTOR static Future loadFilesOnLoaders(Reference self, } // Ask loaders 
to send its buffered mutations to appliers -ACTOR static Future sendMutationsFromLoaders(Reference self) { - TraceEvent("FastRestore").detail("SendMutationsFromLoaders", self->batchIndex); +ACTOR static Future sendMutationsFromLoaders(Reference self, bool useRangeFile) { + TraceEvent("FastRestore") + .detail("SendMutationsFromLoaders", self->batchIndex) + .detail("UseRangeFiles", useRangeFile); std::vector> requests; for (auto& loader : self->loadersInterf) { - requests.push_back(std::make_pair(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier))); + requests.push_back( + std::make_pair(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier, useRangeFile))); } wait(sendBatchRequests(&RestoreLoaderInterface::sendMutations, self->loadersInterf, requests)); @@ -334,7 +337,11 @@ ACTOR static Future distributeWorkloadPerVersionBatch(Reference self) { } else { self->rangeToApplier[StringRef(keyrangeSplitter[i].toString())] = applier.first; } + i++; } self->logApplierKeyRange(); } From b4aa72303ffb6ef5f4bf47fcf8bba7bd5c510ef3 Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 13 Nov 2019 13:30:34 -0800 Subject: [PATCH 171/184] Add [[nodiscard]] for whenAtLeast, and make Notified generic --- fdbclient/Notified.h | 105 +++++++++++++++--------------------------- flow/TDMetric.actor.h | 9 ++-- 2 files changed, 41 insertions(+), 73 deletions(-) diff --git a/fdbclient/Notified.h b/fdbclient/Notified.h index 80a87192f0..cd42f96240 100644 --- a/fdbclient/Notified.h +++ b/fdbclient/Notified.h @@ -25,103 +25,70 @@ #include "fdbclient/FDBTypes.h" #include "flow/TDMetric.actor.h" -struct NotifiedVersion { - NotifiedVersion( StringRef& name, StringRef const &id, Version version = 0 ) : val(name, id, version) { val = version; } - NotifiedVersion( Version version = 0 ) : val(StringRef(), StringRef(), version) {} +template +struct IsMetricHandle : std::false_type {}; +template +struct IsMetricHandle> : std::true_type {}; - void initMetric(const 
StringRef& name, const StringRef &id) { - Version version = val; - val.init(name, id); - val = version; - } +template +struct Notified { + explicit Notified(ValueType v = 0) { val = v; } - Future whenAtLeast( Version limit ) { - if (val >= limit) - return Void(); + [[nodiscard]] Future whenAtLeast(const ValueType& limit) { + if (val >= limit) return Void(); Promise p; - waiting.push( std::make_pair(limit,p) ); + waiting.push(std::make_pair(limit, p)); return p.getFuture(); } - Version get() const { return val; } + [[nodiscard]] ValueType get() const { return val; } - void set( Version v ) { - ASSERT( v >= val ); + void initMetric(const StringRef& name, const StringRef& id) { + if constexpr (IsMetricHandle::value) { + Version version = val; + val.init(name, id); + val = version; + } else { + TraceEvent(SevError, "InvalidNotifiedOperation") + .detail("Reason", "Notified where T is not a metric: Can't use initMetric"); + } + } + + void set(const ValueType& v) { + ASSERT(v >= val); if (v != val) { val = v; std::vector> toSend; - while ( waiting.size() && v >= waiting.top().first ) { + while (waiting.size() && v >= waiting.top().first) { Promise p = std::move(waiting.top().second); waiting.pop(); toSend.push_back(p); } - for(auto& p : toSend) { + for (auto& p : toSend) { p.send(Void()); } } } - void operator=( Version v ) { - set( v ); + void operator=(const ValueType& v) { set(v); } + + Notified(Notified&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {} + void operator=(Notified&& r) BOOST_NOEXCEPT { + waiting = std::move(r.waiting); + val = std::move(r.val); } - NotifiedVersion(NotifiedVersion&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(std::move(r.val)) {} - void operator=(NotifiedVersion&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = std::move(r.val); } - private: - typedef std::pair> Item; + using Item = std::pair>; struct ItemCompare { bool operator()(const Item& a, const Item& b) { return a.first > b.first; } 
}; std::priority_queue, ItemCompare> waiting; - VersionMetricHandle val; + T val; }; -struct NotifiedDouble { - explicit NotifiedDouble( double val = 0 ) : val(val) {} - - Future whenAtLeast( double limit ) { - if (val >= limit) - return Void(); - Promise p; - waiting.push( std::make_pair(limit,p) ); - return p.getFuture(); - } - - double get() const { return val; } - - void set( double v ) { - ASSERT( v >= val ); - if (v != val) { - val = v; - - std::vector> toSend; - while ( waiting.size() && v >= waiting.top().first ) { - Promise p = std::move(waiting.top().second); - waiting.pop(); - toSend.push_back(p); - } - for(auto& p : toSend) { - p.send(Void()); - } - } - } - - void operator=( double v ) { - set( v ); - } - - NotifiedDouble(NotifiedDouble&& r) BOOST_NOEXCEPT : waiting(std::move(r.waiting)), val(r.val) {} - void operator=(NotifiedDouble&& r) BOOST_NOEXCEPT { waiting = std::move(r.waiting); val = r.val; } - -private: - typedef std::pair> Item; - struct ItemCompare { - bool operator()(const Item& a, const Item& b) { return a.first > b.first; } - }; - std::priority_queue, ItemCompare> waiting; - double val; -}; +using NotifiedVersion = Notified; +using NotifiedDouble = Notified; #endif diff --git a/flow/TDMetric.actor.h b/flow/TDMetric.actor.h index 32eb8ceaae..73205b2481 100755 --- a/flow/TDMetric.actor.h +++ b/flow/TDMetric.actor.h @@ -1350,10 +1350,11 @@ typedef ContinuousMetric> StringMetric; // template struct MetricHandle { - template - MetricHandle(StringRef const &name = StringRef(), StringRef const &id = StringRef(), ValueType const &initial = ValueType()) - : ref(T::getOrCreateInstance(name, id, true, initial)) { - } + using ValueType = typename T::ValueType; + + MetricHandle(StringRef const& name = StringRef(), StringRef const& id = StringRef(), + ValueType const& initial = ValueType()) + : ref(T::getOrCreateInstance(name, id, true, initial)) {} // Initialize this handle to point to a new or existing metric with (name, id). 
If a new metric is created then the handle's // current metric's current value will be the new metric's initial value. This allows Metric handle users to treate their From 1f547eab23f9ed7e55a8653c8a0b508f8635ebef Mon Sep 17 00:00:00 2001 From: Andrew Noyes Date: Wed, 13 Nov 2019 15:32:52 -0800 Subject: [PATCH 172/184] Version -> ValueType --- fdbclient/Notified.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fdbclient/Notified.h b/fdbclient/Notified.h index cd42f96240..d0cd4ec846 100644 --- a/fdbclient/Notified.h +++ b/fdbclient/Notified.h @@ -45,9 +45,9 @@ struct Notified { void initMetric(const StringRef& name, const StringRef& id) { if constexpr (IsMetricHandle::value) { - Version version = val; + ValueType v = val; val.init(name, id); - val = version; + val = v; } else { TraceEvent(SevError, "InvalidNotifiedOperation") .detail("Reason", "Notified where T is not a metric: Can't use initMetric"); From 6e2a6082ea56a41bc00e7d71a629af6da414c4cd Mon Sep 17 00:00:00 2001 From: mpilman Date: Wed, 13 Nov 2019 17:26:01 -0800 Subject: [PATCH 173/184] addressed review comments --- documentation/tutorial/tutorial.actor.cpp | 30 ++++++++++------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/documentation/tutorial/tutorial.actor.cpp b/documentation/tutorial/tutorial.actor.cpp index df30da4d07..d0be6a3e2b 100644 --- a/documentation/tutorial/tutorial.actor.cpp +++ b/documentation/tutorial/tutorial.actor.cpp @@ -53,15 +53,14 @@ ACTOR Future simpleTimer() { // A actor that demonstrates how choose-when // blocks work. 
ACTOR Future someFuture(Future ready) { - loop { - choose { - when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } - when(int r = wait(ready)) { - std::cout << format("Ready %d\n", r); - wait(delay(double(r))); - std::cout << "Done\n"; - return Void(); - } + // loop choose {} works as well here - the braces are optional + loop choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(int r = wait(ready)) { + std::cout << format("Ready %d\n", r); + wait(delay(double(r))); + std::cout << "Done\n"; + return Void(); } } } @@ -76,12 +75,9 @@ ACTOR Future promiseDemo() { } ACTOR Future eventLoop(AsyncTrigger* trigger) { - loop { - - choose { - when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } - when(wait(trigger->onTrigger())) { std::cout << "Triggered!\n"; } - } + loop choose { + when(wait(delay(0.5))) { std::cout << "Still waiting...\n"; } + when(wait(trigger->onTrigger())) { std::cout << "Triggered!\n"; } } } @@ -418,14 +414,14 @@ int main(int argc, char* argv[]) { if (arg == "-p") { isServer = true; if (i + 1 >= argc) { - std::cout << "Excpecting an argument after -p\n"; + std::cout << "Expecting an argument after -p\n"; return 1; } port = std::string(argv[++i]); continue; } else if (arg == "-s") { if (i + 1 >= argc) { - std::cout << "Excpecting an argument after -s\n"; + std::cout << "Expecting an argument after -s\n"; return 1; } serverAddress = NetworkAddress::parse(argv[++i]); From 5144e57e11c108185018dc03fb041547b8d37a27 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Thu, 14 Nov 2019 14:49:51 -0800 Subject: [PATCH 174/184] Reenable restart tests in from_5.* --- tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7b5f118051..d24ea3a9df 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -165,10 +165,10 @@ add_fdb_test( restarting/from_6.2.0/SnapCycleRestart-2.txt) add_fdb_test( TEST_FILES 
restarting/from_5.1.7/DrUpgradeRestart-1.txt - restarting/from_5.1.7/DrUpgradeRestart-2.txt IGNORE) + restarting/from_5.1.7/DrUpgradeRestart-2.txt) add_fdb_test( TEST_FILES restarting/from_5.2.0/ClientTransactionProfilingCorrectness-1.txt - restarting/from_5.2.0/ClientTransactionProfilingCorrectness-2.txt IGNORE) + restarting/from_5.2.0/ClientTransactionProfilingCorrectness-2.txt) add_fdb_test(TEST_FILES slow/ApiCorrectness.txt) add_fdb_test(TEST_FILES slow/ApiCorrectnessAtomicRestore.txt) add_fdb_test(TEST_FILES slow/ApiCorrectnessSwitchover.txt) From 3f5491318dce09a7a8eb3546ba30064d2f0914c3 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 13 Nov 2019 10:57:21 -0800 Subject: [PATCH 175/184] FastRestore:Fix bug that cause nondeterminism 1) Use map iterator instead of pointer to maintain stability when map is inserted or deleted 2) dummySampleWorkload: clear rangeToApplier data in each sampling phase. otherwise, we can have an increasing number of keys assigned to the applier. --- fdbserver/RestoreLoader.actor.cpp | 20 +++++++++++-------- fdbserver/RestoreMaster.actor.cpp | 9 +++++---- ...kupAndParallelRestoreCorrectness.actor.cpp | 2 +- fdbserver/workloads/ParallelRestore.actor.cpp | 2 +- 4 files changed, 19 insertions(+), 14 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 72276f61e5..9b97a8c843 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -36,7 +36,7 @@ typedef std::map, uint32_t> SerializedMutationPartMap; bool isRangeMutation(MutationRef m); void splitMutation(Reference self, MutationRef m, Arena& mvector_arena, VectorRef& mvector, Arena& nodeIDs_arena, VectorRef& nodeIDs); -void _parseSerializedMutation(VersionedMutationsMap* kvOps, SerializedMutationListMap* mutationMap, +void _parseSerializedMutation(std::map::iterator kvOpsIter, SerializedMutationListMap* mutationMap, bool isSampling = false); void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, 
Reference self); @@ -50,7 +50,7 @@ ACTOR static Future _parseLogFileToMutationsOnLoader( NotifiedVersion* pProcessedFileOffset, SerializedMutationListMap* mutationMap, SerializedMutationPartMap* mutationPartMap, Reference bc, Version version, std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange, Key addPrefix, Key removePrefix, Key mutationLogPrefix); -ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* kvOps, +ACTOR static Future _parseRangeFileToMutationsOnLoader(std::map::iterator kvOpsIter, Reference bc, Version version, std::string fileName, int64_t readOffset_input, int64_t readLen_input, KeyRange restoreRange); @@ -130,6 +130,10 @@ ACTOR Future _processLoadingParam(LoadingParam param, Reference 0); ASSERT(param.offset % param.blockSize == 0); // Parse file must be at block bondary. ASSERT(self->kvOpsPerLP.find(param) == self->kvOpsPerLP.end()); + // NOTE: map's iterator is guaranteed to be stable, but pointer may not. + //state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param]; + self->kvOpsPerLP.insert(std::make_pair(param, VersionedMutationsMap())); + state std::map::iterator kvOpsPerLPIter = self->kvOpsPerLP.find(param); // Temporary data structure for parsing log files into (version, ) // Must use StandAlone to save mutations, otherwise, the mutationref memory will be corrupted @@ -146,7 +150,7 @@ ACTOR Future _processLoadingParam(LoadingParam param, Reference(param.blockSize, param.length - j); if (param.isRangeFile) { - fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(&self->kvOpsPerLP[param], self->bc, + fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(kvOpsPerLPIter, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange)); } else { @@ -158,7 +162,7 @@ ACTOR Future _processLoadingParam(LoadingParam param, ReferencekvOpsPerLP[param], &mutationMap); + _parseSerializedMutation(kvOpsPerLPIter, &mutationMap); } 
TraceEvent("FastRestore").detail("Loader", self->id()).detail("FinishLoadingFile", param.filename); @@ -434,8 +438,8 @@ bool isRangeMutation(MutationRef m) { // we may not get the entire mutation list for the version encoded_list_of_mutations: // [mutation1][mutation2]...[mutationk], where // a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent] -void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationListMap* pmutationMap, bool isSampling) { - VersionedMutationsMap& kvOps = *pkvOps; +void _parseSerializedMutation(std::map::iterator kvOpsIter, SerializedMutationListMap* pmutationMap, bool isSampling) { + VersionedMutationsMap& kvOps = kvOpsIter->second; SerializedMutationListMap& mutationMap = *pmutationMap; for (auto& m : mutationMap) { @@ -477,11 +481,11 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL } // Parsing the data blocks in a range file -ACTOR static Future _parseRangeFileToMutationsOnLoader(VersionedMutationsMap* pkvOps, +ACTOR static Future _parseRangeFileToMutationsOnLoader(std::map::iterator kvOpsIter, Reference bc, Version version, std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange) { - state VersionedMutationsMap& kvOps = *pkvOps; + state VersionedMutationsMap& kvOps = kvOpsIter->second; // The set of key value version is rangeFile.version. the key-value set in the same range file has the same version Reference inFile = wait(bc->readFile(fileName)); diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index eac12844c0..e28948693f 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -352,19 +352,20 @@ ACTOR static Future distributeWorkloadPerVersionBatch(Reference self) { int numAppliers = self->appliersInterf.size(); - std::vector keyrangeSplitter; + std::vector keyrangeSplitter; // We will use the splitter at [1, numAppliers - 1]. 
The first splitter is normalKeys.begin int i; - for (i = 0; i < numAppliers - 1; i++) { - keyrangeSplitter.push_back(deterministicRandom()->randomUniqueID()); + for (i = 0; i < numAppliers; i++) { + keyrangeSplitter.push_back(Key(deterministicRandom()->randomUniqueID().toString())); } std::sort(keyrangeSplitter.begin(), keyrangeSplitter.end()); i = 0; + self->rangeToApplier.clear(); for (auto& applier : self->appliersInterf) { if (i == 0) { self->rangeToApplier[normalKeys.begin] = applier.first; } else { - self->rangeToApplier[StringRef(keyrangeSplitter[i].toString())] = applier.first; + self->rangeToApplier[Key(keyrangeSplitter[i])] = applier.first; } i++; } diff --git a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp index 0047633a13..389764353e 100644 --- a/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp +++ b/fdbserver/workloads/BackupAndParallelRestoreCorrectness.actor.cpp @@ -119,7 +119,7 @@ struct BackupAndParallelRestoreCorrectnessWorkload : TestWorkload { return; } - printf("[CheckDB] KV Number. Prev DB:%d Current DB:%d\n", self->dbKVs.size(), newDbKVs.size()); + printf("[CheckDB] KV Number. 
Prev DB:%ld Current DB:%ld\n", self->dbKVs.size(), newDbKVs.size()); // compare the KV pairs in the DB printf("------------------Now print out the diff between the prev DB and current DB-------------------\n"); if (self->dbKVs.size() >= newDbKVs.size()) { diff --git a/fdbserver/workloads/ParallelRestore.actor.cpp b/fdbserver/workloads/ParallelRestore.actor.cpp index aac39b592d..c877048a43 100644 --- a/fdbserver/workloads/ParallelRestore.actor.cpp +++ b/fdbserver/workloads/ParallelRestore.actor.cpp @@ -45,7 +45,7 @@ struct RunRestoreWorkerWorkload : TestWorkload { for (int i = 0; i < num_myWorkers; ++i) { myWorkers.push_back(_restoreWorker(cx, LocalityData())); } - printf("RunParallelRestoreWorkerWorkload, wait on reply from %d restore workers\n", myWorkers.size()); + printf("RunParallelRestoreWorkerWorkload, wait on reply from %ld restore workers\n", myWorkers.size()); worker = waitForAll(myWorkers); printf("RunParallelRestoreWorkerWorkload, got all replies from restore workers\n"); return Void(); From ed8d3f163c812ca493cc3225bcc0c8805faab984 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 15 Nov 2019 12:26:51 -0800 Subject: [PATCH 176/184] Rename hgVersion to sourceVersion. 
--- .gitignore | 4 ++-- fdbbackup/backup.actor.cpp | 6 +++--- fdbcli/fdbcli.actor.cpp | 6 +++--- fdbclient/NativeAPI.actor.cpp | 4 ++-- fdbclient/ThreadSafeTransaction.actor.cpp | 4 ++-- fdbrpc/fdbrpc.vcxproj | 4 ++-- fdbserver/SimulatedCluster.actor.cpp | 4 ++-- fdbserver/fdbserver.actor.cpp | 6 +++--- flow/CMakeLists.txt | 4 ++-- flow/SourceVersion.h.cmake | 2 ++ flow/flow.vcxproj | 4 ++-- flow/hgVersion.h.cmake | 2 -- flow/local.mk | 10 +++++----- flow/version.cpp | 6 +++--- 14 files changed, 33 insertions(+), 33 deletions(-) create mode 100644 flow/SourceVersion.h.cmake delete mode 100644 flow/hgVersion.h.cmake diff --git a/.gitignore b/.gitignore index 65c99da30e..001107847d 100644 --- a/.gitignore +++ b/.gitignore @@ -30,9 +30,9 @@ bindings/python/MANIFEST bindings/ruby/lib/fdboptions.rb bindings/ruby/fdb.gemspec fdbclient/vexillographer/obj/ -fdbrpc/hgVersion*.h +fdbrpc/SourceVersion*.h fdbrpc/libeio/config.h -flow/hgVersion*.h +flow/SourceVersion*.h generated.mk versions.h packaging/msi/FDBInstaller.wix* diff --git a/fdbbackup/backup.actor.cpp b/fdbbackup/backup.actor.cpp index 5bcb836e9f..38e7d0fb73 100644 --- a/fdbbackup/backup.actor.cpp +++ b/fdbbackup/backup.actor.cpp @@ -826,7 +826,7 @@ const KeyRef exeFastRestoreAgent = LiteralStringRef("fastrestore_agent"); // mus const KeyRef exeDatabaseAgent = LiteralStringRef("dr_agent"); const KeyRef exeDatabaseBackup = LiteralStringRef("fdbdr"); -extern const char* getHGVersion(); +extern const char* getSourceVersion(); #ifdef _WIN32 void parentWatcher(void *parentHandle) { @@ -842,7 +842,7 @@ void parentWatcher(void *parentHandle) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %llx\n", (long long) currentProtocolVersion.version()); } @@ -3459,7 +3459,7 @@ int main(int argc, char* argv[]) { TraceEvent("ProgramStart") 
.setMaxEventLength(12000) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION ) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL)) diff --git a/fdbcli/fdbcli.actor.cpp b/fdbcli/fdbcli.actor.cpp index 7bf4ab54ab..ae5350f76a 100644 --- a/fdbcli/fdbcli.actor.cpp +++ b/fdbcli/fdbcli.actor.cpp @@ -54,7 +54,7 @@ #include "flow/actorcompiler.h" // This must be the last #include. -extern const char* getHGVersion(); +extern const char* getSourceVersion(); std::vector validOptions; @@ -563,7 +563,7 @@ void initHelp() { void printVersion() { printf("FoundationDB CLI " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); } @@ -2623,7 +2623,7 @@ ACTOR Future cli(CLIOptions opt, LineNoise* plinenoise) { if (opt.trace) { TraceEvent("CLIProgramStart") .setMaxEventLength(12000) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(NULL)) diff --git a/fdbclient/NativeAPI.actor.cpp b/fdbclient/NativeAPI.actor.cpp index 7740b562b6..34bbc60ed3 100644 --- a/fdbclient/NativeAPI.actor.cpp +++ b/fdbclient/NativeAPI.actor.cpp @@ -60,7 +60,7 @@ #endif #include "flow/actorcompiler.h" // This must be the last #include. 
-extern const char* getHGVersion(); +extern const char* getSourceVersion(); using std::max; using std::min; @@ -791,7 +791,7 @@ Database Database::createDatabase( Reference connFile, in openTraceFile(NetworkAddress(publicIP, ::getpid()), networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup); TraceEvent("ClientStart") - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("ClusterFile", connFile->getFilename().c_str()) diff --git a/fdbclient/ThreadSafeTransaction.actor.cpp b/fdbclient/ThreadSafeTransaction.actor.cpp index 7772aae862..c71482b3b9 100644 --- a/fdbclient/ThreadSafeTransaction.actor.cpp +++ b/fdbclient/ThreadSafeTransaction.actor.cpp @@ -333,9 +333,9 @@ void ThreadSafeTransaction::reset() { onMainThreadVoid( [tr](){ tr->reset(); }, NULL ); } -extern const char* getHGVersion(); +extern const char* getSourceVersion(); -ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getHGVersion(), currentProtocolVersion)), transportId(0) {} +ThreadSafeApi::ThreadSafeApi() : apiVersion(-1), clientVersion(format("%s,%s,%llx", FDB_VT_VERSION, getSourceVersion(), currentProtocolVersion)), transportId(0) {} void ThreadSafeApi::selectApiVersion(int apiVersion) { this->apiVersion = apiVersion; diff --git a/fdbrpc/fdbrpc.vcxproj b/fdbrpc/fdbrpc.vcxproj index b77c8d24f8..801321b336 100644 --- a/fdbrpc/fdbrpc.vcxproj +++ b/fdbrpc/fdbrpc.vcxproj @@ -163,8 +163,8 @@ - echo const char *hgVersion = "Current version id not currently supported within Windows."; > hgVersion.temp.h && fc /b hgVersion.temp.h hgVersion.h > nul || copy hgVersion.temp.h hgVersion.h > nul - Checking HG source version + echo const char *sourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b 
SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul + Checking source version fake.out diff --git a/fdbserver/SimulatedCluster.actor.cpp b/fdbserver/SimulatedCluster.actor.cpp index 4c56421b1f..d8e4988b2c 100644 --- a/fdbserver/SimulatedCluster.actor.cpp +++ b/fdbserver/SimulatedCluster.actor.cpp @@ -43,7 +43,7 @@ #undef min extern "C" int g_expect_full_pointermap; -extern const char* getHGVersion(); +extern const char* getSourceVersion(); const int MACHINE_REBOOT_TIME = 10; @@ -232,7 +232,7 @@ ACTOR Future simulatedFDBDRebooter(Referenceexcluded) .detail("UsingSSL", sslEnabled); TraceEvent("ProgramStart").detail("Cycles", cycles).detail("RandomId", randomId) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("DataFolder", *dataFolder) diff --git a/fdbserver/fdbserver.actor.cpp b/fdbserver/fdbserver.actor.cpp index cac1789297..d3f7377046 100644 --- a/fdbserver/fdbserver.actor.cpp +++ b/fdbserver/fdbserver.actor.cpp @@ -183,7 +183,7 @@ extern void createTemplateDatabase(); // FIXME: this really belongs in a header somewhere since it is actually used. 
extern IPAddress determinePublicIPAutomatically(ClusterConnectionString const& ccs); -extern const char* getHGVersion(); +extern const char* getSourceVersion(); extern void flushTraceFileVoid(); @@ -518,7 +518,7 @@ void* parentWatcher(void *arg) { static void printVersion() { printf("FoundationDB " FDB_VT_PACKAGE_NAME " (v" FDB_VT_VERSION ")\n"); - printf("source version %s\n", getHGVersion()); + printf("source version %s\n", getSourceVersion()); printf("protocol %" PRIx64 "\n", currentProtocolVersion.version()); } @@ -1672,7 +1672,7 @@ int main(int argc, char* argv[]) { TraceEvent("ProgramStart") .setMaxEventLength(12000) .detail("RandomSeed", opts.randomSeed) - .detail("SourceVersion", getHGVersion()) + .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detail("FileSystem", opts.fileSystemPath) diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt index 233e4e369f..ace8930c72 100644 --- a/flow/CMakeLists.txt +++ b/flow/CMakeLists.txt @@ -63,7 +63,7 @@ set(FLOW_SRCS XmlTraceLogFormatter.cpp actorcompiler.h error_definitions.h - ${CMAKE_CURRENT_BINARY_DIR}/hgVersion.h + ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h flat_buffers.h flat_buffers.cpp flow.cpp @@ -78,7 +78,7 @@ set(FLOW_SRCS stacktrace.h version.cpp) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/hgVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/hgVersion.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/SourceVersion.h.cmake ${CMAKE_CURRENT_BINARY_DIR}/SourceVersion.h) add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS}) target_include_directories(flow PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/flow/SourceVersion.h.cmake b/flow/SourceVersion.h.cmake new file mode 100644 index 0000000000..d4b4a390ab --- /dev/null +++ b/flow/SourceVersion.h.cmake @@ -0,0 +1,2 @@ +#pragma once +#define sourceVersion "${CURRENT_GIT_VERSION}" diff --git a/flow/flow.vcxproj b/flow/flow.vcxproj index fc0fa2a412..8c3336253c 
100644 --- a/flow/flow.vcxproj +++ b/flow/flow.vcxproj @@ -142,8 +142,8 @@ - echo const char *hgVersion = "Current version id not currently supported within Windows."; > hgVersion.temp.h && fc /b hgVersion.temp.h hgVersion.h > nul || copy hgVersion.temp.h hgVersion.h > nul - Checking HG source version + echo const char *SourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul + Checking source version diff --git a/flow/hgVersion.h.cmake b/flow/hgVersion.h.cmake deleted file mode 100644 index 7083caa285..0000000000 --- a/flow/hgVersion.h.cmake +++ /dev/null @@ -1,2 +0,0 @@ -#pragma once -#define hgVersion "${CURRENT_GIT_VERSION}" diff --git a/flow/local.mk b/flow/local.mk index 6ff17bb62e..6c6d0d69bb 100644 --- a/flow/local.mk +++ b/flow/local.mk @@ -28,12 +28,12 @@ ifeq ($(PLATFORM),osx) flow_LDFLAGS += -framework CoreFoundation -framework IOKit endif -GENERATED_SOURCES += flow/hgVersion.h versions.h +flow_GENERATED_SOURCES += flow/SourceVersion.h versions.h -flow/hgVersion.h: FORCE - @echo "Checking hgVersion.h" - @echo "const char *hgVersion = \"$(VERSION_ID)\";" > flow/hgVersion.h.new - @([ -e flow/hgVersion.h ] && diff -q flow/hgVersion.h flow/hgVersion.h.new >/dev/null && rm flow/hgVersion.h.new) || mv flow/hgVersion.h.new flow/hgVersion.h +flow/SourceVersion.h: FORCE + @echo "Checking SourceVersion.h" + @echo "const char *sourceVersion = \"$(VERSION_ID)\";" > flow/SourceVersion.h.new + @([ -e flow/SourceVersion.h ] && diff -q flow/SourceVersion.h flow/SourceVersion.h.new >/dev/null && rm flow/SourceVersion.h.new) || mv flow/SourceVersion.h.new flow/SourceVersion.h lib/libflow.a: bin/coverage.flow.xml diff --git a/flow/version.cpp b/flow/version.cpp index 61e1a6d2ef..2b2ffe8f68 100644 --- a/flow/version.cpp +++ b/flow/version.cpp @@ -18,8 +18,8 @@ * limitations under the License. 
*/ -#include "flow/hgVersion.h" +#include "flow/SourceVersion.h" -const char* getHGVersion() { - return hgVersion; +const char* getSourceVersion() { + return sourceVersion; } From b5a450b4c6735230b310913c3043174e8b84f1b1 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 15 Nov 2019 12:41:08 -0800 Subject: [PATCH 177/184] Fix capitalization error --- flow/flow.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flow/flow.vcxproj b/flow/flow.vcxproj index 8c3336253c..1adada93f4 100644 --- a/flow/flow.vcxproj +++ b/flow/flow.vcxproj @@ -142,7 +142,7 @@ - echo const char *SourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul + echo const char *sourceVersion = "Current version id not currently supported within Windows."; > SourceVersion.temp.h && fc /b SourceVersion.temp.h SourceVersion.h > nul || copy SourceVersion.temp.h SourceVersion.h > nul Checking source version From fdc7d8a676007c2a12679087c1be93d832daa153 Mon Sep 17 00:00:00 2001 From: "A.J. Beamon" Date: Fri, 15 Nov 2019 13:03:01 -0800 Subject: [PATCH 178/184] Add hgVersion*.h back to the gitignore file for now --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 001107847d..7b23facbe3 100644 --- a/.gitignore +++ b/.gitignore @@ -30,8 +30,10 @@ bindings/python/MANIFEST bindings/ruby/lib/fdboptions.rb bindings/ruby/fdb.gemspec fdbclient/vexillographer/obj/ +fdbrpc/hgVersion*.h fdbrpc/SourceVersion*.h fdbrpc/libeio/config.h +flow/hgVersion*.h flow/SourceVersion*.h generated.mk versions.h From 2c227a7049a97ede59d444fe240a509481f7f982 Mon Sep 17 00:00:00 2001 From: negoyal Date: Tue, 19 Nov 2019 17:41:48 -0800 Subject: [PATCH 179/184] Missing cacheTag pop changes in OldTLogServer 6_2 version. 
--- fdbserver/OldTLogServer_6_2.actor.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index 567502cfcb..b07eb904b3 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -962,6 +962,26 @@ ACTOR Future updateStorage( TLogData* self ) { state FlowLock::Releaser commitLockReleaser; + //FIXME: This policy for calculating the cache pop version could end up popping recent data in the remote DC after two consecutive recoveries. + // It also does not protect against spilling the cache tag directly, so it is theoretically possible to spill this tag; which is not intended to ever happen. + Optional cachePopVersion; + for(auto& it : self->id_data) { + if(!it.second->stopped) { + if(it.second->version.get() - it.second->unrecoveredBefore > SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT + SERVER_KNOBS->MAX_CACHE_VERSIONS) { + cachePopVersion = it.second->version.get() - SERVER_KNOBS->MAX_CACHE_VERSIONS; + } + break; + } + } + + if(cachePopVersion.present()) { + state std::vector> cachePopFutures; + for(auto& it : self->id_data) { + cachePopFutures.push_back(tLogPop(self, TLogPopRequest(cachePopVersion.get(),0,cacheTag), it.second)); + } + wait( waitForAll(cachePopFutures) ); + } + if(logData->stopped) { if (self->bytesInput - self->bytesDurable >= self->targetVolatileBytes) { while(logData->persistentDataDurableVersion != logData->version.get()) { From b6f35c573eebd68840b546abd9f4e9a4a2959743 Mon Sep 17 00:00:00 2001 From: negoyal Date: Wed, 20 Nov 2019 10:43:24 -0800 Subject: [PATCH 180/184] Forward declare tLogPop in 6_2. 
--- fdbserver/OldTLogServer_6_2.actor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fdbserver/OldTLogServer_6_2.actor.cpp b/fdbserver/OldTLogServer_6_2.actor.cpp index b07eb904b3..d42f0a4d52 100644 --- a/fdbserver/OldTLogServer_6_2.actor.cpp +++ b/fdbserver/OldTLogServer_6_2.actor.cpp @@ -943,6 +943,8 @@ ACTOR Future updatePersistentData( TLogData* self, Reference logD return Void(); } +ACTOR Future tLogPop( TLogData* self, TLogPopRequest req, Reference logData ); + // This function (and updatePersistentData, which is called by this function) run at a low priority and can soak up all CPU resources. // For this reason, they employ aggressive use of yields to avoid causing slow tasks that could introduce latencies for more important // work (e.g. commits). From 343bcd104a22fb5f71823feff2c4bbac9f8a8e2f Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 20 Nov 2019 21:04:18 -0800 Subject: [PATCH 181/184] FastRestore:Apply Clang format --- fdbserver/RestoreLoader.actor.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 9b97a8c843..1d98ba6121 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -36,8 +36,8 @@ typedef std::map, uint32_t> SerializedMutationPartMap; bool isRangeMutation(MutationRef m); void splitMutation(Reference self, MutationRef m, Arena& mvector_arena, VectorRef& mvector, Arena& nodeIDs_arena, VectorRef& nodeIDs); -void _parseSerializedMutation(std::map::iterator kvOpsIter, SerializedMutationListMap* mutationMap, - bool isSampling = false); +void _parseSerializedMutation(std::map::iterator kvOpsIter, + SerializedMutationListMap* mutationMap, bool isSampling = false); void handleRestoreSysInfoRequest(const RestoreSysInfoRequest& req, Reference self); ACTOR Future handleLoadFileRequest(RestoreLoadFileRequest req, Reference self, @@ -50,10 +50,9 @@ ACTOR static Future 
_parseLogFileToMutationsOnLoader( NotifiedVersion* pProcessedFileOffset, SerializedMutationListMap* mutationMap, SerializedMutationPartMap* mutationPartMap, Reference bc, Version version, std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange, Key addPrefix, Key removePrefix, Key mutationLogPrefix); -ACTOR static Future _parseRangeFileToMutationsOnLoader(std::map::iterator kvOpsIter, - Reference bc, Version version, - std::string fileName, int64_t readOffset_input, - int64_t readLen_input, KeyRange restoreRange); +ACTOR static Future _parseRangeFileToMutationsOnLoader( + std::map::iterator kvOpsIter, Reference bc, Version version, + std::string fileName, int64_t readOffset_input, int64_t readLen_input, KeyRange restoreRange); ACTOR Future restoreLoaderCore(RestoreLoaderInterface loaderInterf, int nodeIndex, Database cx) { state Reference self = @@ -131,7 +130,7 @@ ACTOR Future _processLoadingParam(LoadingParam param, ReferencekvOpsPerLP.find(param) == self->kvOpsPerLP.end()); // NOTE: map's iterator is guaranteed to be stable, but pointer may not. 
- //state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param]; + // state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param]; self->kvOpsPerLP.insert(std::make_pair(param, VersionedMutationsMap())); state std::map::iterator kvOpsPerLPIter = self->kvOpsPerLP.find(param); @@ -150,9 +149,8 @@ ACTOR Future _processLoadingParam(LoadingParam param, Reference(param.blockSize, param.length - j); if (param.isRangeFile) { - fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader(kvOpsPerLPIter, self->bc, - param.version, param.filename, readOffset, - readLen, param.restoreRange)); + fileParserFutures.push_back(_parseRangeFileToMutationsOnLoader( + kvOpsPerLPIter, self->bc, param.version, param.filename, readOffset, readLen, param.restoreRange)); } else { fileParserFutures.push_back(_parseLogFileToMutationsOnLoader( &processedFileOffset, &mutationMap, &mutationPartMap, self->bc, param.version, param.filename, @@ -438,7 +436,8 @@ bool isRangeMutation(MutationRef m) { // we may not get the entire mutation list for the version encoded_list_of_mutations: // [mutation1][mutation2]...[mutationk], where // a mutation is encoded as [type:uint32_t][keyLength:uint32_t][valueLength:uint32_t][keyContent][valueContent] -void _parseSerializedMutation(std::map::iterator kvOpsIter, SerializedMutationListMap* pmutationMap, bool isSampling) { +void _parseSerializedMutation(std::map::iterator kvOpsIter, + SerializedMutationListMap* pmutationMap, bool isSampling) { VersionedMutationsMap& kvOps = kvOpsIter->second; SerializedMutationListMap& mutationMap = *pmutationMap; @@ -481,10 +480,9 @@ void _parseSerializedMutation(std::map::ite } // Parsing the data blocks in a range file -ACTOR static Future _parseRangeFileToMutationsOnLoader(std::map::iterator kvOpsIter, - Reference bc, Version version, - std::string fileName, int64_t readOffset, int64_t readLen, - KeyRange restoreRange) { +ACTOR static Future _parseRangeFileToMutationsOnLoader( + std::map::iterator kvOpsIter, Reference 
bc, Version version, + std::string fileName, int64_t readOffset, int64_t readLen, KeyRange restoreRange) { state VersionedMutationsMap& kvOps = kvOpsIter->second; // The set of key value version is rangeFile.version. the key-value set in the same range file has the same version From 78f10f15b3a237c6bc209e44813fd4fa0df4b360 Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Thu, 21 Nov 2019 22:47:01 -0800 Subject: [PATCH 182/184] FastRestore:replace insert with emplace for map and vector This resolves the review suggestions. --- fdbserver/RestoreLoader.actor.cpp | 2 +- fdbserver/RestoreMaster.actor.cpp | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 1d98ba6121..bb28ae4536 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -131,7 +131,7 @@ ACTOR Future _processLoadingParam(LoadingParam param, ReferencekvOpsPerLP.find(param) == self->kvOpsPerLP.end()); // NOTE: map's iterator is guaranteed to be stable, but pointer may not. 
// state VersionedMutationsMap* kvOps = &self->kvOpsPerLP[param]; - self->kvOpsPerLP.insert(std::make_pair(param, VersionedMutationsMap())); + self->kvOpsPerLP.emplace(param, VersionedMutationsMap()); state std::map::iterator kvOpsPerLPIter = self->kvOpsPerLP.find(param); // Temporary data structure for parsing log files into (version, ) diff --git a/fdbserver/RestoreMaster.actor.cpp b/fdbserver/RestoreMaster.actor.cpp index e28948693f..f7dfc13b56 100644 --- a/fdbserver/RestoreMaster.actor.cpp +++ b/fdbserver/RestoreMaster.actor.cpp @@ -315,8 +315,7 @@ ACTOR static Future sendMutationsFromLoaders(Reference std::vector> requests; for (auto& loader : self->loadersInterf) { - requests.push_back( - std::make_pair(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier, useRangeFile))); + requests.emplace_back(loader.first, RestoreSendMutationsToAppliersRequest(self->rangeToApplier, useRangeFile)); } wait(sendBatchRequests(&RestoreLoaderInterface::sendMutations, self->loadersInterf, requests)); @@ -365,7 +364,7 @@ void dummySampleWorkload(Reference self) { if (i == 0) { self->rangeToApplier[normalKeys.begin] = applier.first; } else { - self->rangeToApplier[Key(keyrangeSplitter[i])] = applier.first; + self->rangeToApplier[keyrangeSplitter[i]] = applier.first; } i++; } From 9927a9013f31125ccf2d6b77eba9f46903f28c53 Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Nov 2019 11:47:25 -0800 Subject: [PATCH 183/184] Use sizeof() to replace constant numbers --- fdbserver/RestoreLoader.actor.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index 2f49caaeb5..fb40ef79e4 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -363,13 +363,14 @@ bool concatenateBackupMutationForLogFile(std::map, Standal std::string prefix = "||\t"; std::stringstream ss; StringRef val = val_input.contents(); + const int key_suffix_len 
= sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t); StringRefReaderMX reader(val, restore_corrupted_data()); StringRefReaderMX readerKey(key_input, restore_corrupted_data()); // read key_input! - int logRangeMutationFirstLength = key_input.size() - 1 - 8 - 4; + int logRangeMutationFirstLength = key_input.size() - key_suffix_len; bool concatenated = false; - ASSERT_WE_THINK(key_input.size() >= 1 + 8 + 4); + ASSERT_WE_THINK(key_input.size() >= key_suffix_len); if (logRangeMutationFirstLength > 0) { // Strip out the [logRangeMutation.first]; otherwise, the following readerKey.consume will produce wrong value @@ -377,10 +378,10 @@ bool concatenateBackupMutationForLogFile(std::map, Standal } readerKey.consume(); // uint8_t hashValue = readerKey.consume() - uint64_t commitVersion = readerKey.consumeNetworkUInt64(); + Version commitVersion = readerKey.consumeNetworkUInt64(); uint32_t part = readerKey.consumeNetworkUInt32(); // Use commitVersion as id - Standalone id = StringRef((uint8_t*)&commitVersion, 8); + Standalone id = StringRef((uint8_t*)&commitVersion, sizeof(Version)); if (mutationMap.find(id) == mutationMap.end()) { mutationMap.insert(std::make_pair(id, val_input)); @@ -442,10 +443,11 @@ void _parseSerializedMutation(VersionedMutationsMap* pkvOps, SerializedMutationL StringRefReaderMX vReader(val, restore_corrupted_data()); vReader.consume(); // Consume the includeVersion - uint32_t val_length_decoded = - vReader.consume(); // Parse little endian value, confirmed it is correct! - ASSERT(val_length_decoded == - val.size() - 12); // 12 is the length of [includeVersion:uint64_t][val_length:uint32_t] + // TODO(xumengpanda): verify the protocol version is compatible and raise error if needed + + // Parse little endian value, confirmed it is correct! 
+ uint32_t val_length_decoded = vReader.consume(); + ASSERT(val_length_decoded == val.size() - sizeof(uint64_t) - sizeof(uint32_t)); while (1) { // stop when reach the end of the string From 037e808253545dba1cdc9e94be6bf725b6843a0d Mon Sep 17 00:00:00 2001 From: Jingyu Zhou Date: Fri, 22 Nov 2019 13:12:04 -0800 Subject: [PATCH 184/184] Address review comments by changing variable names --- fdbserver/RestoreLoader.actor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fdbserver/RestoreLoader.actor.cpp b/fdbserver/RestoreLoader.actor.cpp index fb40ef79e4..75f2c65440 100644 --- a/fdbserver/RestoreLoader.actor.cpp +++ b/fdbserver/RestoreLoader.actor.cpp @@ -363,14 +363,14 @@ bool concatenateBackupMutationForLogFile(std::map, Standal std::string prefix = "||\t"; std::stringstream ss; StringRef val = val_input.contents(); - const int key_suffix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t); + const int key_prefix_len = sizeof(uint8_t) + sizeof(Version) + sizeof(uint32_t); StringRefReaderMX reader(val, restore_corrupted_data()); StringRefReaderMX readerKey(key_input, restore_corrupted_data()); // read key_input! - int logRangeMutationFirstLength = key_input.size() - key_suffix_len; + int logRangeMutationFirstLength = key_input.size() - key_prefix_len; bool concatenated = false; - ASSERT_WE_THINK(key_input.size() >= key_suffix_len); + ASSERT_WE_THINK(key_input.size() >= key_prefix_len); if (logRangeMutationFirstLength > 0) { // Strip out the [logRangeMutation.first]; otherwise, the following readerKey.consume will produce wrong value